### Importing libraries

In [121]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from subprocess import check_output

### Email Data Loading

In [122]:
# I use default email library just for simplicity. For real product I would use more complicated parsing tools or write my own
# We extract email artificials and content from raw text

def email_from_string(raw_email):
    msg = email.message_from_string(raw_email)
    
    content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(content)
    
    return result

In [130]:
# Import the email modules we'll need
import glob
from email import policy
from email.parser import BytesParser

path = '../data/raw/Email_Classification/*'
email_types = glob.glob(path)
appendFilesData = []
for folder in email_types:
    files = glob.glob(folder+"/*.txt")
    email_type = folder.split('\\')[1]
    for name in files:
        try:
            with open(name) as fp:
                msg = email_from_string(fp.read())
                #appendFilesData.append(fp.read())
                #msg = email.message_from_string(fp)
                #msg = BytesParser(policy=policy.default).parse(fp)
                #richest = ''.join(msg.get_body(preferencelist=('plain', 'html')).get_content().splitlines(keepends=True)[:])
                appendFilesData.append({
#                     "to":msg['To'],
#                     "from":msg['From'],
#                     "subject":msg['Subject'],
#                     "sent":msg["Sent"],
#                     "importance":msg["Importance"],
                    "content":msg["content"],
                    "class":email_type,
                })
         
        except IOError as exc:
            print('Exception')

In [119]:
appendFilesData[0]

'From: Meemendra Kumar [mailto:meemendra.kumar@andbeyondindia.com] \nSent: Monday, November 19, 2018 3:09 PM\nTo: Gateway Varanasi\nSubject: Mr. Nagdev X 04\nImportance: High\n \nDear Rohit,\n \nAs per our conversation a while ago, please book and confirm 02 Executive room at Taj Gateway Ganges from 24/26 December 2018 for two nights.\n \nKind Regards,\nMeemendra\n \nMeemendra Kumar | Tour Consultant | &BEYOND South Asia| Meemendra.kumar@andbeyondindia.com | T +91 11 46269000 | D +91 11 46269022 | M +91 9910092664 New Delhi | India | www.andBeyond.com\n \n\n '

### Email Data preprocessing

In [102]:
data = pd.DataFrame(appendFilesData)
data['length'] = data['text'].apply(len)
data.head()

Unnamed: 0,class,raw_text,subject,text,to,length
0,Booking,<_io.BufferedReader name='../data/raw/Email_Cl...,Mr. Nagdev X 04,"Dear Rohit,\n \nAs per our conversation a whil...","""Gateway Varanasi""",376
1,Booking,<_io.BufferedReader name='../data/raw/Email_Cl...,RE: Rate Proposal - Taj Connemara,"Hi Agnes,\n \nI have not received a response t...",Reservations <Reservations@tajhotels.com>,308
2,Booking,<_io.BufferedReader name='../data/raw/Email_Cl...,TEHP/18/12809: Booking request of Arun Gadamsh...,"As per corporate holiday plan, kindly arrange ...",Reservations Coorg <reservations.coorg@tajhote...,3460
3,Booking,<_io.BufferedReader name='../data/raw/Email_Cl...,Valley Period/3 double room/Lerner group,"Dear Gampa ji,\n \nGreetings of the day !\n \n...","""Neeraj Gampa""",457
4,Cancellation,<_io.BufferedReader name='../data/raw/Email_Cl...,Re: Taj Chandigarh Payment link for reservatio...,Hi\n\nPlease cancel my booking .\nThanks\nRAji...,Reservations <Reservations@tajhotels.com>,45


In [103]:
def pre_process(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [104]:
textFeatures = data['text'].copy()
textFeatures = textFeatures.apply(pre_process)
vectorizer = TfidfVectorizer("english")
features = vectorizer.fit_transform(textFeatures)

features_train, features_test, labels_train, labels_test = train_test_split(features, data['class'], test_size=0.3, random_state=111)

In [105]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)
prediction = svc.predict(features_test)
accuracy_score(labels_test,prediction)

0.7222222222222222

In [106]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
prediction = mnb.predict(features_test)
accuracy_score(labels_test,prediction)

0.7777777777777778

### transforming the data 

In [107]:
# I use default email library just for simplicity. For real product I would use more complicated parsing tools or write my own
# We extract email artificials and content from raw text

def email_from_string(raw_email):
    msg = email.message_from_string(raw_email)
    
    content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(content)
    
    return result

In [120]:
email_from_string(appendFilesData[0])

{'From': 'Meemendra Kumar [mailto:meemendra.kumar@andbeyondindia.com] ',
 'Sent': 'Monday, November 19, 2018 3:09 PM',
 'To': 'Gateway Varanasi',
 'Subject': 'Mr. Nagdev X 04',
 'Importance': 'High\n ',
 'content': 'Dear Rohit,\n \nAs per our conversation a while ago, please book and confirm 02 Executive room at Taj Gateway Ganges from 24/26 December 2018 for two nights.\n \nKind Regards,\nMeemendra\n \nMeemendra Kumar | Tour Consultant | &BEYOND South Asia| Meemendra.kumar@andbeyondindia.com | T +91 11 46269000 | D +91 11 46269022 | M +91 9910092664 New Delhi | India | www.andBeyond.com\n \n\n '}

In [96]:
import email