### Importing libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from subprocess import check_output

### Email Data Loading

In [2]:
# I use default email library just for simplicity. For real product I would use more complicated parsing tools or write my own
# We extract email artificials and content from raw text

def email_from_string(raw_email):
    msg = email.message_from_string(raw_email)
    
    content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(content)
    
    return result

In [6]:
# Import the email modules we'll need
import glob
import email
from email import policy
from email.parser import BytesParser

path = '../datawe/raw/Email_Classification/*'
email_types = glob.glob(path)
appendFilesData = []
for folder in email_types:
    files = glob.glob(folder+"/*.txt")
    email_type = folder.split('\\')[1]
    for name in files:
        try:
            with open(name) as fp:
                msg = email_from_string(fp.read())
                #appendFilesData.append(fp.read())
                #msg = email.message_from_string(fp)
                #msg = BytesParser(policy=policy.default).parse(fp)
                #richest = ''.join(msg.get_body(preferencelist=('plain', 'html')).get_content().splitlines(keepends=True)[:])
                appendFilesData.append({
#                     "to":msg['To'],
#                     "from":msg['From'],
#                     "subject":msg['Subject'],
#                     "sent":msg["Sent"],
#                     "importance":msg["Importance"],
                    "content":msg["content"],
                    "class":email_type,
                })
         
        except IOError as exc:
            print('Exception')

In [7]:
appendFilesData[0]

{'content': 'Dear Rohit,\n \nAs per our conversation a while ago, please book and confirm 02 Executive room at Taj Gateway Ganges from 24/26 December 2018 for two nights.\n \nKind Regards,\nMeemendra\n \nMeemendra Kumar | Tour Consultant | &BEYOND South Asia| Meemendra.kumar@andbeyondindia.com | T +91 11 46269000 | D +91 11 46269022 | M +91 9910092664 New Delhi | India | www.andBeyond.com\n \n\n ',
 'class': 'Booking'}

### Email Data preprocessing

In [9]:
data = pd.DataFrame(appendFilesData)
#data['length'] = data['content'].apply(len)
data.head()

Unnamed: 0,class,content
0,Booking,"Dear Rohit,\n \nAs per our conversation a whil..."
1,Booking,"Hi Agnes,\n \nI have not received a response t..."
2,Booking,"As per corporate holiday plan, kindly arrange ..."
3,Booking,"Dear Gampa ji,\n \nGreetings of the day !\n \n..."
4,Cancellation,Hi\n\nPlease cancel my booking .\nThanks\nRAji...


In [31]:
def pre_process(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    text = [word for word in text if word.isalpha()]
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [32]:
textFeatures = data['content'].copy()
textFeatures = textFeatures.apply(pre_process)
vectorizer = TfidfVectorizer("english")
features = vectorizer.fit_transform(textFeatures)

features_train, features_test, labels_train, labels_test = train_test_split(features, data['class'], test_size=0.3, random_state=111)

In [35]:
textFeatures

0     dear rohit per convers ago pleas book confirm ...
1     hi agn receiv respons email yet pleas could le...
2     per corpor holiday plan kind arrang book follo...
3     dear gampa ji greet day lerner group x pax ple...
4                     hi pleas cancel book thank rajiv 
5     dear mr athaly per request cancel reserv favor...
6     dear team pleas cancel attach book without cha...
7     dear team inner circl plan stay taj mahal mumb...
8     hi hotel recommend local abb offic bangalor wa...
9     dear reserv center good day would like ask air...
10    hi look accommod famili jan jan adult kid coul...
11    kind check member tariff taj palac vivanta taj...
12    sent monday novemb pm reserv reservationstajho...
13    hi process book holiday famili detail pleas co...
14    manag reserv desk hotel taj yeshwantpur bangal...
15    haiku hospit haikuvsnlcom sent monday novemb p...
16    dear team famili friend plan visit chennai loo...
17    dear sir need follow inform regard reserv 

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

svc = SVC(kernel='sigmoid', gamma=1.0)
svc.fit(features_train, labels_train)
prediction = svc.predict(features_test)
accuracy_score(labels_test,prediction)

0.7222222222222222

In [34]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=0.2)
mnb.fit(features_train, labels_train)
prediction = mnb.predict(features_test)
accuracy_score(labels_test,prediction)

0.7777777777777778

### transforming the data 

In [107]:
# I use default email library just for simplicity. For real product I would use more complicated parsing tools or write my own
# We extract email artificials and content from raw text

def email_from_string(raw_email):
    msg = email.message_from_string(raw_email)
    
    content = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            content.append(part.get_payload())
            
    result = {}
    for key in msg.keys(): 
        result[key] = msg[key]
    result["content"] = ''.join(content)
    
    return result

In [120]:
email_from_string(appendFilesData[0])

{'From': 'Meemendra Kumar [mailto:meemendra.kumar@andbeyondindia.com] ',
 'Sent': 'Monday, November 19, 2018 3:09 PM',
 'To': 'Gateway Varanasi',
 'Subject': 'Mr. Nagdev X 04',
 'Importance': 'High\n ',
 'content': 'Dear Rohit,\n \nAs per our conversation a while ago, please book and confirm 02 Executive room at Taj Gateway Ganges from 24/26 December 2018 for two nights.\n \nKind Regards,\nMeemendra\n \nMeemendra Kumar | Tour Consultant | &BEYOND South Asia| Meemendra.kumar@andbeyondindia.com | T +91 11 46269000 | D +91 11 46269022 | M +91 9910092664 New Delhi | India | www.andBeyond.com\n \n\n '}

In [96]:
import email

In [12]:
features

<57x1407 sparse matrix of type '<class 'numpy.float64'>'
	with 3285 stored elements in Compressed Sparse Row format>

In [13]:
help(features)

Help on csr_matrix in module scipy.sparse.csr object:

class csr_matrix(scipy.sparse.compressed._cs_matrix, scipy.sparse.sputils.IndexMixin)
 |  Compressed Sparse Row matrix
 |  
 |  This can be instantiated in several ways:
 |      csr_matrix(D)
 |          with a dense matrix or rank-2 ndarray D
 |  
 |      csr_matrix(S)
 |          with another sparse matrix S (equivalent to S.tocsr())
 |  
 |      csr_matrix((M, N), [dtype])
 |          to construct an empty matrix with shape (M, N)
 |          dtype is optional, defaulting to dtype='d'.
 |  
 |      csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
 |          where ``data``, ``row_ind`` and ``col_ind`` satisfy the
 |          relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
 |  
 |      csr_matrix((data, indices, indptr), [shape=(M, N)])
 |          is the standard CSR representation where the column indices for
 |          row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
 |          corresponding value

In [20]:
features.shape

(57, 1407)

In [23]:
features.toarray( order=None, out=None).shape

(57, 1407)

In [26]:
textFeatures[1]

'hi agn receiv respons email yet pleas could let know room confirm thank vinay vinay kurien counsel jone day® one firm worldwid 138 market street level 28 capitagreen singapor 048946 offic 6562335969 fax 6565363939 '

In [1]:
vectorizers

NameError: name 'vectorizers' is not defined

In [None]:

df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
print(df1)