# Email Auto Fill
- **Text Preprocessing**
    - *Contractions, Sentence Tokenization*
- **Basic EDA**
    - *Word Cloud*
- **Probabilistic Language Models**
    - *Unigram, Bigrams, Trigrams, N-grams*

In [1]:
import numpy as np
import pandas as pd
import nltk, re, string, contractions
from nltk.tokenize import sent_tokenize, word_tokenize
import email

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# df = pd.read_csv(r'F:\Muthu_2023\Personal\NextStep\NLP\NLP\Dataset\Email\email_truncated.csv')
df = pd.read_csv(r'E:\Nextstep\NLP\Dataset\Email\email_truncated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,file,message
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
def extractMessage(message):
    e = email.message_from_string(message)
    return e.get_payload().lower()

**Text Preprocessing:**
- **`Using email library, extract body from the complete message`**
- **`Remove all new line characters`**
- **`Remove all non alpha numeric characters`**
- **`Strip the and lower case the text`**
- **`Apply contractions`**

In [5]:
df['content'] = df['message'].apply(extractMessage)
df['content'] = df['content'].str.replace("\n", " ")
df['content'] = df['content'].apply(lambda x: re.sub("[^a-zA-Z0-9 \.]", "", x))
df['content'] = df['content'].str.strip().str.lower()
df['content'] = df['content'].apply(lambda x: contractions.fix(x))
df['content']

0                                     here is our forecast
1        traveling to have a business meeting takes the...
2                              test successful.  way to go
3        randy   can you send me a schedule of the sala...
4                        let us shoot for tuesday at 1145.
                               ...                        
19995    don  i have extended your trial to tradersnews...
19996    were pleased to announce two new price reporte...
19997    for the tradersnews indexes and more industry ...
19998    sounds great  keep up the good work power god....
19999    great talking with you.  see you the other guy...
Name: content, Length: 20000, dtype: object

# EDA

**`Histogram plot for number of words in a message`**

In [None]:
plt.hist(df['content'].apply(lambda x: len(x)), bins=1000)
plt.xlim(0,10000)
plt.show()

**`Generate Word Count Vector for the complete corpus`**

In [10]:
d = {}
for sent_tokens in df['sent_list']:
    for sent in sent_tokens:
        for word in sent.split():
            word = word.replace(".", "").strip()
            if word in d:
                d[word] += 1
            else:
                d[word] = 1

**`Create a column with sentences as list elements for each message in main dataframe`**

In [7]:
def sentence_tokenization(text):
    sentence_list = sent_tokenize(text)
    transformed_sent = []
    for sentence in sentence_list:
        transformed_sent.append(re.sub("[^a-zA-Z0-9 ]", "", sentence))
    return transformed_sent

In [8]:
df['sent_list'] = df['content'].apply(sentence_tokenization)

**`Sort Top N words by total count in the corpus `**

In [11]:
sorted(d.items(), key=lambda x: x[1], reverse=True)[:20]

[('the', 194347),
 ('to', 140013),
 ('and', 87514),
 ('a', 79911),
 ('of', 72795),
 ('in', 61234),
 ('you', 54237),
 ('for', 52873),
 ('is', 50794),
 ('on', 47504),
 ('i', 39791),
 ('this', 34549),
 ('that', 34048),
 ('not', 29304),
 ('be', 29176),
 ('will', 28690),
 ('from', 28524),
 ('with', 27117),
 ('at', 26695),
 ('have', 26266)]

***All Top20 words are Stopwords***

# Word Cloud

**`Build Word cloud from the email body texts`**

# Bigram Model

In [12]:
from nltk.util import bigrams, trigrams

**`Generate bigram dictionary with frequency of occurence in {(currentword, nextword): freq}`**

In [13]:
bi_dict = {}
for message in df['sent_list']:
    for sentence in message:
        for words in bigrams(sentence.split()):
            if words in bi_dict:
                bi_dict[words] += 1
            else:
                bi_dict[words] = 1            

**`Sort the dictinary based on key and values`**

In [23]:
sorted(sorted(bi_dict.items(), key=lambda x: (x[0], x[1])), key = lambda x: x[1], reverse=True)

[(('of', 'the'), 17627),
 (('in', 'the'), 15582),
 (('on', 'the'), 9656),
 (('to', 'the'), 9623),
 (('for', 'the'), 9379),
 (('if', 'you'), 8921),
 (('will', 'be'), 8185),
 (('cc', 'subject'), 7977),
 (('do', 'not'), 6674),
 (('i', 'am'), 6125),
 (('to', 'be'), 5959),
 (('pm', 'to'), 5597),
 (('am', 'to'), 5504),
 (('original', 'message'), 5324),
 (('message', 'from'), 5258),
 (('you', 'have'), 5007),
 (('forwarded', 'by'), 4605),
 (('you', 'are'), 4594),
 (('subject', 're'), 4517),
 (('with', 'the'), 4439),
 (('is', 'a'), 4291),
 (('and', 'the'), 4189),
 (('at', 'the'), 3984),
 (('09', '09'), 3899),
 (('i', 'have'), 3870),
 (('this', 'week'), 3857),
 (('is', 'the'), 3766),
 (('from', 'the'), 3494),
 (('with', 'a'), 3342),
 (('it', 'is'), 3303),
 (('has', 'been'), 3219),
 (('you', 'can'), 3195),
 (('for', 'a'), 3008),
 (('let', 'me'), 2976),
 (('we', 'have'), 2927),
 (('in', 'a'), 2902),
 (('to', 'get'), 2838),
 (('is', 'not'), 2815),
 (('have', 'a'), 2785),
 (('want', 'to'), 2777),
 (

In [24]:
temp = sorted(bi_dict.items(), key=lambda x: (x[0], x[1]))

In [27]:
len(temp)

753728

In [28]:
temp[0]

(('0', '0'), 326)

In [32]:
temp[:][0][0]

('0', '0')