# Email Auto Fill
- **Text Preprocessing**
    - *Contractions, Sentence Tokenization*
- **Basic EDA**
    - *Word Cloud*
- **Probabilistic Language Models**
    - *Unigram, Bigrams, Trigrams, N-grams*

In [1]:
import numpy as np
import pandas as pd
import nltk, re, string, contractions
from nltk.tokenize import sent_tokenize, word_tokenize
import email
from nltk.util import bigrams, trigrams

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv(r'F:\Muthu_2023\Personal\NextStep\NLP\NLP\Dataset\Email\email_truncated.csv')
# df = pd.read_csv(r'E:\Nextstep\NLP\Dataset\Email\email_truncated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,file,message
0,0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


**Text Preprocessing:**
- **`Using email library, extract body from the complete message`**
- **`Remove all new line characters`**
- **`Remove all non alpha numeric characters`**
- **`Strip the and lower case the text`**
- **`Apply contractions`**

In [3]:
def extractMessage(message):
    e = email.message_from_string(message)
    return e.get_payload().lower()

In [4]:
df['content'] = df['message'].apply(extractMessage)
df['content'] = df['content'].str.replace("\n", " ")
df['content'] = df['content'].apply(lambda x: re.sub("[^a-zA-Z0-9 \.]", "", x))
df['content'] = df['content'].str.strip().str.lower()
df['content'] = df['content'].apply(lambda x: contractions.fix(x))
df['content']

0                                     here is our forecast
1        traveling to have a business meeting takes the...
2                              test successful.  way to go
3        randy   can you send me a schedule of the sala...
4                        let us shoot for tuesday at 1145.
                               ...                        
19995    don  i have extended your trial to tradersnews...
19996    were pleased to announce two new price reporte...
19997    for the tradersnews indexes and more industry ...
19998    sounds great  keep up the good work power god....
19999    great talking with you.  see you the other guy...
Name: content, Length: 20000, dtype: object

# EDA

**`Histogram plot for number of words in a message`**

In [None]:
plt.hist(df['content'].apply(lambda x: len(x)), bins=1000)
plt.xlim(0,10000)
plt.show()

**`Create a column with sentences as list elements for each message in main dataframe`**

In [48]:
def sentence_tokenization(text):
    sentence_list = sent_tokenize(text)
    transformed_sent = []
    for sentence in sentence_list:
        sentence = (re.sub("[^a-zA-Z0-9 ]", "", sentence))
        words = []
        for word in sentence.split():
            if len(word) < 20 and word.strip().isalpha():
                words.append(word.strip())
        if len(words) > 0:
            transformed_sent.append(" ".join(words))
    return transformed_sent

In [49]:
df['sent_list'] = df['content'].apply(sentence_tokenization)

**`Generate Word Count Vector for the complete corpus`**

In [8]:
d = {}
for sent_tokens in df['sent_list']:
    for sent in sent_tokens:
        for word in sent.split():
            word = word.replace(".", "").strip()
            if word in d:
                d[word] += 1
            else:
                d[word] = 1

**`Sort Top N words by total count in the corpus `**

In [11]:
sorted(d.items(), key=lambda x: x[1], reverse=True)[:20]

[('the', 194347),
 ('to', 140013),
 ('and', 87514),
 ('a', 79911),
 ('of', 72795),
 ('in', 61234),
 ('you', 54237),
 ('for', 52873),
 ('is', 50794),
 ('on', 47504),
 ('i', 39791),
 ('this', 34549),
 ('that', 34048),
 ('not', 29304),
 ('be', 29176),
 ('will', 28690),
 ('from', 28524),
 ('with', 27117),
 ('at', 26695),
 ('have', 26266)]

***All Top20 words are Stopwords***

# Word Cloud

**`Build Word cloud from the email body texts`**

# Bigram Model

**`Generate bigram dictionary with frequency of occurence in {(currentword, nextword): freq}`**

In [138]:
bi_dict = {}
for message in df['sent_list']:
    for sentence in message:
        for words in bigrams(sentence.split()):
            if words in bi_dict:
                bi_dict[words] += 1
            else:
                bi_dict[words] = 1            

In [139]:
bi_dict_prob = {}
for w1, w2 in bi_dict:
    bi_dict_prob[(w1, w2)] = bi_dict[(w1, w2)] / d[w1]

**`Sort the dictinary based on key and values`**

In [112]:
bi_dict_sorted = dict(sorted(bi_dict_prob.items(), key=lambda x: (x[0][0], x[1]), reverse=True))

In [113]:
len(bi_dict_sorted)

626705

**`Create data frame from the dictinary for easier processing`**

In [114]:
bi_df = pd.DataFrame(data = bi_dict_sorted.values(), columns=['Count'], index=bi_dict_sorted.keys())
bi_df.reset_index(inplace=True)
bi_df.head()

Unnamed: 0,level_0,level_1,Count
0,zypfje,baughmandon,1.0
1,zy,for,1.0
2,zwiebel,calls,0.5
3,zwiebel,and,0.5
4,zwerneman,jazztotalzonecom,1.0


**`Extract top N Next words in list for each Current word`**

In [115]:
N = 3
filtered_bi = bi_df.drop('Count', axis=1).groupby('level_0').head(N)
filtered_bi = filtered_bi.groupby('level_0')['level_1'].apply(list).reset_index()

**`Transform dataframe to dictionary with key as current word and values as N next words`**

In [116]:
filtered_bi_dict = dict()
for i in range(len(filtered_bi)):
    filtered_bi_dict[filtered_bi['level_0'].iloc[i]] = filtered_bi['level_1'].iloc[i]

**`Derive the next N words for the current word from the dictionary`**

In [104]:
def get_nextwords(Queryword, filtered_bi_dict):
    if Queryword.lower() in filtered_bi_dict:
        return filtered_bi_dict[Queryword.lower()]
    else:
        return "Word not exist in dictionary"

In [117]:
get_nextwords('I', filtered_bi_dict)

['am', 'have', 'will']

In [124]:
get_nextwords('how', filtered_bi_dict)

['to', 'about', 'much']

In [119]:
get_nextwords('to', filtered_bi_dict)

['the', 'be', 'get']

In [120]:
get_nextwords('the', filtered_bi_dict)

['following', 'last', 'new']

In [125]:
get_nextwords('they', filtered_bi_dict)

['are', 'have', 'rank']

In [127]:
get_nextwords('can', filtered_bi_dict)

['you', 'be', 'do']

**`Generate the next M sequence words for the current word`**

In [100]:
M = 10
CurrWord = 'Enron'
word_list = [CurrWord]
for x in range(M):    
    CurrWord = get_nextwords(CurrWord, filtered_bi_dict)[0]
    word_list.append(CurrWord)
print(" ".join(word_list))

Enron north america corp from the following the following the following


# Trigram Model

**`Generate conditional probability of trigram`**

In [145]:
tri_dict = {}
tri_dict_prob = {}
for message in df['sent_list']:
    for sentence in message:
        for words in trigrams(sentence.split()):
            if words in tri_dict:
                tri_dict[words] += 1
            else:
                tri_dict[words] = 1

for words in tri_dict:
    currwords = words[:-1]
    tri_dict_prob[words] = tri_dict[words] / bi_dict[currwords]

In [151]:
tri_dict_sort = sorted(tri_dict_prob.items(), key = (lambda x: (x[0][:2], x[1])), reverse=True)

In [None]:
tri_dict_df = pd.DataFrame(data = tri_dict_sort.values(), index = tri_dict_sort.keys())
tri_dict_df.head()
tri_dict_df