## Data Modification and Data Preparation

In [1]:
import os
import email # to process email more effectively and easily
from prettytable import PrettyTable
from tqdm import tqdm
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re #regex
from tqdm import tqdm
import tensorflow as tf

In [2]:
# Load the sentence table that the Data Preprocessing notebook saved as a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files you produced yourself.
Data = pd.read_pickle('ProcessedEmailSentences.pkl')
Data.head()

Unnamed: 0,Sentence
0,we have had three meetings which brought out ...
1,it will help determine which requests fall un...
2,use the above to formulate a project plan
3,customized rows and columns in the position m...
4,position and pandl aggregation across all gas...


In [3]:
# (rows, columns) of the loaded sentence table
Data.shape

(312065, 1)

### Splitting The Dataset in X and Y
Points to take into consideration,
- Minimum count of words in a sentence and maximum count of words across all sentences is 15.
- Therefore, the minimum number of words we will set for input is 4. An input with fewer than 4 words will not make much sense and will not be very useful for the user.
- If we accept 3 or fewer words as input, the model might not take the semantic meaning of the words into consideration and might recommend incorrect output.

I'll use the steps below to break sentences into input and output sentences:
1. Create a new column for the first 3 words and a new column for the words after the first 3 words of each sentence
        Example Sentence : "Use the above to formulate a project plan" will be split as:
        input sentence : Use the above
        output sentence : to formulate a project plan
        
Note: We could create a method that returns the input and output strings directly. We perform the above step instead because string operations are very fast in pandas, which will save us a lot of time.

2. After splitting, there is still more than one word present in the output sentences,
    so we'll create a new DataFrame which will contain the input string concatenated with the output string


In [51]:
from tqdm import tqdm

In [7]:
def CreateInOut(listSent, min_input_words=1):
    '''
    Expand every sentence into (input prefix, remaining output) training pairs.

    Each sentence is split on whitespace. For a sentence of n words, one row is
    produced per split point k, where the first k words (joined by single
    spaces) form the input and the remaining words form the output, for
    k = min_input_words .. n-1. The first split (k = min_input_words) is always
    emitted when the sentence has at least min_input_words words, so a sentence
    with exactly that many words yields a single row whose output is the empty
    string. With the default of 1 this is exactly how one-word sentences were
    handled before the parameter existed.

    Sentences with fewer than min_input_words words are skipped. This includes
    empty and whitespace-only sentences, which previously raised IndexError.

    NOTE(review): the notebook's markdown describes starting from a 3-word
    input, while the default here keeps the original 1-word start; pass
    min_input_words=3 to get the documented behaviour. Confirm which is intended.

    Parameters
    ----------
    listSent : iterable of str
        Sentences to expand.
    min_input_words : int, default 1
        Number of words in the shortest input prefix that is generated.

    Returns
    -------
    pandas.DataFrame
        Columns 'input Sentence' and 'output Sentence', one row per split.

    Raises
    ------
    ValueError
        If min_input_words is smaller than 1.
    '''
    if min_input_words < 1:
        raise ValueError('min_input_words must be at least 1')

    input_Sentence = []
    output_Sentence = []

    for sent in tqdm(listSent):
        wordList = sent.split()
        wordCount = len(wordList)
        if wordCount < min_input_words:
            # Nothing to split (also covers empty / whitespace-only sentences)
            continue

        # Later splits must leave at least one output word; the first split is
        # emitted even when that leaves the output empty.
        lastSplit = max(min_input_words, wordCount - 1)
        for splitAt in range(min_input_words, lastSplit + 1):
            input_Sentence.append(' '.join(wordList[:splitAt]))
            output_Sentence.append(' '.join(wordList[splitAt:]))

    # Build the frame once from the collected columns
    return pd.DataFrame({'input Sentence': input_Sentence,
                         'output Sentence': output_Sentence})

In [8]:
# Expand every processed sentence into (input, output) pairs and preview the first rows
NewDf = CreateInOut(list(Data['Sentence'].values))
NewDf.head()

100%|██████████████████████████████████████████████████████████████████████| 312065/312065 [00:01<00:00, 211357.86it/s]


Unnamed: 0,input Sentence,output Sentence
0,we,have had three meetings which brought out very...
1,we have,had three meetings which brought out very diff...
2,we have had,three meetings which brought out very differen...
3,we have had three,meetings which brought out very different issu...
4,we have had three meetings,which brought out very different issues from d...


In [9]:
# Persist the expanded word-level pairs as a pickle file in the working directory
NewDf.to_pickle('DattForAttention.pkl')

In [6]:
# NOTE(review): this repeats the CreateInOut call of the earlier cell and carries a lower
# execution count (In [6]) than the cells above it. The preview recorded below
# ("we have had have ...") does not match what the CreateInOut defined in this notebook
# produces - TODO confirm which version is intended and remove the stale cell.
NewDf = CreateInOut(list(Data['Sentence'].values))
NewDf.head()

100%|██████████████████████████████████████████████████████████████████████| 312065/312065 [00:01<00:00, 214140.26it/s]


Unnamed: 0,input Sentence,output Sentence
0,we have had,have had three meetings which brought out very...
1,we have had have,had three meetings which brought out very diff...
2,we have had have had,three meetings which brought out very differen...
3,we have had have had three,meetings which brought out very different issu...
4,we have had have had three meetings,which brought out very different issues from d...


In [54]:
# Delete the original sentence table to free memory; the cells below work on NewDf.
# Re-running an earlier cell that reads `Data` requires loading it again first.
del Data

In [55]:
# (rows, columns) of the expanded pairs before filtering
NewDf.shape

(1894885, 2)

In [56]:
# Keep only the rows whose output text is not exactly one character long
NewDf = NewDf.loc[lambda frame: frame['output Sentence'].str.len() != 1]
NewDf.head()

Unnamed: 0,input Sentence,output Sentence
0,we have had,three meetings which brought out very differen...
1,we have had three,meetings which brought out very different issu...
2,we have had three meetings,which brought out very different issues from d...
3,we have had three meetings which,brought out very different issues from differe...
4,we have had three meetings which brought,out very different issues from different traders


In [57]:
# (rows, columns) of the pairs that remain after the single-character filter
NewDf.shape

(1887787, 2)

### Train Test Split
- The whole dataset, after splitting the sentences into words, will not fit into memory
- Therefore, we split the dataset now into training, testing and validation sets

In [58]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the pairs for testing, then take 14% of the remaining rows
# as the cross-validation set; everything left over is used for training.
inputArr = NewDf['input Sentence'].values
outputArr = NewDf['output Sentence'].values
XTrain, XTest, YTrain, YTest = train_test_split(
    inputArr, outputArr, test_size=0.15, random_state=42)
XTrain, XCv, YTrain, YCv = train_test_split(
    XTrain, YTrain, test_size=0.14, random_state=42)

# Report the size of every split: inputs first, then outputs
splitReport = [
    ('Shape of input training data: ', XTrain),
    ('Shape of input cross validation data: ', XCv),
    ('Shape of input testing data: ', XTest),
    ('Shape of output training data: ', YTrain),
    ('Shape of output cross validation data: ', YCv),
    ('Shape of output testing data: ', YTest),
]
for label, splitArr in splitReport:
    print(label, splitArr.shape)

Shape of input training data:  (1379971,)
Shape of input cross validation data:  (224647,)
Shape of input testing data:  (283169,)
Shape of output training data:  (1379971,)
Shape of output cross validation data:  (224647,)
Shape of output testing data:  (283169,)


### Word Splitting
- Work is not done yet...! With the above data, we can surely recommend new words to the user after more than 3 words are given as input. The user has to give 3 full words as input.
- But we have to recommend new sentences even if the user inputs more than 3 words followed by any new letter or partial word.
- We can achieve this by splitting words

    **Example** Sentence: we have had three meetings
    
    input: we have had th
    
    output: ree meetings

In [59]:
def SplitWords(inpList,opList):
    '''
    Build character-level (X, Y) rows from word-level input/output pairs.

    Every pair first contributes one row made of the input followed by a single
    space and the untouched output. The output is then walked one character at
    a time, stopping two characters before its end. A non-space character is
    moved onto the end of the input and a new row is recorded with what is left
    of the output. A space is only added to the input, without recording a row.

    inpList is indexed by the position of each entry of opList.
    Returns a DataFrame with the columns 'X' and 'Y'.
    '''
    xRows = []
    yRows = []

    for position, target in enumerate(tqdm(opList)):
        typed = inpList[position] + ' '
        xRows.append(typed)
        yRows.append(target)

        # target[:-2] leaves out the final two characters of the output
        for offset, char in enumerate(target[:-2]):
            typed += char
            if char != ' ':
                xRows.append(typed)
                yRows.append(target[offset + 1:])

    frame = pd.DataFrame()
    frame['X'] = xRows
    frame['Y'] = yRows
    return frame

In [60]:
# Expand each split to character level and pickle it. The training data is
# processed in 3 chunks because the expanded frames do not fit in RAM together.
#
# Paths are built with os.path.join: the previous literals such as
# 'PreProcessed\DTrain1.pkl' relied on an invalid escape sequence and only
# addressed a sub-folder on Windows. The folder is created if it is missing.
OUT_DIR = 'PreProcessed'
os.makedirs(OUT_DIR, exist_ok=True)

n = math.floor(len(XTrain)/3)

# (file name, inputs, outputs) for every chunk; array slices are taken lazily as views.
# NOTE(review): the test split keeps its existing file name 'DTest3.pkl' so that
# anything already reading that file keeps working - confirm whether 'DTest.pkl' was meant.
chunks = [
    ('DTrain1.pkl', XTrain[:n], YTrain[:n]),
    ('DTrain2.pkl', XTrain[n:n*2], YTrain[n:n*2]),
    ('DTrain3.pkl', XTrain[n*2:], YTrain[n*2:]),
    ('DCv.pkl', XCv, YCv),
    ('DTest3.pkl', XTest, YTest),
]

for fileName, xPart, yPart in chunks:
    # dfTrain is rebound on every pass so the previous chunk can be freed;
    # after the loop it holds the character-level test frame.
    dfTrain = SplitWords(list(xPart), list(yPart))
    dfTrain.to_pickle(os.path.join(OUT_DIR, fileName))
    # The message names the file that was actually written
    print('-'*5, fileName + ' Created', '-'*5)

100%|██████████████████████████████████████████████████████████████████████| 459990/459990 [00:03<00:00, 142195.46it/s]


----- DTrain1.pkl Created -----


100%|██████████████████████████████████████████████████████████████████████| 459990/459990 [00:03<00:00, 133147.95it/s]


----- DTrain2.pkl Created -----


100%|██████████████████████████████████████████████████████████████████████| 459991/459991 [00:03<00:00, 139741.00it/s]


----- DTrain3.pkl Created -----


100%|██████████████████████████████████████████████████████████████████████| 224647/224647 [00:01<00:00, 150127.11it/s]


----- DCv.pkl Created -----


100%|██████████████████████████████████████████████████████████████████████| 283169/283169 [00:01<00:00, 150183.46it/s]


----- DTest.pkl Created -----


In [61]:
# dfTrain was last assigned from the test split (XTest / YTest) in the cell above,
# so this preview shows character-level test rows despite the variable's name.
dfTrain.head(10)

Unnamed: 0,X,Y
0,not unless you,count having two refrigerators to be a constit...
1,not unless you c,ount having two refrigerators to be a constitu...
2,not unless you co,unt having two refrigerators to be a constitut...
3,not unless you cou,nt having two refrigerators to be a constituti...
4,not unless you coun,t having two refrigerators to be a constitutio...
5,not unless you count,having two refrigerators to be a constitution...
6,not unless you count h,aving two refrigerators to be a constitutional...
7,not unless you count ha,ving two refrigerators to be a constitutionall...
8,not unless you count hav,ing two refrigerators to be a constitutionally...
9,not unless you count havi,ng two refrigerators to be a constitutionally ...


In [62]:
# Save the filtered word-level pairs inside the PreProcessed folder.
# The path is assembled with os.path.join because a backslash inside a normal
# string literal is an invalid escape sequence and is not a path separator
# outside Windows; the folder is created first if it does not exist yet.
os.makedirs('PreProcessed', exist_ok=True)
NewDf.to_pickle(os.path.join('PreProcessed', 'WordLevelData.pkl'))

In [63]:
# Pair each split's inputs (column X) with its outputs (column Y) in one frame per split
DfTrain = pd.DataFrame().assign(X=XTrain, Y=YTrain)
DfVal = pd.DataFrame().assign(X=XCv, Y=YCv)
DfTest = pd.DataFrame().assign(X=XTest, Y=YTest)

In [64]:
# Write the word-level train / validation / test frames into the PreProcessed folder.
# Paths are assembled with os.path.join because a backslash inside a normal string
# literal is an invalid escape sequence and is not a path separator outside Windows;
# the folder is created first if it does not exist yet.
os.makedirs('PreProcessed', exist_ok=True)
DfTrain.to_pickle(os.path.join('PreProcessed', 'WordLevelTrainingData.pkl'))
DfVal.to_pickle(os.path.join('PreProcessed', 'WordLevelValidationData.pkl'))
DfTest.to_pickle(os.path.join('PreProcessed', 'WordLevelTestingData.pkl'))