In [3]:
#Generic Packages
import numpy as np
from langdetect import detect
np.random.seed(1337)
import pandas as pd
import json
import re

#disable warning messages
import warnings
warnings.filterwarnings("ignore")

#logging and model serialization (pickle)
import logging
import pickle

#SKLearn Models
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier,LogisticRegression

#Tree Based SKLearn Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

#NLTK Packages
from nltk.corpus import stopwords
stop = stopwords.words('english')

#Splitting Package
from sklearn.model_selection import StratifiedShuffleSplit

In [4]:
#Importing Data
#Load the training spreadsheet into a DataFrame. The functions below read its
#'Content' (raw text) and 'Result' (label) columns. The path is relative to the
#notebook's working directory and uses Windows-style separators.
data = pd.read_excel("..\\en_smalltalk.xlsx")#training data path

In [5]:
#Regex for any data cleansing activities
def RegexRemoval(data, column='Content'):
    """Replace every non-letter character of a text column with a space.

    Parameters
    ----------
    data : mapping of column name -> iterable (typically a pandas DataFrame)
        Only ``data[column]`` is read; ``data`` itself is not modified.
    column : str, default 'Content'
        Name of the column holding the raw text.

    Returns
    -------
    list of str
        One cleaned string per row, in row order. Each character outside
        ``a-z`` / ``A-Z`` becomes a single space (runs are NOT collapsed, so
        the output has the same length as the input string).

    Notes
    -----
    Missing cells (``None`` or float NaN, e.g. empty spreadsheet cells) become
    an empty string, and other non-string cells are passed through ``str()``
    first; previously both cases raised ``TypeError`` inside ``re.sub``.
    """
    # Compile once instead of re-parsing the pattern for every row.
    non_letters = re.compile('[^a-zA-Z]')

    cleaned = []
    for value in data[column]:
        if value is None or (isinstance(value, float) and value != value):
            # NaN is the only float that is not equal to itself.
            value = ''
        elif not isinstance(value, str):
            value = str(value)
        # The former follow-up substitution that collapsed runs of '.' was
        # dead code: every '.' has already been turned into a space here.
        cleaned.append(non_letters.sub(' ', value))
    return cleaned

In [6]:
#feature engineering steps like stemming,lemmatization,tokenization can be handled below
def cleanData(text, lowercase = False, remove_stops = True, stemming = False, lemmatization = False):
    """Apply optional token-level normalisation steps to one piece of text.

    The steps run in this fixed order, each on the whitespace-split tokens of
    the previous step's output: lowercase -> stop-word removal -> stemming ->
    lemmatization. Every enabled step re-joins tokens with a single space, so
    whitespace is only normalised when at least one step is enabled.

    Parameters
    ----------
    text : object
        Input text; it is converted with ``str()`` first.
    lowercase : bool, default False
        Lower-case every token.
    remove_stops : bool, default True
        Drop tokens found in the module-level ``stop`` list. The comparison is
        exact (case-sensitive), so with ``lowercase=False`` a capitalised
        token is only dropped if ``stop`` contains that exact spelling.
    stemming : bool, default False
        Stem every token with NLTK's ``PorterStemmer``.
    lemmatization : bool, default False
        Lemmatize every token as a verb (``pos='v'``) with NLTK's
        ``WordNetLemmatizer``.

    Returns
    -------
    str
        The processed text.
    """
    txt = str(text)

    if lowercase:
        txt = " ".join(w.lower() for w in txt.split())

    if remove_stops:
        txt = " ".join(w for w in txt.split() if w not in stop)

    if stemming:
        # Imported here because the notebook's import cell never brings the
        # stemmer into scope; without this the branch raised NameError.
        from nltk.stem import PorterStemmer
        st = PorterStemmer() #choose different stemmers like lancaster for testing activities
        txt = " ".join(st.stem(w) for w in txt.split())

    if lemmatization:
        # Same reason as above: WordNetLemmatizer was never imported.
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join(wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split())

    return txt

In [7]:
def MLSplit(data, test_size=0.3):
    """Draw a stratified train/validation split and return the training part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose a ``content`` column (features) and a ``result`` column
        (labels used for stratification).
    test_size : float, default 0.3
        Fraction of rows assigned to the validation side of the split.

    Returns
    -------
    (X_train, y_train) : tuple of pandas.Series
        Training texts and their labels. The validation rows are only used
        for the printed shape report and are not returned.
    """
    X = data.content
    y = data.result

    #Stratified shuffle Split is used. Please use random split(if required)
    dataSplit = StratifiedShuffleSplit(n_splits=5, test_size=test_size, random_state=0)

    # Five splits are generated but only the last one is used. All five are
    # still consumed so the selected rows stay identical to what earlier
    # versions of this function produced for the same data.
    *_, (train_index, validation_index) = dataSplit.split(X, y)

    # split() yields positional indices, so select with .iloc; plain
    # X[train_index] is label-based and only coincides with positions when
    # the frame has a default RangeIndex.
    X_train, X_validation = X.iloc[train_index], X.iloc[validation_index]
    y_train, y_validation = y.iloc[train_index], y.iloc[validation_index]

    trainData = pd.concat([X_train, y_train], axis=1)
    validateData = pd.concat([X_validation, y_validation], axis=1)
    print("Train Data Features:",trainData.shape)
    print("Validation Data Features:",validateData.shape)

    return X_train,y_train

In [37]:
def _build_text_pipeline(classifier):
    """Wrap ``classifier`` in the count-vectorizer -> TF-IDF -> classifier pipeline shared by all models."""
    return Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', classifier)
                    ])


def MLTrain(data):
    """Clean the text, split it, then train and pickle six text classifiers.

    Steps:
      1. ``RegexRemoval`` strips non-letter characters from ``data['Content']``.
      2. A two-column frame (``content``, ``result``) is built from the cleaned
         text and ``data['Result']``; its head is printed.
      3. ``cleanData`` removes stop words from every ``content`` entry.
      4. ``MLSplit`` returns the training portion of a stratified split.
      5. For each configured classifier, a vectorizer/TF-IDF/classifier
         pipeline is fitted on the training portion and written with
         ``pickle`` to a fixed path, followed by a confirmation message.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Content`` column (raw text) and a ``Result`` column
        (labels). The caller's frame is not modified.

    Returns
    -------
    None
        The outputs are the pickle files and the printed progress lines.
    """
    #RegEx to remove the alpha numerical data
    cleaned_text = RegexRemoval(data)

    #Pre-Procesed Data Frame
    # Built positionally (cleaned_text[i] pairs with the i-th label) on a fresh
    # RangeIndex. The earlier concat-by-index only lined rows up correctly when
    # ``data`` already had a default index, and the subsequent in-place edits
    # were applied to a slice of the concatenated frame.
    prepared = pd.DataFrame({'content': cleaned_text,
                             'result': data['Result'].values})
    print(prepared.head())

    prepared['content'] = prepared['content'].map(
        lambda x: cleanData(x, lowercase=False, remove_stops=True, stemming=False, lemmatization=False))

    X_train,y_train = MLSplit(prepared)

    #Model Configuarions
    # One row per model: (classifier factory, pickle path, confirmation message).
    # Factories are lambdas so each estimator is only constructed when its turn
    # comes. Paths are kept exactly as before (Windows-style separators; note
    # the Naive Bayes file is named "GNB").
    model_specs = [
        (lambda: LogisticRegression(random_state=42,C=5),
         "..\\ml models\\smalltalk_model_LR.pkl",
         "Logistic Regression Model is saved"),
        (lambda: MultinomialNB(),
         "..\\ml models\\smalltalk_model_GNB.pkl",
         "Naive Bayes Model is saved"),
        (lambda: DecisionTreeClassifier(random_state=42),
         "..\\ml models\\smalltalk_model_DT.pkl",
         "Decision Tree Model is saved"),
        (lambda: RandomForestClassifier(random_state=42),
         "..\\ml models\\smalltalk_model_RF.pkl",
         "Random Forest Model is saved"),
        (lambda: LGBMClassifier(random_state=42,class_weight='balanced'),
         "..\\ml models\\smalltalk_model_LightBoost.pkl",
         "Light Boost Model is saved"),
        (lambda: AdaBoostClassifier(random_state=42),
         "..\\ml models\\smalltalk_model_AdaBoost.pkl",
         "Ada Boost Model is saved"),
    ]

    ##########Model Building#############

    #The below process takes care of feature vectorization(tfidf) and building models for various algorithm
    #with different hyperparameters and *pkl* files would be generated which is required for predictions.
    for make_classifier, pickle_path, saved_message in model_specs:
        pipeline = _build_text_pipeline(make_classifier())
        pipeline.fit(X_train,y_train)
        with open(pickle_path,"wb") as f:
            pickle.dump(pipeline,f)
        print(saved_message)
            

In [38]:
#Entry point: run the full clean -> split -> train -> pickle workflow on the
#DataFrame loaded above. The captured output below shows the frame head, the
#split shapes and one confirmation line per saved model.
if __name__ == "__main__":
    MLTrain(data)

                     content             result
0      good morning everyone  action > greeting
1       have a great morning  action > greeting
2  and a good morning to you  action > greeting
3        good morning to you  action > greeting
4         hello good morning  action > greeting
Train Data Features: (867, 2)
Validation Data Features: (372, 2)
Logistic Regression Model is saved
Naive Bayes Model is saved
Decision Tree Model is saved
Random Forest Model is saved
Light Boost Model is saved
Ada Boost Model is saved


In [53]:
def validation(text, model_path="..\\smalltalk_model_LR.pkl"):
    """Load a pickled model and print its prediction and confidence for ``text``.

    Parameters
    ----------
    text : str or iterable of str
        Utterance(s) to classify. A bare string is treated as a single
        document (it is wrapped in a list before being passed to the model).
    model_path : str, default "..\\\\smalltalk_model_LR.pkl"
        Location of the pickled model. The object stored there must provide
        ``predict`` and ``predict_proba``.

    Returns
    -------
    None
        Results are printed: the predicted label array, then the highest class
        probability rounded to two decimals (a scalar for one document, an
        array with one entry per document otherwise).
    """
    if isinstance(text, str):
        text = [text]

    #load the model
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # model_path at files produced by a trusted training run.
    # NOTE(review): MLTrain in this notebook writes its pickles under
    # "..\\ml models\\", while this default looks one directory up - confirm
    # which location is intended.
    with open(model_path,"rb") as f:
        model = pickle.load(f)

    prediction = model.predict(text)
    probabilities = np.asarray(model.predict_proba(text))

    # Highest probability per row. The earlier np.argmax without an axis
    # searched the flattened matrix, which yields a wrong or out-of-range
    # column index as soon as more than one document is passed.
    confidences = probabilities.max(axis=1)

    print("predicted category -->",prediction)
    if len(confidences) == 1:
        print("confidence score -->",round(float(confidences[0]),2))
    else:
        print("confidence score -->",np.round(confidences,2))
    

In [54]:
#Smoke test: classify one sample utterance with the saved model. The input is
#passed as a one-element list of documents.
text = ['I want to talk to your manager']
validation(text)

predicted category --> ['action > escalate']
confidence score --> 0.87
