# Model Creation

This Jupyter notebook is where I created the logistic regression model that will be used by the server.

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import os as os
import re, string, unicodedata
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nigel.hussain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nigel.hussain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the raw conversation export
tech_df = pd.read_csv('tech_test_data.csv')

# Keep only the rows written by customers (agent rows are dropped)
tech_customer_df = tech_df.loc[tech_df.message_source == 'customer']

# Keep only rows whose message_number is 1; per the original note this is
# what removes the account-number and order-ID rows
tech_customer_modified_df = tech_customer_df.loc[tech_customer_df.message_number.eq(1)]
tech_customer_modified_df

Unnamed: 0,message,case_type,conversation_id,message_id,message_number,message_source
0,"Hi, I’d like to cancel my order please.",cancel_order,1,1,1,customer
4,"Hi, please give me some assistance cancelling ...",cancel_order,2,5,1,customer
8,"Hello, I need to cancel an order",cancel_order,3,9,1,customer
13,"Hey hey, I ordered something yesterday but it ...",cancel_order,4,14,1,customer
17,"Hi, thanks for helping out – I want to cancel ...",cancel_order,5,18,1,customer
23,I need to cancel my order,cancel_order,6,24,1,customer
27,"Hello, I ordered from your service yesterday b...",cancel_order,7,28,1,customer
31,I’d like to cancel an order with you,cancel_order,8,32,1,customer
35,"Someone ordered something on my account, I nee...",cancel_order,9,36,1,customer
39,I would like to cancel the order I made last w...,cancel_order,10,40,1,customer


In [3]:
# Split the filtered frame into model input (message text) and target (case type)
X = tech_customer_modified_df['message']
y = tech_customer_modified_df['case_type']

In [4]:
# First round of preprocessing the training data
def replace_contractions(text):
    """Return ``text`` with its contractions expanded.

    The work is delegated entirely to ``contractions.fix``.
    """
    expanded = contractions.fix(text)
    return expanded

In [5]:
# Expand the contractions in every customer message.
# The iteration variable is deliberately not called `string`: that name is
# bound to the standard-library module imported in the first cell, and a
# top-level `for string in ...` loop would rebind it for the rest of the notebook.
X_list = list(X)
X_preprocessed = [replace_contractions(message) for message in X_list]

print(X_preprocessed)

['Hi, I would like to cancel my order please.', 'Hi, please give me some assistance cancelling my order.', 'Hello, I need to cancel an order', 'Hey hey, I ordered something yesterday but it was the wrong item – can I still cancel that?', 'Hi, thanks for helping out – I want to cancel an order', 'I need to cancel my order', 'Hello, I ordered from your service yesterday but I changed my mind, I need to cancel', 'I would like to cancel an order with you', 'Someone ordered something on my account, I need to cancel it', 'I would like to cancel the order I made last week if it has not shipped yet', 'I paid for an order last week, what the hell is going on?', 'Can you check if my order has shipped yet', 'Hi, my order was supposed to arrive yesterday, any news?', 'Hello there, please can I check where my order is', 'Hello, I would like to check if my order shipped yet', 'Hi, I have an order due to arrive today and not sure if I should stay home, can you check where it is?', 'Can I check what i

In [6]:
# Tokenise each contraction-free message into a list of tokens.
# A comprehension keeps the iteration variable local, so the imported
# `string` module is no longer overwritten by a loop variable of that name.
X_tokenised = [nltk.word_tokenize(message) for message in X_preprocessed]

print(X_tokenised)

[['Hi', ',', 'I', 'would', 'like', 'to', 'cancel', 'my', 'order', 'please', '.'], ['Hi', ',', 'please', 'give', 'me', 'some', 'assistance', 'cancelling', 'my', 'order', '.'], ['Hello', ',', 'I', 'need', 'to', 'cancel', 'an', 'order'], ['Hey', 'hey', ',', 'I', 'ordered', 'something', 'yesterday', 'but', 'it', 'was', 'the', 'wrong', 'item', '–', 'can', 'I', 'still', 'cancel', 'that', '?'], ['Hi', ',', 'thanks', 'for', 'helping', 'out', '–', 'I', 'want', 'to', 'cancel', 'an', 'order'], ['I', 'need', 'to', 'cancel', 'my', 'order'], ['Hello', ',', 'I', 'ordered', 'from', 'your', 'service', 'yesterday', 'but', 'I', 'changed', 'my', 'mind', ',', 'I', 'need', 'to', 'cancel'], ['I', 'would', 'like', 'to', 'cancel', 'an', 'order', 'with', 'you'], ['Someone', 'ordered', 'something', 'on', 'my', 'account', ',', 'I', 'need', 'to', 'cancel', 'it'], ['I', 'would', 'like', 'to', 'cancel', 'the', 'order', 'I', 'made', 'last', 'week', 'if', 'it', 'has', 'not', 'shipped', 'yet'], ['I', 'paid', 'for', 'an

In [7]:
# Additional Preprocessing
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in ``words``.

    Each token is NFKD-normalised and then encoded to ASCII with errors
    ignored, so characters that have no ASCII form are dropped. The result
    has one entry per input token; a token made up solely of non-ASCII
    characters becomes the empty string.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercased form of every token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed; tokens that consisted only of such characters are omitted from
    the returned list.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Spell out the all-digit tokens in a list of tokenized words.

    Tokens for which ``str.isdigit()`` is true are replaced by the output of
    inflect's ``number_to_words``; every other token is passed through
    unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: list of string tokens.

    Returns:
        A new list with the tokens of ``words``, in their original order,
        that do not appear in NLTK's English stop-word list.
    """
    # Build the stop-word collection once per call instead of once per token,
    # and use a set so each membership test is a hash lookup rather than a
    # linear scan of the list.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def normalize(words):
    """Run the full token clean-up pipeline over ``words`` and return the result.

    The steps are applied in this order: non-ASCII removal, lowercasing,
    punctuation removal, number spelling, stop-word removal.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [8]:
# Apply the clean-up pipeline to every tokenised message
X_normalised = [normalize(message_tokens) for message_tokens in X_tokenised]

print(X_normalised)

[['hi', 'would', 'like', 'cancel', 'order', 'please'], ['hi', 'please', 'give', 'assistance', 'cancelling', 'order'], ['hello', 'need', 'cancel', 'order'], ['hey', 'hey', 'ordered', 'something', 'yesterday', 'wrong', 'item', 'still', 'cancel'], ['hi', 'thanks', 'helping', 'want', 'cancel', 'order'], ['need', 'cancel', 'order'], ['hello', 'ordered', 'service', 'yesterday', 'changed', 'mind', 'need', 'cancel'], ['would', 'like', 'cancel', 'order'], ['someone', 'ordered', 'something', 'account', 'need', 'cancel'], ['would', 'like', 'cancel', 'order', 'made', 'last', 'week', 'shipped', 'yet'], ['paid', 'order', 'last', 'week', 'hell', 'going'], ['check', 'order', 'shipped', 'yet'], ['hi', 'order', 'supposed', 'arrive', 'yesterday', 'news'], ['hello', 'please', 'check', 'order'], ['hello', 'would', 'like', 'check', 'order', 'shipped', 'yet'], ['hi', 'order', 'due', 'arrive', 'today', 'sure', 'stay', 'home', 'check'], ['check', 'going', 'order', 'weekend'], ['would', 'like', 'find', 'order',

In [9]:
# Reduce each message to a single binary feature:
#   0 -> the normalised tokens contain 'cancel' or 'cancelling'
#   1 -> neither keyword is present
X_final = [
    0 if ('cancel' in message_tokens or 'cancelling' in message_tokens) else 1
    for message_tokens in X_normalised
]

# Convert the feature list to a NumPy array and show it as a single row
X_final_array = np.asarray(X_final)
print(X_final_array.reshape(1, -1))

[[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]]


In [10]:
# Encode the case-type labels as integers, fitting the encoder on the two
# known class names
enc = preprocessing.LabelEncoder()
enc.fit(['cancel_order', 'order_status'])

# Sanity check: classes known to the encoder
print(enc.classes_)

# Integer-encoded targets for every sample in y
y_preprocess = enc.transform(y)

# Sanity check: encoded labels
print(y_preprocess)

['cancel_order' 'order_status']
[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1]


In [11]:
# Split features and labels 70/30 into training and testing sets; the fixed
# random_state makes the split repeatable
train_X, test_X, train_Y, test_Y = train_test_split(
    X_final_array,
    y_preprocess,
    train_size=.7,
    test_size=.3,
    random_state=123,
)

# Wrap every split in a DataFrame (per the original note, this avoids size errors)
train_X_df, train_Y_df, test_X_df, test_Y_df = (
    pd.DataFrame(split) for split in (train_X, train_Y, test_X, test_Y)
)

In [12]:
# Train a logistic-regression classifier on the training split and predict
# the held-out samples
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
# fit() expects a 1-D target. Passing the single-column DataFrame directly
# works but emits a DataConversionWarning (column-vector y), so the labels
# are flattened to a 1-D array first; the fitted model is the same.
lr.fit(train_X_df, train_Y_df.values.ravel())
results = lr.predict(test_X_df)

print(results)

[1 0 0 1 0 0]


  y = column_or_1d(y, warn=True)


In [13]:
# Sanity check: map the predicted integer classes back to their label names
predicted_labels = enc.inverse_transform(results)
list(predicted_labels)

['order_status',
 'cancel_order',
 'cancel_order',
 'order_status',
 'cancel_order',
 'cancel_order']

In [14]:
# Save the trained model with joblib (chosen, per the original note, because
# it is well suited to scikit-learn models).
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so import the standalone package first and fall back to the
# vendored copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(lr, 'model.pkl')
print("Model dumped!")

Model dumped!


In [15]:
# Load the model that was just saved, rebinding `lr` to the object read back
# from 'model.pkl'.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code; only load model files that come from a trusted source.
lr = joblib.load('model.pkl')