In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/leo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
seed = 42

In [3]:
import pandas as pd

from sklearn import preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers



import pandas as pd
import numpy as np
import string
import re

# import pandas, xgboost, numpy, textblob, string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag, word_tokenize

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

Using TensorFlow backend.


In [4]:
lemmatizer

<WordNetLemmatizer>

In [5]:
df = pd.read_csv("../data/tech_test_data-1.csv").drop("message_id", axis = 1)

customer_df = df[df["message_source"] == "customer"][["message","case_type"]]
convos_df = df.groupby([ "conversation_id", "case_type"])["message"].apply(lambda x: ' '.join(x)).reset_index()

In [6]:
customer_df

Unnamed: 0,message,case_type
0,"Hi, I’d like to cancel my order please.",cancel_order
2,"Sure, my order ID is A8B9V1E9 and account numb...",cancel_order
4,"Hi, please give me some assistance cancelling ...",cancel_order
6,"Yeah account number 09832453, order BSD932X0",cancel_order
8,"Hello, I need to cancel an order",cancel_order
10,"No worries, my order ID is BEDSW912, let me ch...",cancel_order
11,Account number 67223023,cancel_order
13,"Hey hey, I ordered something yesterday but it ...",cancel_order
15,Order ID 87GHE8EU and account number 98234321,cancel_order
17,"Hi, thanks for helping out – I want to cancel ...",cancel_order


In [7]:
customer_df["message"].iloc[7]

'Hey hey, I ordered something yesterday but it was the wrong item – can I still cancel that?'

In [8]:
lemmatizer.lemmatize("persons")

'person'

## NLP / Preprocessing

In [9]:
def lemmatize(phraze):
    new_phraze = []
    for word, tag in pos_tag(word_tokenize(phraze)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            new_phraze.append(word)
        else:
            new_phraze.append(lemmatizer.lemmatize(word, wntag))
        
    return " ".join(new_phraze)

def replace_num(phrase):
    return re.sub(" \d+", " _number_", phrase)

def replace_orderID(phrase):
    return re.sub("([A-Za-z]+[\d@]+[\w@]*|[\d@]+[A-Za-z]+[\w@]*)", "_orderID_", phrase)

def remove_punc(phraze):
    return re.sub(r'[^\w\s]',"",phraze)

def remove_extra_space(phraze):
    return re.sub(' +', ' ', phraze)
    
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"hasn\’t", "has not", phrase)
    phrase = re.sub(r"haven\’t", "has not", phrase)
    phrase = re.sub(r"\’d", " would", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def preprocessing(df):
    new_df = df.copy()
    
    
    new_df['message'] = new_df['message'].apply(lambda x: x.lower()).\
        apply(lambda x: decontracted(x)).\
        apply(lambda x: " ".join([item for item in x.split() if item not in stop_words])).\
        apply(lambda x: replace_num(x)).\
        apply(lambda x: replace_orderID(x)).\
        apply(lambda x: lemmatize(x)).\
        apply(lambda x: remove_punc(x)).\
        apply(lambda x: remove_extra_space(x))
    
    
    return new_df


clean_df = preprocessing(customer_df)


In [10]:
clean_df["message"].iloc[5]

'worry order id _orderID_ let check account number'

## Split Train and Validation sets
I am using Stratisfied split in order to make sure i have a balanced train and test dataset

In [11]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.4, random_state=seed)

X = clean_df.drop('case_type', axis=1)
y = clean_df['case_type'].apply(lambda x: 0 if x=="cancel_order" else 1)


for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# X_train, X_test, y_train, y_test = train_test_split(X['message'], y)

X_train = X_train["message"]
X_test = X_test["message"]

In [12]:
X_train.apply(lambda x: x.lower())

6          yeah account number _number_ order _orderid_
85                              account number _number_
49                              account number _number_
66    hi order due arrive today sure stay home check...
27    hello order service yesterday change mind need...
72    course order id _number__orderid_ account numb...
8                               hello need cancel order
41           order id _orderid_ account number _number_
50                        order id _number_ba _orderid_
31                              would like cancel order
11                              account number _number_
15    order id _number__orderid_ account number _num...
19                              yeah let check two secs
64                        order id _number_ba _orderid_
20                              account number _number_
52              hi order suppose arrive yesterday news 
76           order id _orderid_ account number _number_
21                        order id _number_ba _o

## 2. Feature Engineering

The next step is the feature engineering step. In this step, raw text data will be transformed into feature vectors and new features will be created using the existing dataset. We will implement the following different ideas in order to obtain relevant features from our dataset.

2.1 Count Vectors as features <br />
2.2 TF-IDF Vectors as features <br />
2.2.1 Word level <br />
2.2.2 N-Gram level <br />
2.2.3 Character level <br />
2.3 Word Embeddings as features <br />
2.4 Text / NLP based features <br />
2.5 Topic Models as features <br />

#### 2.1 Count Vectors as features 

In [13]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(X['message'])

# transform the training and validation data using count vectorizer object
X_train_count =  count_vect.transform(X_train)
X_test_count =  count_vect.transform(X_test)

In [14]:
X_train_count.toarray()[3]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0])

####  2.2 TF-IDF Vectors as features 


In [15]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(X['message'])

X_train_tfidf =  tfidf_vect.transform(X_train)
X_test_tfidf =  tfidf_vect.transform(X_test)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3))
tfidf_vect_ngram.fit(X['message'])
X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train)
X_test_tfidf_ngram =  tfidf_vect_ngram.transform(X_test)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3))
tfidf_vect_ngram_chars.fit(X['message'])
X_train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_train) 
X_test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(X_test) 

####  2.3 Word Embeddings


In [16]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('../data2/wiki-news-300d-1M.vec',encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(X['message'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
X_train_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=70)
X_test_seq_x = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=70)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

#### 2.4  NLP based features

In [17]:
X['char_count'] = X['message'].apply(len)
X['word_count'] = X['message'].apply(lambda x: len(x.split()))
X['word_density'] = X['char_count'] / (X['word_count']+1)
X['punctuation_count'] = X['message'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
X['title_word_count'] = X['message'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X['upper_case_word_count'] = X['message'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [18]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

X['noun_count'] = X['message'].apply(lambda x: check_pos_tag(x, 'noun'))
X['verb_count'] = X['message'].apply(lambda x: check_pos_tag(x, 'verb'))
X['adj_count'] = X['message'].apply(lambda x: check_pos_tag(x, 'adj'))
X['adv_count'] = X['message'].apply(lambda x: check_pos_tag(x, 'adv'))
X['pron_count'] = X['message'].apply(lambda x: check_pos_tag(x, 'pron'))

In [19]:
X

Unnamed: 0,message,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,noun_count,verb_count,adj_count,adv_count,pron_count
0,hi would like cancel order please,34,6,4.857143,0,0,0,0,0,0,0,0
2,sure order id _orderID_ account number _number_,47,7,5.875,4,0,0,0,0,0,0,0
4,hi please give assistance cancel order,39,6,5.571429,0,0,0,0,0,0,0,0
6,yeah account number _number_ order _orderID_,44,6,6.285714,4,0,0,0,0,0,0,0
8,hello need cancel order,23,4,4.6,0,0,0,0,0,0,0,0
10,worry order id _orderID_ let check account number,49,8,5.444444,2,0,0,0,0,0,0,0
11,account number _number_,23,3,5.75,2,0,0,0,0,0,0,0
13,hey hey order something yesterday wrong item s...,63,10,5.727273,0,0,0,0,0,0,0,0
15,order id _number__orderID_ account number _num...,50,6,7.142857,6,0,0,0,0,0,0,0
17,hi thanks help want cancel order,32,6,4.571429,0,0,0,0,0,0,0,0


#### 2.5 Topic Models as features

In [20]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_train_topics = lda_model.fit_transform(X_train_count)
X_test_topics = lda_model.fit_transform(X_test_count)


topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 5
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [21]:
X_train_topics.shape

(28, 20)

## 3.0 Model Selection

In [22]:
X_train_tfidf_ngram.shape

(28, 228)

In [23]:
def train_model(clf,clf_name, is_neural_net=False):
    
    data = {"count": [X_train_count,X_test_count ],
            "tfidf": [X_train_tfidf, X_test_tfidf],
            "ngram": [X_train_tfidf_ngram, X_test_tfidf_ngram],
            "chars": [X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars],
            "topics":[X_train_topics,X_test_topics ],
            "embeddings" : [X_train_seq_x,X_test_seq_x ]
           }
    print(clf_name)
    print("-------")
    for k in data.keys():
        train, test = data[k]
        # fit the training dataset on the classifier
        clf.fit(train, y_train)

        # predict the labels on validation dataset
        y_pred = clf.predict(test)

        if is_neural_net:
            y_pred = y_pred.argmax(axis=-1)

        acc = metrics.accuracy_score(y_pred, y_test)
        print("{} Accuracy: {}".format(k, acc))
    print()

In [24]:
X_train_seq_x.shape

(28, 70)

In [25]:
classsifiers = {"Naive Bayes":naive_bayes.MultinomialNB(),
                "Logistic Regression":linear_model.LogisticRegression(random_state=seed),
                "SVM": svm.SVC(random_state=seed),
                "rfc": ensemble.RandomForestClassifier(random_state=seed),
                "gbc": ensemble.GradientBoostingClassifier(random_state=seed)}

In [26]:
for clf_name in classsifiers.keys():
    train_model(classsifiers[clf_name], clf_name)
    

Naive Bayes
-------
count Accuracy: 0.5789473684210527
tfidf Accuracy: 0.5789473684210527
ngram Accuracy: 0.42105263157894735
chars Accuracy: 0.5263157894736842
topics Accuracy: 0.5789473684210527
embeddings Accuracy: 0.42105263157894735

Logistic Regression
-------
count Accuracy: 0.5789473684210527
tfidf Accuracy: 0.631578947368421
ngram Accuracy: 0.42105263157894735
chars Accuracy: 0.47368421052631576
topics Accuracy: 0.5789473684210527
embeddings Accuracy: 0.5263157894736842

SVM
-------
count Accuracy: 0.631578947368421
tfidf Accuracy: 0.5789473684210527
ngram Accuracy: 0.42105263157894735
chars Accuracy: 0.7368421052631579
topics Accuracy: 0.47368421052631576
embeddings Accuracy: 0.42105263157894735

rfc
-------
count Accuracy: 0.2631578947368421
tfidf Accuracy: 0.47368421052631576
ngram Accuracy: 0.5789473684210527
chars Accuracy: 0.5263157894736842
topics Accuracy: 0.631578947368421
embeddings Accuracy: 0.5263157894736842

gbc
-------
count Accuracy: 0.7368421052631579
tfidf Ac



ngram Accuracy: 0.7368421052631579
chars Accuracy: 0.5263157894736842
topics Accuracy: 0.5263157894736842
embeddings Accuracy: 0.47368421052631576



### 3.5 LSTM

In [30]:
def create_rnn_lstm(input_layer):
    embedings_size = 300
    
    # Add an Input Layer
    input_layer = layers.Input((input_layer, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, embedings_size, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.2)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.2)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model


data = {
#     "count": [X_train_count,X_test_count ],
#         "tfidf": [X_train_tfidf, X_test_tfidf],
#         "ngram": [X_train_tfidf_ngram, X_test_tfidf_ngram],
#         "chars": [X_train_tfidf_ngram_chars, X_test_tfidf_ngram_chars],
#         "topics":[X_train_topics,X_test_topics ],
        "embeddings" : [X_train_seq_x,X_test_seq_x ]
       }

print("LSTM")
print("-------")
for k in data.keys():
    train, test = data[k]
    
    clf = create_rnn_lstm(input_layer = train.shape[1] )

    # fit the training dataset on the classifier
    clf.fit(train, y_train, nb_epoch = 10)

    # predict the labels on validation dataset
    y_pred = clf.predict(test)
    y_pred = y_pred.argmax(axis=-1)

    acc = metrics.accuracy_score(y_pred, y_test)
    print("{} Accuracy: {}".format(k, acc))

LSTM
-------




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
embeddings Accuracy: 0.47368421052631576
