In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [281]:
csv = pd.read_csv('data/dialog.csv', names=['input', 'category'])

In [282]:
csv

Unnamed: 0,input,category
0,Hello,greeting
1,Hi,greeting
2,How are you?,greeting
3,What's up?,greeting
4,I have a question.,help
5,I need help.,help
6,What can you do?,help
7,What can I ask you?,help
8,Help me,help
9,Help,help


In [283]:
from nltk import word_tokenize
from pymagnitude import *
from sklearn.preprocessing import MinMaxScaler
from scipy import sparse

glove = Magnitude(MagnitudeUtils.download_model('glove/medium/glove.6B.100d.magnitude'))

def avg_glove(df):
    vectors = []
    for text in df.input.values:
        vectors.append(np.average(glove.query(word_tokenize(text)), axis = 0))
    
    return np.array(vectors)

def has_word(df, word):
    result = np.zeros(len(df.index), dtype='int')
    
    for i, text in enumerate(df.input.values):
        for token in word_tokenize(text.lower()):
            if token == word:
                result[i] = 1
                break
    
    return result.reshape(-1,1)

def num_tokens(df):
    result = np.zeros(len(df.index), dtype='int')
    
    for i, text in enumerate(df.input.values):
        result[i] = len(word_tokenize(text))
    
    return result.reshape(-1,1)

def num_chars(df):
    result = np.zeros(len(df.index), dtype='int')    
    for i, text in enumerate(df.input.values):
        result[i] = len(text)    
    return result.reshape(-1,1)

def featurize(df):
    vectors = avg_glove(df)
#     what = has_word(df, 'what')
#     who = has_word(df, 'who')
#     where = has_word(df, 'where')
#     when = has_word(df, 'when')
#     why = has_word(df, 'why')
#     how = has_word(df, 'how')
#     qm = has_word(df, '?')
#     token_count = num_tokens(df)
#     char_count = num_chars(df)
        
#     features = np.hstack((
#         what,
#         who,
#         where,
#         when,
#         why,
#         how,
#         qm,
# #         token_count,
# #         char_count
#     ))
    
    
#     vector_scaler = MinMaxScaler()
#     vectors = vector_scaler.fit_transform(vectors)
    
#     feature_scaler = MinMaxScaler()
#     features = feature_scaler.fit_transform(features)
        
    #print(features.shape)
    #print(vectors.shape)
    return vectors

In [284]:
x_train = featurize(csv)

In [285]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(csv.category.values)

print(y_train)

[0 0 0 0 1 1 1 1 1 1 5 5 5 5 4 4 4 4 3 3 3 1 3 0 2 2 2 2 2 3 3 3 3 0 3 3 3
 3 2 2 3 3 4 3 3 5 5 5 5 3]


In [286]:
from sklearn.linear_model import SGDClassifier

log_reg = SGDClassifier(loss = 'log', n_jobs = -1, penalty = 'l2')
log_reg.fit(x_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [287]:
def create_df(text):
    df = pd.DataFrame({'input': [text]})
    return featurize(df) 

def classify(text):
    x = create_df(text)    
        
    predicted = log_reg.predict(x)
    return le.inverse_transform(predicted)

inputs = [
    'thanks',
    'how do i request an extension?',
    'can you help me?',
    'what is the first step in creating an s-corp?',
    'how should i pick my board members?',
    'what should i do if my employees quit?',
    'thank you so much!',
    'this helped me a lot'
]


for i in inputs:
    print(i, classify(i))
    
df = create_df(inputs[-3])
print(df.shape)
eli5.show_prediction(log_reg, df[0])

thanks ['thanks']
how do i request an extension? ['question']
can you help me? ['help']
what is the first step in creating an s-corp? ['question']
how should i pick my board members? ['question']
what should i do if my employees quit? ['question']
thank you so much! ['thanks']
this helped me a lot ['thanks']
(1, 100)


Contribution?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Contribution?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Contribution?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Contribution?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Contribution?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Contribution?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+1.069,x4,,,,
+0.792,x66,,,,
+0.689,x3,,,,
+0.589,x61,,,,
+0.538,x44,,,,
+0.458,x2,,,,
+0.373,x56,,,,
+0.349,x39,,,,
+0.334,x64,,,,
+0.322,x27,,,,

Contribution?,Feature
1.069,x4
0.792,x66
0.689,x3
0.589,x61
0.538,x44
0.458,x2
0.373,x56
0.349,x39
0.334,x64
0.322,x27

Contribution?,Feature
3.857,x55
1.423,x84
1.283,x18
1.008,x4
0.699,x89
0.644,x43
0.611,x59
0.561,x44
0.529,x67
0.517,x53

Contribution?,Feature
1.174,x82
0.817,x2
0.763,x3
0.758,x61
0.598,x64
0.456,x55
0.321,x12
0.314,x54
0.303,x53
0.281,x15

Contribution?,Feature
5.345,<BIAS>
2.842,x58
0.956,x31
0.77,x49
0.617,x44
0.597,x98
0.509,x43
0.5,x78
0.496,x17
0.314,x73

Contribution?,Feature
1.229,x84
0.935,x55
0.796,x61
0.572,x28
0.556,x94
0.535,x49
0.483,x43
0.441,x73
0.353,x27
0.211,x62

Contribution?,Feature
2.455,x55
2.159,x84
2.009,x49
1.109,x53
0.596,x66
0.516,x62
0.471,x19
0.395,x88
0.386,x58
0.358,x23


In [288]:
import eli5

    
#eli5.show_weights(log_reg)


In [104]:
le.classes_

array(['greeting', 'help', 'other', 'question', 'sendoff', 'thanks'],
      dtype=object)

In [290]:
import pickle

with open('intent_model.pkl', 'wb') as f:
    f.write(pickle.dumps((le, log_reg)))