In [107]:
import pandas as pd
import numpy as np

In [108]:
csv = pd.read_csv('../data/dialog.csv')

In [109]:
csv

Unnamed: 0,input,category
0,Hello,greeting
1,Hi,greeting
2,How are you?,greeting
3,What's up?,greeting
4,I have a question.,help
5,I need help.,help
6,What can you do?,help
7,What can I ask you?,help
8,Help me,help
9,Help,help


In [283]:
from nltk import word_tokenize
from pymagnitude import *
from sklearn.preprocessing import MinMaxScaler
from scipy import sparse

glove = Magnitude(MagnitudeUtils.download_model('glove/medium/glove.6B.100d.magnitude'))

def avg_glove(df):
    vectors = []
    for text in df.input.values:
        vectors.append(np.average(glove.query(word_tokenize(text)), axis = 0))
    
    return np.array(vectors)

def has_word(df, word):
    result = np.zeros(len(df.index), dtype='int')
    
    for i, text in enumerate(df.input.values):
        for token in word_tokenize(text.lower()):
            if token == word:
                result[i] = 1
                break
    
    return result.reshape(-1,1)

def num_tokens(df):
    result = np.zeros(len(df.index), dtype='int')
    
    for i, text in enumerate(df.input.values):
        result[i] = len(word_tokenize(text))
    
    return result.reshape(-1,1)

def num_chars(df):
    result = np.zeros(len(df.index), dtype='int')    
    for i, text in enumerate(df.input.values):
        result[i] = len(text)    
    return result.reshape(-1,1)

def featurize(df):
    vectors = avg_glove(df)
#     what = has_word(df, 'what')
#     who = has_word(df, 'who')
#     where = has_word(df, 'where')
#     when = has_word(df, 'when')
#     why = has_word(df, 'why')
#     how = has_word(df, 'how')
#     qm = has_word(df, '?')
#     token_count = num_tokens(df)
#     char_count = num_chars(df)
        
#     features = np.hstack((
#         what,
#         who,
#         where,
#         when,
#         why,
#         how,
#         qm,
# #         token_count,
# #         char_count
#     ))
    
    
#     vector_scaler = MinMaxScaler()
#     vectors = vector_scaler.fit_transform(vectors)
    
#     feature_scaler = MinMaxScaler()
#     features = feature_scaler.fit_transform(features)
        
    #print(features.shape)
    #print(vectors.shape)
    return vectors

In [284]:
x_train = featurize(csv)

In [285]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(csv.category.values)

print(y_train)

[0 0 0 0 1 1 1 1 1 1 5 5 5 5 4 4 4 4 3 3 3 1 3 0 2 2 2 2 2 3 3 3 3 0 3 3 3
 3 2 2 3 3 4 3 3 5 5 5 5 3]


In [286]:
from sklearn.linear_model import SGDClassifier

log_reg = SGDClassifier(loss = 'log', n_jobs = -1, penalty = 'l2')
log_reg.fit(x_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [287]:
def create_df(text):
    df = pd.DataFrame({'input': [text]})
    return featurize(df) 

def classify(text):
    x = create_df(text)    
        
    predicted = log_reg.predict(x)
    return le.inverse_transform(predicted)

inputs = [
    'thanks',
    'how do i request an extension?',
    'can you help me?',
    'what is the first step in creating an s-corp?',
    'how should i pick my board members?',
    'what should i do if my employees quit?',
    'thank you so much!',
    'this helped me a lot'
]


for i in inputs:
    print(i, classify(i))
    
df = create_df(inputs[-3])
print(df.shape)
eli5.show_prediction(log_reg, df[0])

thanks ['thanks']
how do i request an extension? ['question']
can you help me? ['help']
what is the first step in creating an s-corp? ['question']
how should i pick my board members? ['question']
what should i do if my employees quit? ['question']
thank you so much! ['thanks']
this helped me a lot ['thanks']
(1, 100)


Contribution?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0
Contribution?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Contribution?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Contribution?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Contribution?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4
Contribution?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5
+1.069,x4,,,,
+0.792,x66,,,,
+0.689,x3,,,,
+0.589,x61,,,,
+0.538,x44,,,,
+0.458,x2,,,,
+0.373,x56,,,,
+0.349,x39,,,,
+0.334,x64,,,,
+0.322,x27,,,,

Contribution?,Feature
1.069,x4
0.792,x66
0.689,x3
0.589,x61
0.538,x44
0.458,x2
0.373,x56
0.349,x39
0.334,x64
0.322,x27

Contribution?,Feature
3.857,x55
1.423,x84
1.283,x18
1.008,x4
0.699,x89
0.644,x43
0.611,x59
0.561,x44
0.529,x67
0.517,x53

Contribution?,Feature
1.174,x82
0.817,x2
0.763,x3
0.758,x61
0.598,x64
0.456,x55
0.321,x12
0.314,x54
0.303,x53
0.281,x15

Contribution?,Feature
5.345,<BIAS>
2.842,x58
0.956,x31
0.77,x49
0.617,x44
0.597,x98
0.509,x43
0.5,x78
0.496,x17
0.314,x73

Contribution?,Feature
1.229,x84
0.935,x55
0.796,x61
0.572,x28
0.556,x94
0.535,x49
0.483,x43
0.441,x73
0.353,x27
0.211,x62

Contribution?,Feature
2.455,x55
2.159,x84
2.009,x49
1.109,x53
0.596,x66
0.516,x62
0.471,x19
0.395,x88
0.386,x58
0.358,x23


In [288]:
import eli5

    
#eli5.show_weights(log_reg)


In [104]:
le.classes_

array(['greeting', 'help', 'other', 'question', 'sendoff', 'thanks'],
      dtype=object)

In [290]:
import pickle

with open('intent_model.pkl', 'wb') as f:
    f.write(pickle.dumps((le, log_reg)))

In [5]:
import tensorflow_hub as hub

In [6]:
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [9]:
csv['embeddings'] = embed(csv['input']).numpy().tolist()

In [10]:
csv

Unnamed: 0,input,category,embeddings
0,Hello,greeting,"[-0.03524598851799965, -0.07763014733791351, 0..."
1,Hi,greeting,"[-0.005443588364869356, -0.06554596871137619, ..."
2,How are you?,greeting,"[-0.05739153176546097, -0.018251711502671242, ..."
3,What's up?,greeting,"[-0.05163351818919182, -0.022842010483145714, ..."
4,I have a question.,help,"[0.011111502535641193, 0.004057114943861961, 0..."
5,I need help.,help,"[-0.05238720029592514, -0.008065782487392426, ..."
6,What can you do?,help,"[-0.05967104062438011, 0.013132442720234394, 0..."
7,What can I ask you?,help,"[-0.08805720508098602, -0.0011457621585577726,..."
8,Help me,help,"[-0.05955479294061661, -0.012867441400885582, ..."
9,Help,help,"[-0.07254594564437866, -0.0034580896608531475,..."


In [None]:
csv.groupby('category').apply(lambda row: np.mean(row['embeddings']))

In [119]:
centroids = {}

for category, examples in csv.groupby('category'):
    centroids[category] = np.mean(embed(examples['input']).numpy(), axis=0)

In [121]:
centroids.keys()

dict_keys(['greeting', 'help', 'other', 'question', 'sendoff', 'thanks'])

In [122]:
from scipy import spatial

def similarity(text, centroid):
    e = embed([text])[0]
    return spatial.distance.cosine(centroid, e)

In [124]:
def find_similar(text):
    results = []
    for category, centroid in centroids.items():
        sim = similarity(text, centroid)
        results.append((category, sim))
        
    results.sort(key=lambda x: x[1])
    return results

In [140]:
print(find_similar('hello'))

[('greeting', 0.09324604272842407), ('sendoff', 0.5445320904254913), ('help', 0.5493127107620239), ('thanks', 0.5960005819797516), ('other', 0.7855579107999802), ('question', 0.9759620800614357)]


In [114]:
print(similarity('who was the first president?'))

0.9439601674675941


In [141]:
import elasticsearch
elastic = elasticsearch.Elasticsearch()

def classify(text):
        embeddings = embed([text])[0].numpy()

        query = {
            'from': 0,
            'size': 4,
            'query': {
                'script_score': {
                    'query': {
                        'match_all': {}
                    },
                    'script': {
                        'source': "cosineSimilarity(params.query_vector, doc['centroid'])",
                        'params': {
                            'query_vector': embeddings.tolist()
                        }
                    }
                }
            }
        }

        results = elastic.search(index='classes', body=query)

        for result in results['hits']['hits']:
            document = result['_source']
            score = result['_score']
            category = document['category']

            print(f'input: "{text}" - result {score}: {category}')

            #return category
        
print(classify("who is the president?"))

input: "who is the president?" - result 0.18618529: help
input: "who is the president?" - result 0.12692668: greeting
input: "who is the president?" - result -0.036398895: sendoff
input: "who is the president?" - result -0.057310417: thanks
None


In [47]:
def foo(row):
    return np.mean(embed(row['input']).numpy(), axis=0)

centroids = csv.groupby('category').apply(foo)

In [53]:
def classify(text):
    e = embed([text])[0]
    return centroids.apply(lambda x: spatial.distance.cosine(x, e))

In [59]:
classify('where would i send paperwork for starting my company?').argmin()

3

In [57]:
classify('hello')

category
greeting    0.093246
help        0.549313
other       0.785558
question    0.975962
sendoff     0.544532
thanks      0.596001
dtype: float64

In [62]:
for row in centroids:
    print(row)

[-2.30304282e-02 -5.11824936e-02  3.98516208e-02  1.67592783e-02
  5.43817990e-02  2.36937199e-02  4.43239957e-02 -5.22480421e-02
  4.35496755e-02  5.11609614e-02  5.15066795e-02  3.90513279e-02
 -5.37077384e-03  6.78102896e-02  3.66622233e-03 -4.28453870e-02
 -2.67570037e-02  9.17508546e-03 -5.52909030e-03 -6.85622543e-02
  1.04836598e-02  1.80118326e-02  7.87182420e-04  2.81376932e-02
 -6.94169477e-02  1.85550470e-02  3.18540186e-02 -3.03855091e-02
 -4.92880028e-03  1.08825164e-02  4.18553688e-02  2.86824256e-02
  1.36530632e-02 -1.84677187e-02 -3.75696011e-02  2.66547650e-02
  4.01885947e-03  1.83171574e-02  2.96419486e-02  4.94157895e-03
 -4.58278740e-03  2.42862422e-02  1.79680008e-02 -2.16126498e-02
  8.09650496e-03  3.77135091e-02 -5.40117435e-02 -2.38728952e-02
 -5.80587983e-03  8.94072000e-03 -1.48582021e-02  2.85529792e-02
  4.60991971e-02  3.81070189e-02  6.59204507e-03  1.75376944e-02
  2.55253240e-02 -2.19465848e-02  1.11179547e-02  5.81101477e-02
  4.14614677e-02 -1.50788

In [64]:
for category, examples in csv.groupby('category'):
    centroid = np.mean(embed(examples['input']).numpy(), axis=0)

    payload = {
        'category': category,
        'centroid': centroid.tolist(),
    }
    print(payload)

{'category': 'greeting', 'centroid': [-0.02303042821586132, -0.051182493567466736, 0.03985162079334259, 0.016759278252720833, 0.05438179895281792, 0.023693719878792763, 0.04432399570941925, -0.052248042076826096, 0.04354967549443245, 0.051160961389541626, 0.05150667950510979, 0.03905132785439491, -0.00537077384069562, 0.0678102895617485, 0.0036662223283201456, -0.042845387011766434, -0.026757003739476204, 0.009175085462629795, -0.005529090296477079, -0.06856225430965424, 0.01048365980386734, 0.018011832609772682, 0.000787182420026511, 0.02813769318163395, -0.06941694766283035, 0.0185550469905138, 0.031854018568992615, -0.030385509133338928, -0.004928800277411938, 0.010882516391575336, 0.04185536876320839, 0.028682425618171692, 0.013653063215315342, -0.018467718735337257, -0.03756960108876228, 0.026654765009880066, 0.004018859472125769, 0.01831715740263462, 0.029641948640346527, 0.004941578954458237, -0.004582787398248911, 0.024286242201924324, 0.01796800084412098, -0.02161264978349209,

In [101]:
def classify(text):
    embeddings = embed([text])[0].numpy().tolist()

    query = {
        'from': 0,
        'size': 1,
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': "cosineSimilarity(params.query_vector, doc['centroid'])",
                    'params': {
                        'query_vector': embeddings
                    }
                }
            }
        }
    }

    results = elastic.search(index='classes', body=query)
    result = results['hits']['hits'][0]
    
    return result['_score'], result['_source']['category']

In [106]:
classify('where would i inquire about incorporation?')

(0.65920156, 'question')