# Intent classification

In [161]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import spacy

In [162]:
nlp = spacy.load('en_core_web_lg')

In [199]:
# df_train=pd.read_table('./datasets/KL/AskUbuntu/train.csv', header=None)
# df_test=pd.read_table('./datasets/KL/AskUbuntu/test.csv', header=None)

df_train=pd.read_table('./datasets/KL/WebApplication/train.csv', header=None)
df_test=pd.read_table('./datasets/KL/WebApplication/test.csv', header=None)

In [200]:
df_train[1].value_counts()
df_test[1].value_counts()

Find Alternative    16
Filter Spam         14
Delete Account      10
Change Password      6
Sync Accounts        6
None                 4
Export Data          3
Name: 1, dtype: int64

In [201]:
vectorizer = CountVectorizer(analyzer = "word", strip_accents=None, tokenizer = None, \
                             preprocessor = None, stop_words = None, max_features = 5000, ngram_range=(1, 3)) 
data_features_train = vectorizer.fit_transform(df_train[0])
tfidfier = TfidfTransformer()
tfidf_train = tfidfier.fit_transform(data_features_train)
data_features_test = vectorizer.transform(df_test[0])
tfidf_test=tfidfier.transform(data_features_test)

In [202]:
cls=LogisticRegression()
cls.fit(tfidf_train, df_train[1])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [203]:
print(f1_score(df_test[1], cls.predict(tfidf_test), average='micro'), 
     precision_score(df_test[1], cls.predict(tfidf_test), average='micro'), 
     recall_score(df_test[1], cls.predict(tfidf_test), average='micro'))

0.6101694915254238 0.6101694915254238 0.6101694915254238


In [128]:
nlp.add_pipe(nlp.create_pipe('textcat', exclusive_classes=True))

TypeError: create_pipe() got an unexpected keyword argument 'exclusive_classes'

In [129]:
nlp('hello', [1])

hello

In [130]:
(0.93+.84+.61)/3

0.7933333333333333

In [168]:
cls=SVC()
cls.fit(vector_train, df_train[1])
print(f1_score(df_test[1], cls.predict(vector_test), average=None), 
     precision_score(df_test[1], cls.predict(vector_test), average='micro'), 
     recall_score(df_test[1], cls.predict(vector_test), average='micro'))

  'precision', 'predicted', average, warn_for)


[0.        0.0754717 0.        0.        0.1875    0.        0.       ] 0.0847457627118644 0.0847457627118644


# Method 2: classification by cosine distance

In [204]:
from sklearn.metrics.pairwise import cosine_similarity

df_train[2]=df_train.apply(lambda x: nlp(x[0]), axis=1)
df_test[2]=df_test.apply(lambda x: nlp(x[0]), axis=1)
vector_train=np.empty([len(df_train), 300])
for i in range(len(df_train)):
    vector_train[i, :]=df_train.iloc[i, 2].vector
vector_test=np.empty([len(df_test), 300])
for ii in range(len(df_test)):
    vector_test[ii, :]=df_test.iloc[ii, 2].vector

In [205]:
df_train

Unnamed: 0,0,1,2
0,How do I download a YouTube video?,Download Video,"(How, do, I, download, a, YouTube, video, ?)"
1,How do I change my password on TV Tropes?,Change Password,"(How, do, I, change, my, password, on, TV, Tro..."
2,Why can't I change my password and login with ...,Change Password,"(Why, ca, n't, I, change, my, password, and, l..."
3,Change subject line in new Gmail compose window,,"(Change, subject, line, in, new, Gmail, compos..."
4,Email Google Form daily?,,"(Email, Google, Form, daily, ?)"
5,How can I export track.scrobble data from last...,Export Data,"(How, can, I, export, track.scrobble, data, fr..."
6,How can I backup my wordpress.com hosted blog?,Export Data,"(How, can, I, backup, my, wordpress.com, hoste..."
7,How can I sync my Yahoo! Calendar with Google ...,Sync Accounts,"(How, can, I, sync, my, Yahoo, !, Calendar, wi..."
8,Google Bookmarks and Chrome Bookmark Sync -- D...,Sync Accounts,"(Google, Bookmarks, and, Chrome, Bookmark, Syn..."
9,How do I sync my RunKeeper workout schedule wi...,Sync Accounts,"(How, do, I, sync, my, RunKeeper, workout, sch..."


In [206]:
classes=list(df_train[1].unique())
def func(row):
#     print(row[2])
    allthings=row.apply(lambda y: y[2].vector)
    return allthings.mean()
df_meanvects = df_train.groupby(by=1).apply(lambda x: func(x[2]))

In [207]:
for ii, value in df_meanvects.items():
    print(ii)

Change Password
Delete Account
Download Video
Export Data
Filter Spam
Find Alternative
None
Sync Accounts


In [208]:
def predict(row):
    prediction=''
    distance=0
    for item, value in df_meanvects.items():
        current_dist=cosine_similarity(value.reshape(1, -1), row[2].vector.reshape(1, -1))
        if current_dist>distance:
            prediction=item
            distance=current_dist
    return prediction
df_test[3]=df_test.apply(lambda x: predict(x), axis=1)

In [215]:
f1_score(df_test[1], df_test[3], average='micro')

0.2542372881355932

In [217]:
(.93+.84+.61)/3

0.7933333333333333