## Intent Classification

### Preprocessing:
    1. Remove "'s" and non alphabets.
    2. Convert from capital to lower.
    2. Stem using SnowBallStemmer.
    
### Feature Extraction
    1. CountVectorizer using top 50 features

### Model Selection and Evaluation:
    1. Cross Validation CV with 5 folds. Tried with SVM, MultinomialNB, MLPClassifier, DecisionTreeClassifier, and 
       RandomForestClassifier
    2. Use Precision, Recall, FScore

### Hyperparameter Optimization:
    1. RandomizedSearchCV (Because Grid Search is too costly)

### Result:
__The best model trained was 98.5% accurate__

In [None]:
import numpy as np
import pandas as pd
import nltk
import re

from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import (train_test_split,
                                    cross_val_score)
from sklearn.metrics import (confusion_matrix, 
                             precision_recall_fscore_support, 
                             f1_score, accuracy_score, recall_score,
                             precision_score)

from sklearn.metrics import pre
nltk.download("punkt")

## Preprocessing

In [3]:
with open("train.txt") as f:
    text = f.read()
text = text.split("\n")
corpus = []
labels = []
for datum in text:
    text_line = datum.split(",,,")
    if len(text_line) != 2:
        continue
    corpus.append(text_line[0].strip())
    labels.append(text_line[1].strip())
corpus_dict = {"Text": corpus, "Label": labels}
df = pd.DataFrame(corpus_dict)
print(df.head())

     Label                                               Text
0  unknown  how did serfdom develop in and then leave russ...
1     what   what films featured the character popeye doyle ?
2  unknown  how can i find a list of celebrities ' real na...
3     what  what fowl grabs the spotlight after the chines...
4     what                    what is the full form of .com ?


In [4]:
df["Label"].unique()

array(['unknown', 'what', 'when', 'who', 'affirmation'], dtype=object)

In [5]:
stemmer = SnowballStemmer("english")

def stem_filter(row):
    row = re.sub(r"'s'", "", row)
    row = re.sub("[^a-zA-Z ]", "", row)
    words = word_tokenize(row)
    stemmed = []
    for word in words:
        stemmed.append(stemmer.stem(word))
    filtered_line = " ".join(stemmed[:4])
    return filtered_line

df["Text"] = df["Text"].apply(stem_filter)

## Training

In [6]:
cv = CountVectorizer(max_features=50)
X = cv.fit_transform(df["Text"])
y = df["Label"]

In [8]:
clf = MLPClassifier()
print(cross_val_score(clf, X, y, cv=5))

[ 0.97993311  0.97651007  0.97297297  0.94594595  0.98979592]


## Hyperparameter Optimization

WARNING: Do not execute this cell for it will take a long time to complete. This model was trained on Google Colaboratory. 


In [None]:
from sklearn.model_selection import RandomizedSearchCV


parameters = {
    'learning_rate': ["constant", "invscaling", "adaptive"],
    'hidden_layer_sizes': [(100, 100), (200, 200), (300, 300)],
    'alpha': np.linspace(0.001, 0.1, 10),
    'activation': ["logistic", "relu", "tanh"]
}

clf = RandomizedSearchCV(estimator=MLPClassifier(),
                         param_distributions=parameters, cv=7, verbose=2, 
                         n_jobs=-1, n_iter=50)
clf.fit(X, y)
print("Best Model: {}".format(clf.best_estimator_))
print("Best Score: {}".format(clf.best_score_))
print("Best Params: {}".format(clf.best_params_))

```
Best Model: MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
Best Score: 0.9850505731625084
Best Params: {'learning_rate': 'invscaling', 'hidden_layer_sizes': (100, 100), 'alpha': 0.1, 'activation': 'relu'}
```

## Model Evaluation

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    stratify=y)
model = MLPClassifier(learning_rate="invscaling", 
                      hidden_layer_sizes=(100, 100), 
                      alpha=0.1, activation="relu")
model.fit(X_train, y_train)


MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [26]:
labels = model.classes_
predictions = model.predict(X_test)
f1score = list(f1_score(y_test, predictions, average=None))
fscore = pd.Series(f1score, index=labels)
print("Fscore  of Individual class:")
print(fscore)
print("Accuracy : {}".format(accuracy_score(y_test, predictions)))
print("Recall : {}".format(recall_score(y_test,predictions, average="weighted")))
print("Precision : {}".format(precision_score(y_test,predictions, average="weighted")))
# confusion matrix
pred_labels = ['Predicted '+ l for l in labels]
cm = confusion_matrix(y_test, predictions)
cm = pd.DataFrame(cm, index=labels, columns=pred_labels)
cm['Actual Total'] = cm.sum(axis=1)
cm.loc['Predicted Total'] = cm.sum()
cm

Fscore  of Individual class
affirmation    0.952381
unknown        0.962264
what           0.983607
when           0.950000
who            1.000000
dtype: float64
Accuracy :  0.979797979798
Recall :  0.979797979798
Precision :  0.980309813643


Unnamed: 0,Predicted affirmation,Predicted unknown,Predicted what,Predicted when,Predicted who,Actual Total
affirmation,20,1,0,0,0,21
unknown,0,51,2,1,0,54
what,1,0,120,1,0,122
when,0,0,0,19,0,19
who,0,0,0,0,81,81
Predicted Total,21,52,122,21,81,297


## Creating final model


In [27]:
final_model = MLPClassifier(learning_rate="invscaling", 
                            hidden_layer_sizes=(100, 100), 
                            alpha=0.1, activation="relu")
final_model.fit(X, y)

MLPClassifier(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [28]:
def predict_sentence(sentence):
    filtered = stem_filter(sentence)
    vector = cv.transform([filtered])
    probabilities = final_model.predict_proba(vector)
    classes = final_model.classes_
    y_pred = np.max(probabilities)
    if y_pred < 0.6:
        pred = "unknown"
    else:
        pred = classes[np.argmax(probabilities)]
    print("Predicted class: \"{0}\" with confidence {1:.2f} %"
              .format(pred, y_pred * 100))

## Test with your own sentence
Substitute your sentence in the `input_sentence` field below

In [29]:
# Input your sentence below
input_sentence = "Will I be hired? :) "
predict_sentence(input_sentence)

Predicted class: "affirmation" with confidence 98.47 %
