In this notebook, we will explore various traditional models.

First we compute TF-IDF features from the text and use them to fit Logistic Regression, a linear SVM, a Random Forest, and a Decision Tree.

All of these models are trained and evaluated as multi-label classifiers (one 0/1 target column per emotion).


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [3]:
# Read the three pre-split, wide-format CSV files (train / validation / test)
# in one pass; the order of the paths matches the order of the target names.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        '../reformat_data/train_wide.csv',
        '../reformat_data/val_wide.csv',
        '../reformat_data/test_wide.csv',
    ),
)
# Last expression of the cell: preview the first rows of the training frame.
train_df.head()

Unnamed: 0,Text,Classes,ID,Labels,admiration,amusement,anger,annoyance,approval,caring,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,My favourite food is anything I didn't have to...,27,eebbqej,['neutral'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,['neutral'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,['anger'],0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,To make her feel threatened,14,ed7ypvh,['fear'],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Dirty Southern Wankers,3,ed0bdzj,['annoyance'],0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Build the multi-label target matrices.
# The emotion indicator columns are selected by *name* ('admiration' through
# 'neutral', both inclusive with .loc) instead of by a positional offset.
# A positional slice such as iloc[:, 4:] silently shifts if the number of
# leading metadata columns changes (e.g. an extra 'Unnamed: 0' index column),
# which could pull the string-valued 'Labels' column into the targets.
y_train = train_df.loc[:, 'admiration':'neutral']
y_val = val_df.loc[:, 'admiration':'neutral']
y_test = test_df.loc[:, 'admiration':'neutral']
# Sanity check: each split should report the same number of label columns.
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(43410, 28)
(5426, 28)
(5427, 28)


### Do TfIdf transformation

In [5]:
# Learn the TF-IDF vocabulary and weights on the training text only
# (English stop words removed), and vectorise the training split.
tfidfVec = TfidfVectorizer(stop_words='english')
X_train = tfidfVec.fit_transform(train_df['Text'])

In [8]:
# Re-use the vectoriser fitted on the training text for the other two splits
# (transform only, no refitting).
X_val, X_test = (
    tfidfVec.transform(frame['Text']) for frame in (val_df, test_df)
)

In [9]:
# One shape per line: train, validation, test.
print(X_train.shape, X_val.shape, X_test.shape, sep='\n')

(43410, 26080)
(5426, 26080)
(5427, 26080)


### Define metric

In [32]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
def multi_label_metrics(pred, y_true):
    """Compute, print and return multi-label evaluation scores.

    Parameters
    ----------
    pred : array-like
        Predicted label indicators. Forwarded to scikit-learn as ``y_pred``
        (F1, accuracy) and as the score argument of ``roc_auc_score``.
        The cells in this notebook pass the hard output of ``.predict()``,
        so ``roc_auc`` is computed from those predictions rather than from
        probabilities or decision scores.
    y_true : array-like
        Ground-truth label indicators, aligned with ``pred``.

    Returns
    -------
    dict
        ``{'f1': ..., 'roc_auc': ..., 'accuracy': ...}`` where ``f1`` and
        ``roc_auc`` use ``average='micro'``. ``accuracy`` is whatever
        ``accuracy_score`` returns for the inputs (for multi-label input
        scikit-learn documents this as subset / exact-match accuracy).

    Notes
    -----
    Note the argument order: predictions first, ground truth second.
    The scores are also printed, one per line, with three decimals.
    """
    metrics = {
        'f1': f1_score(y_true=y_true, y_pred=pred, average='micro'),
        'roc_auc': roc_auc_score(y_true, pred, average='micro'),
        'accuracy': accuracy_score(y_true, pred),
    }
    print('The micro averaged scores are')
    # A plain loop: printing is a side effect, so no throwaway list is built.
    for name, value in metrics.items():
        print(name, format(value, '.3f'))
    return metrics


### Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
# Multi-label logistic regression: MultiOutputClassifier wraps a default
# LogisticRegression and is fitted against the full label matrix.
lr_clf = MultiOutputClassifier(LogisticRegression())
lr_clf.fit(X_train, y_train)

In [55]:
# Score logistic regression on the held-out *test* split.
# The predictions were previously stored only under the misleading name
# `val_pred` even though they come from X_test; they now get an accurate name.
test_pred = lr_clf.predict(X_test)
# Legacy alias so any cell that still reads `val_pred` keeps working.
val_pred = test_pred
lr_scores = multi_label_metrics(test_pred, y_test)

The micro averaged scores are
f1 0.412
roc_auc 0.643
accuracy 0.276


### SVM
Because the dataset is large, we only fit a linear SVM (`LinearSVC`).

In [34]:
from sklearn.svm import LinearSVC
# Multi-label linear SVM: MultiOutputClassifier wraps a default LinearSVC
# and is fitted against the full label matrix.
svm_clf = MultiOutputClassifier(LinearSVC())
svm_clf.fit(X_train, y_train)



In [54]:
# Score the linear SVM on the test split (predictions first, truth second).
svm_scores = multi_label_metrics(
    pred=svm_clf.predict(X_test),
    y_true=y_test,
)

The micro averaged scores are
f1 0.461
roc_auc 0.675
accuracy 0.324


### Random Forest

In [51]:
%%time
from sklearn.ensemble import RandomForestClassifier
rf_clf = MultiOutputClassifier(estimator = RandomForestClassifier(n_estimators = 100, max_depth = 10))
rf_clf.fit(X_train,y_train)
rf_scores = multi_label_metrics(rf_clf.predict(X_val), y_val)


The micro averaged scores are
f1 0.000
roc_auc 0.500
accuracy 0.000
CPU times: total: 46.5 s
Wall time: 53.7 s


In [48]:
# Diagnostic: total of all predicted label indicators on the validation split.
# The recorded result of 0 means no label was predicted for any sample.
rf_clf.predict(X_val).sum()

0

### Decision Tree
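The random forest appears to always predict 0 (the sum of its predictions above is 0). Perhaps the output (the label matrix) is too sparse. Here we try a single decision tree with default settings instead.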

In [None]:
%%time
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train,y_train)

In [56]:
# Score the decision tree on the test split (predictions first, truth second).
tree_scores = multi_label_metrics(
    pred=tree_clf.predict(X_test),
    y_true=y_test,
)

The micro averaged scores are
f1 0.450
roc_auc 0.706
accuracy 0.385
