In [2]:
import json
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import *

In [39]:
train_tw_all = []
train_tw_id = []
train_tw_text = []
train_tw_label = []
with open('data/train/train.txt') as f:
    for line in f:
        split_text = line.strip().split('\t')
        train_tw_all.append({
            'id': split_text[0],
            'text': split_text[2],
            'label': split_text[1]
        })
        train_tw_id.append(split_text[0])
        train_tw_label.append(split_text[1])
        train_tw_text.append(split_text[2])

In [38]:
pd.set_option('max_colwidth',150)
train_tw_all_pd = pd.DataFrame(train_tw_all)

In [40]:
# Store csv file into dataframe. This csv files are converted results from arff file
train_features = pd.read_csv('data/train/train.csv')
dev_features = pd.read_csv('data/dev/dev.csv')

In [41]:
# Extract label and modify dataframe (remove id and class)

# Training data
train_features_label = train_features['class']
train_features.drop('class', axis = 1, inplace = True)
train_features.drop('id', axis = 1, inplace = True)

In [42]:
train_features_label.value_counts()

N    2793
Y     373
Name: class, dtype: int64

## Gaussian Bayes

In [43]:
# Initialise with training data
clf = GaussianNB()
clf.fit(train_features, train_features_label)

GaussianNB(priors=None)

In [44]:
# Extract data without tweet id and class
dev_for_prediction = dev_features[dev_features.columns[1:-1]]
# Extract only class from dev data
dev_class = dev_features[dev_features.columns[-1]]

In [49]:
# Feed dev data into algorithm for evaluation
dev_pred_GNB = clf.predict(dev_for_prediction)

In [46]:
# Confusion matrix for GaussianBayes
GaussianCM = confusion_matrix(dev_class, dev_pred_GNB, labels = ["Y","N"])
GaussianCM

array([[ 39,  75],
       [ 50, 912]])

In [50]:
pd.Series(dev_pred_GNB).value_counts()

N    987
Y     89
dtype: int64

In [53]:
print(classification_report(dev_class,dev_pred_GNB, labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.44      0.34      0.38       114
          N       0.92      0.95      0.94       962

avg / total       0.87      0.88      0.88      1076



In [54]:
accuracy_score(dev_class,dev_pred_GNB)

0.88382899628252787

## Multinomial Bayes

In [55]:
clfMulti = MultinomialNB()
clfMulti.fit(train_features, train_features_label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [56]:
dev_pred_MNB = clfMulti.predict(dev_for_prediction)

In [57]:
MultiNBCM = confusion_matrix(dev_class,dev_pred_MNB, labels = ["Y","N"])
MultiNBCM

array([[ 31,  83],
       [ 26, 936]])

In [58]:
pd.Series(dev_pred_MNB).value_counts()

N    1019
Y      57
dtype: int64

In [59]:
pd.Series(dev_class).value_counts()

N    962
Y    114
Name: class, dtype: int64

In [60]:
print(classification_report(dev_class,dev_pred_MNB, labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.54      0.27      0.36       114
          N       0.92      0.97      0.94       962

avg / total       0.88      0.90      0.88      1076



In [61]:
accuracy_score(dev_class,dev_pred_MNB)

0.89869888475836435

## Decision Tree

In [62]:
from sklearn import tree

In [63]:
# Default Tree setup
tree_clf = tree.DecisionTreeClassifier()
tree_clf.fit(train_features, train_features_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [64]:
# check for overfitting by predicting on itself and see how it does with development
train_prediction_descTree = tree_clf.predict(train_features)

In [65]:
print(classification_report(train_features_label ,train_prediction_descTree,labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.99      0.82      0.90       373
          N       0.98      1.00      0.99      2793

avg / total       0.98      0.98      0.98      3166



In [66]:
dev_prediction_descTree = tree_clf.predict(dev_for_prediction)

In [67]:
confusion_matrix(dev_class, dev_prediction_descTree,labels = ["Y","N"])

array([[ 31,  83],
       [ 57, 905]])

In [68]:
print(classification_report(dev_class ,dev_prediction_descTree,labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.35      0.27      0.31       114
          N       0.92      0.94      0.93       962

avg / total       0.86      0.87      0.86      1076



In [69]:
accuracy_score(dev_class, dev_prediction_descTree)

0.86988847583643125

In [202]:
# Set maximum depth
tree_clf_md = tree.DecisionTreeClassifier(max_depth = 35)
tree_clf_md.fit(train_features, train_features_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=35,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [203]:
dev_prediction_mdTree = tree_clf_md.predict(dev_for_prediction)

In [204]:
confusion_matrix(dev_class, dev_prediction_mdTree,labels = ["Y","N"])

array([[ 29,  85],
       [ 50, 912]])

In [205]:
print(classification_report(dev_class ,dev_prediction_mdTree,labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.37      0.25      0.30       114
          N       0.91      0.95      0.93       962

avg / total       0.86      0.87      0.86      1076



In [206]:
accuracy_score(dev_class, dev_prediction_mdTree)

0.87453531598513012

In [207]:
# Export Decision Trees
with open("tree.dot", 'w') as f:
    f = tree.export_graphviz(tree_clf_md, out_file=f)

## Random Forest

In [197]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth=35)
rf_clf.fit(train_features, train_features_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=35, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [198]:
dev_prediction_rf = rf_clf.predict(dev_for_prediction)

In [199]:
confusion_matrix(dev_class, dev_prediction_rf, labels = ["Y","N"])

array([[ 18,  96],
       [ 14, 948]])

In [200]:
print(classification_report(dev_class,dev_prediction_rf,labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.56      0.16      0.25       114
          N       0.91      0.99      0.95       962

avg / total       0.87      0.90      0.87      1076



## Support Vector Machine

In [208]:
from sklearn import svm
svm_clf = svm.LinearSVC()
svm_clf.fit(train_features,train_features_label)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [209]:
dev_prediction_svm = svm_clf.predict(dev_for_prediction)

In [210]:
confusion_matrix(dev_class, dev_prediction_svm,labels = ["Y","N"])

array([[ 22,  92],
       [ 19, 943]])

In [212]:
print(classification_report(dev_class,dev_prediction_svm,labels = ["Y","N"]))

             precision    recall  f1-score   support

          Y       0.54      0.19      0.28       114
          N       0.91      0.98      0.94       962

avg / total       0.87      0.90      0.87      1076



In [213]:
accuracy_score(dev_class,dev_prediction_svm)

0.89684014869888473

In [None]:
svmClfRBF = svm.SVC()
svmClfRBF.fit(twtDf,sentiment)