In [12]:
# import libraries
# import packages

import os
import re 
import sys
import pickle
import numpy as np
import pandas as pd
import pkg_resources
import itertools, pickle
from textblob import Word
from sklearn.svm import SVC
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from sklearn import preprocessing
from keras.models import load_model
from symspellpy import SymSpell, Verbosity
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from prepare_training_data import get_numerical_test_data, get_training_sentences

%matplotlib inline
# Ignore all warnings. Note: if you are using this package for self-education, it is recommended to leave warnings visible.
import warnings
warnings.filterwarnings('ignore')

### Spell correction
Spell correction is performed using the SymSpell library (symspellpy) in Python. It is a crucial part of this project because typos and slang are very common in text messages. These can confuse the model, hence we attempt to correct the spelling.
NLTK library is then used to:
- make all letters lowercase
- remove stop words using the NLTK English stop-word list
- lemmatisation
- correct letter repetitions
- remove the top 10,000 rarest words appearing in the data

In [16]:
# load data
# NOTE(review): absolute, machine-specific path; point DATA_PATH at your local
# copy of text_emotion.csv before running.
DATA_PATH = 'M:/Insight/project/emotion_detection_sentiment140/text_emotion.csv'

# Collapse the fine-grained labels into two classes ('sadness' / 'happiness').
# Labels that are not listed here are left unchanged.
# NOTE(review): 'neutral' is folded into 'sadness' -- confirm this is intended.
SENTIMENT_MERGE_MAP = {
    'empty': 'sadness',
    'anger': 'sadness',
    'boredom': 'sadness',
    'hate': 'sadness',
    'worry': 'sadness',
    'neutral': 'sadness',
    'enthusiasm': 'happiness',
    'fun': 'happiness',
    'relief': 'happiness',
    'surprise': 'happiness',
    'love': 'happiness',
}

data = pd.read_csv(DATA_PATH)

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form modifies a temporary object under
# pandas Copy-on-Write and would leave `data` untouched.
data['sentiment'] = data['sentiment'].replace(SENTIMENT_MERGE_MAP)

In [57]:
# preview the first rows only; displaying the whole frame floods the output
data.head()

NameError: name 'data' is not defined

In [None]:
# Spell correction is based on the SymSpell Python library, which looks up
# candidate corrections within a maximum edit distance of the input term
# (2 in this work, see max_dictionary_edit_distance below).

# frequency dictionaries bundled with the symspellpy package
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
# NOTE(review): bigram_path is resolved here but never loaded in this cell.
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")

# setup max edit distance
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# term_index is the column of the term and count_index is the column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

# uncomment and run the following for an example.
# input_term = ("whau you do is coool")
# result = sym_spell.word_segmentation(input_term)
# corrected_sentence = result.corrected_string
# corrected_sentence

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


# make all letters lowercase (the split/join also collapses runs of whitespace)
data['content'] = data['content'].apply(lambda text: " ".join(tok.lower() for tok in text.split()))

# remove punctuation and symbols: every character that is neither a word
# character nor whitespace becomes a space. regex=True is required because
# Series.str.replace treats the pattern as a literal string by default in
# pandas >= 2.0; the raw string keeps \w and \s from being read as (invalid)
# string escapes.
data['content'] = data['content'].str.replace(r'[^\w\s]', ' ', regex=True)

# remove stop words; `stop` stays a list as before, the set is only used for
# fast membership tests
stop = stopwords.words('english')
stop_set = set(stop)
data['content'] = data['content'].apply(lambda text: " ".join(tok for tok in text.split() if tok not in stop_set))

# lemmatisation (TextBlob Word.lemmatize applied token by token)
data['content'] = data['content'].apply(lambda text: " ".join([Word(tok).lemmatize() for tok in text.split()]))

# correct letter repetitions
def de_repeat(text):
    """Shorten every run of three or more identical characters to exactly two.

    Runs of one or two characters are returned unchanged, e.g.
    "coooool" -> "cool" and "good" -> "good".
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

data['content'] = data['content'].apply(lambda text: " ".join(de_repeat(tok) for tok in text.split()))

# find the 10,000 rarest words appearing in the data: value_counts() sorts by
# descending frequency, so the last 10,000 entries are the least frequent
freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]

# remove all those rarely appearing words from the data
freq = list(freq.index)
# A set makes each membership test constant-time; checking every token against
# the 10,000-element list was a linear scan per token.
rare_words = set(freq)
data['content'] = data['content'].apply(lambda text: " ".join(tok for tok in text.split() if tok not in rare_words))

In [2]:
def read_file(file_name, encoding=None):
    """Read labelled sentences from a text file.

    Every non-blank line is split at its first "]": the characters between
    the first character of the line and that "]" form the label, the rest of
    the line forms the text. Blank lines are skipped. Lines are not validated
    beyond that, so a line without "]" still produces an entry.

    Args:
        file_name: path of the file to read.
        encoding: text encoding handed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list of ``[label, text]`` pairs in file order. Whitespace inside the
        label is collapsed to single spaces; the text is stripped.
    """
    data_list = []
    with open(file_name, 'r', encoding=encoding) as f:
        for line in f:
            line = line.strip()
            if not line:
                # a blank line holds no sample; do not emit an empty ['', ''] row
                continue
            close = line.find("]")
            label = ' '.join(line[1:close].strip().split())
            text = line[close + 1:].strip()
            data_list.append([label, text])
    return data_list
# parse the labelled sentences file into a list of [label, text] pairs
# (relative path: resolved against the current working directory)
file_name = "sentences.txt"
psychExp_txt = read_file(file_name)

In [3]:
import re 
from collections import Counter

def ngram(token, n):
    """Return every window of ``n`` consecutive tokens as a space-joined string.

    ``token`` is a sequence of strings. The windows are returned in order of
    their starting position; the result is empty when ``token`` holds fewer
    than ``n`` items.
    """
    window_count = len(token) - n + 1
    return [' '.join(token[start:start + n]) for start in range(window_count)]


def create_feature(text, nrange=(1, 1)):
    """Turn a text into a bag of n-gram and punctuation counts.

    The text is lowercased, then two token streams are extracted:

    * word tokens: every character outside ``a-z``, ``0-9`` and ``#`` acts as
      a separator ('#' is kept because tweets contain hashtags); n-grams of
      each length from ``nrange[0]`` to ``nrange[1]`` inclusive are collected;
    * punctuation tokens: every ``a-z`` / ``0-9`` character acts as a
      separator and the remaining chunks are collected as unigrams.

    Returns:
        A ``Counter`` mapping each collected feature to the number of times
        it occurs.
    """
    lowered = text.lower()
    shortest, longest = nrange[0], nrange[1]

    word_tokens = re.sub('[^a-z0-9#]', ' ', lowered).split()
    punct_tokens = re.sub('[a-z0-9]', ' ', lowered).split()

    counts = Counter()
    for size in range(shortest, longest + 1):
        counts.update(ngram(word_tokens, size))
    counts.update(ngram(punct_tokens, 1))
    return counts

In [135]:
def convert_label(item, name):
    """Translate a whitespace-separated indicator string into label names.

    ``item`` is parsed into floats; for every position whose value equals 1
    the entry of ``name`` at the same position is selected.

    Returns:
        The selected names joined by single spaces, or "" when no position
        equals 1.
    """
    flags = [float(part) for part in item.split()]
    chosen = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return " ".join(chosen).strip()

# label names; position i corresponds to position i of the indicator vector
# that convert_label parses
emotions = ["happiness", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]

# parallel lists: X_all holds one feature Counter (1- to 4-grams) per sentence,
# y_all holds the matching label string
X_all = []
y_all = []
for label, text in psychExp_txt:
    y_all.append(convert_label(label, emotions))
    X_all.append(create_feature(text, nrange=(1, 4)))

In [7]:
from sklearn.model_selection import train_test_split
# hold out 20% of the samples for testing; random_state is fixed so the split
# is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state=101)

In [13]:
from sklearn.feature_extraction import DictVectorizer
# turn the per-sentence feature Counters into a sparse matrix; the vocabulary
# is learned from the training split only and reused for the test split
vectorizer = DictVectorizer(sparse = True)
# NOTE(review): X_train / X_test are overwritten with their vectorized form, so
# re-running this cell without re-running the split feeds the already
# vectorized matrices back into the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [68]:
forest_clf = RandomForestClassifier(max_depth = 100, n_estimators=450, max_leaf_nodes=200, n_jobs=-1, random_state=101)
# Fit and score inline rather than through train_test(): that helper is only
# defined in a later cell, so calling it here raises NameError when the
# notebook is run top to bottom on a fresh kernel.
forest_clf.fit(X_train, y_train)
train_acc = accuracy_score(y_train, forest_clf.predict(X_train))
test_acc = accuracy_score(y_test, forest_clf.predict(X_test))

In [69]:
train_acc

0.8018048128342246

In [70]:
test_acc

0.5521390374331551

In [66]:
forest_clf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=200, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=450,
                       n_jobs=-1, oob_score=False, random_state=101, verbose=0,
                       warm_start=False)

In [75]:
# grid search
from sklearn.model_selection import GridSearchCV

# only max_depth is varied; the other two hyper-parameters are pinned to a
# single value each, giving 4 candidates evaluated with 3-fold cross-validation
param_grid = dict(
    n_estimators=[450],
    max_leaf_nodes=[200],
    max_depth=[20, 60, 100, 200],
)
grid_search = GridSearchCV(estimator=forest_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:   47.2s remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:  1.2min remaining:   13.9s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.2min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=100,
                                              max_features='auto',
                                              max_leaf_nodes=200,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=450, n_jobs=-1,
                                              oob_score=False, random_state=101,
                                     

In [76]:
grid_search.best_params_

{'max_depth': 200, 'max_leaf_nodes': 200, 'n_estimators': 450}

In [78]:
grid_search.best_score_

0.5656740480422052

In [79]:
# predict labels for the held-out test matrix with the fitted grid search
y_pred_ = grid_search.predict(X_test)


In [80]:
# accuracy of the grid-search predictions on the held-out test labels
accuracy_score(y_test, y_pred_)

0.5614973262032086

In [33]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

# class-probability estimates for the training samples from 3-fold
# cross-validation (method="predict_proba" instead of hard predictions)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train, cv=3,
                                    method="predict_proba")

In [26]:
# ROC comparison plot: one dotted blue curve labelled "SGD" plus a curve
# labelled "Random Forest".
# NOTE(review): fpr, tpr, fpr_forest, tpr_forest, plot_roc_curve and save_fig
# are not defined anywhere in the visible part of this notebook -- confirm
# they exist before relying on this cell.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()

array([0., 0., 0., ..., 0., 0., 0.])

In [19]:
from sklearn.metrics import accuracy_score
def train_test(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and score it on both splits.

    The classifier is fitted in place on ``(X_train, y_train)``; accuracy is
    then computed on the training data first and on the test data second.

    Returns:
        A tuple ``(train_acc, test_acc)``.
    """
    clf.fit(X_train, y_train)
    splits = ((X_train, y_train), (X_test, y_test))
    train_acc, test_acc = (
        accuracy_score(target, clf.predict(features)) for features, target in splits
    )
    return train_acc, test_acc