In [118]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
import scipy as sp
from sklearn.model_selection import train_test_split
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
stop_words = set(stopwords.words('english'))

# Base Run

In [3]:
all_data = pd.read_csv("./data/Emotion_final.csv")
andbrain_data =pd.read_csv("./data/Andbrain_DataSet.csv")

In [4]:
shuffled_data = all_data.sample(frac=1).reset_index(drop=True)

In [68]:
y = shuffled_data["label"]
x = shuffled_data

x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=42)

y_train = y_train.to_frame()
y_test = y_test.to_frame()

In [168]:
algorithms = [LogisticRegression()]

In [135]:
count_vectorizer=CountVectorizer(analyzer='word', ngram_range=(1, 1))
x = count_vectorizer.fit_transform(x_train["text"])
print(x.shape)

(15854, 16316)


In [71]:
def calculate_accuracy(predicted_data):
    count_equals = 0
    for idx, (_, row) in enumerate(x_test.iterrows()):
        if row["label"] == predicted_data[idx]:
            count_equals += 1
    print(f"accuracy={count_equals/len(predicted_data)}")

In [105]:
def get_confusion_matrix(predictions):
    matrix = np.zeros(25).reshape(5, 5)

    for a,b in zip(predictions, x_test.label):
        matrix[emotions[a]][emotions[b]] += 1

    print(matrix)

In [128]:
def get_sentences(true_label, predicted_label, num_of_results, predictions, test_collection):
  counter = 0
  for i, (a, b) in enumerate(zip (test_collection.label, predictions)):
    if counter == num_of_results:
      break
    if a == true_label and b == predicted_label:
      print(test_collection.iloc[[i]]["text"].values)
      counter += 1

In [132]:
def get_strongest_words(label, pipe, count_vectorizer):
    inverse_dict={count_vectorizer.vocabulary_[w]:w for w in count_vectorizer.vocabulary_.keys()}
    cur_coef=pipe["algo"].coef_[emotions[label]]
    word_df=pd.DataFrame({"val":cur_coef}).reset_index().sort_values(["val"],ascending=[False])
    word_df.loc[:, "word"]=word_df["index"].apply(lambda v:inverse_dict[v])
    print(word_df.head(10))

In [169]:
for algorithm_under_test in algorithms:
    pipe = Pipeline([('vectorizer', count_vectorizer),
                    ('algo', algorithm_under_test)])
    pipe.fit(x_train["text"], y_train["label"])
    predicted = pipe.predict(x_test["text"])
    calculate_accuracy(predicted)
    try:
        get_strongest_words('happy', pipe, count_vectorizer)
    except AttributeError:
        pass
    get_confusion_matrix(predicted)
    get_sentences('happy', 'sadness', 3, predicted, x_test)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy=0.9101917255297679
       index       val         word
12412  12412  2.745021    satisfied
7384    7384  2.695988     innocent
11971  11971  2.630906     resolved
13995  13995  2.623312     superior
11980  11980  2.610937    respected
4532    4532  2.571322     ecstatic
14382  14382  2.452262     terrific
560      560  2.421652       amused
12970  12970  2.394609      sincere
7566    7566  2.371352  invigorated
[[ 524.   25.    8.   28.    3.]
 [  25.  418.   10.   12.   24.]
 [  21.   14. 1377.   31.   16.]
 [  28.   37.   36. 1160.    9.]
 [   3.   19.    4.    3.  129.]]
['im feeling rather inspired yet low i will enjoy my writing and even though i may be writing about morbid things i will find a way to make it interesting to read']
['im sitting on the couch thinking about how miserable i feel from indulging in too much delicious food']
['i been left alone this is how i feel a kind of sweet song for me but the official video clip for this song is quite annoying']


# Data Processing

In [10]:
train_data_list, cur_sent = list(), list()
with open('./data/wackypedia_en1.words10.20Mwords') as f:
 for line in f:
    line = line.strip()
    if line == '</s>':
        train_data_list.append(cur_sent)
        cur_sent = list()
    elif line != '<s>' and not line.startswith('<text') and not line.startswith('</text'):
        cur_sent.append(line.split('\t')[0])

In [11]:
# Builds the models

# model_big_ten = gensim.models.Word2Vec(
#     train_data_list, min_count=5, window=1, vector_size=10)
# model_small_ten = gensim.models.Word2Vec(
#     train_data_list, min_count=5, window=10, vector_size=10)

# model_big_five = gensim.models.Word2Vec(
#     train_data_list, min_count=5, window=1, vector_size=500)
model_small_five = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=10, vector_size=500)

models = [model_small_five]

In [12]:
def get_without_stopwords(sentence):
	word_tokens = word_tokenize(sentence)
 
	filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
	
	filtered_sentence = []
	
	for w in word_tokens:
		if w not in stop_words:
			filtered_sentence.append(w)
	return filtered_sentence

In [145]:
emotions = dict()
emotions["anger"] = 0
emotions["fear"] = 1
emotions["happy"] = 2
emotions["sadness"] = 3
emotions["surprise"] = 4

In [14]:
def get_word_score(word):
	scores = [0, 0, 0, 0, 0]
	similar_words = models[0].wv.most_similar(word)[:3]
	similar_words.append((word, 1.0))
	for word_candidate, correlation in similar_words:
		try:
			word_score_row = andbrain_data.loc[andbrain_data['word'] == word_candidate + " "]
			if not word_score_row.empty:
				columns = list(word_score_row)
				for column in columns:
					if column != 'word':
						scores[emotions[column]] += word_score_row[column].values[0]
		except KeyError as e:
			continue
	return scores


In [15]:
def is_all_emotions_zero(word_score):
	result = True
	for score in word_score:
		if score != 0:
			result = False
			break
	return result

In [16]:
def get_sentence_score(word_tokens):
	sentence_score = [0, 0, 0, 0, 0]
	for word in word_tokens:
		try:
			word_score = get_word_score(word)
			if not is_all_emotions_zero(word_score):
				sentence_score[word_score.index(max(word_score))] += 1
		except KeyError:
			continue
	return sentence_score


In [42]:
def get_score(raw_sentence):
	non_stopwords = get_without_stopwords(raw_sentence)
	sentence_score = np.array(get_sentence_score(non_stopwords))
	return sentence_score

In [61]:
def get_sentence_with_score_tokens(sentence, score):
	for emotion in emotions.keys():
		for i in range(score[emotions[emotion]]):
			sentence += " #" + emotion
	return sentence

In [64]:
def get_processed_data(data_to_process, title):
	processed_data = pd.DataFrame(columns=data_to_process.columns)
	for idx, (_, row) in enumerate(data_to_process.iterrows()):
		print(str(idx) + " OUT OF " + str(data_to_process.shape[0]))
		raw_sentence = row["text"]
		sentence_score = get_score(raw_sentence)
		sentence_with_score = get_sentence_with_score_tokens(raw_sentence, sentence_score)
		data_to_add = {'text': str(sentence_with_score), 'label': row["label"]}
		processed_data = processed_data.append(data_to_add, ignore_index=True)
	return processed_data

In [65]:
processed_train = get_processed_data(x_train, "TRAIN")
processed_test = get_processed_data(x_test, "TEST")

0 OUT OF 15854
1 OUT OF 15854
2 OUT OF 15854
3 OUT OF 15854
4 OUT OF 15854
5 OUT OF 15854
6 OUT OF 15854
7 OUT OF 15854
8 OUT OF 15854
9 OUT OF 15854
10 OUT OF 15854
11 OUT OF 15854
12 OUT OF 15854
13 OUT OF 15854
14 OUT OF 15854
15 OUT OF 15854
16 OUT OF 15854
17 OUT OF 15854
18 OUT OF 15854
19 OUT OF 15854
20 OUT OF 15854
21 OUT OF 15854
22 OUT OF 15854
23 OUT OF 15854
24 OUT OF 15854
25 OUT OF 15854
26 OUT OF 15854
27 OUT OF 15854
28 OUT OF 15854
29 OUT OF 15854
30 OUT OF 15854
31 OUT OF 15854
32 OUT OF 15854
33 OUT OF 15854
34 OUT OF 15854
35 OUT OF 15854
36 OUT OF 15854
37 OUT OF 15854
38 OUT OF 15854
39 OUT OF 15854
40 OUT OF 15854
41 OUT OF 15854
42 OUT OF 15854
43 OUT OF 15854
44 OUT OF 15854
45 OUT OF 15854
46 OUT OF 15854
47 OUT OF 15854
48 OUT OF 15854
49 OUT OF 15854
50 OUT OF 15854
51 OUT OF 15854
52 OUT OF 15854
53 OUT OF 15854
54 OUT OF 15854
55 OUT OF 15854
56 OUT OF 15854
57 OUT OF 15854
58 OUT OF 15854
59 OUT OF 15854
60 OUT OF 15854
61 OUT OF 15854
62 OUT OF 15854
63

In [157]:
processed_train_filtered = processed_train.copy()
for idx, (_, row) in enumerate(processed_train_filtered.iterrows()):
	if '#' in row["text"]:
		row["text"] = row["text"][row["text"].find("#"):]
print(processed_train_filtered.shape)

                                                    text    label
0                                  #fear #fear #surprise     fear
1      #happy #happy #happy #happy #happy #sadness #a...  sadness
2               #happy #happy #happy #anger #anger #fear    happy
3              #happy #sadness #fear #surprise #surprise    happy
4                   i feel weird a href http bondmusings     fear
...                                                  ...      ...
15849                   i feel like im a gorgeous person    happy
15850                               #happy #happy #happy    happy
15851        i feel like i have been rather unkind to it    anger
15852  im sure everyone is starting to feel the chris...    happy
15853                #happy #sadness #surprise #surprise     fear

[15854 rows x 2 columns]


In [158]:
processed_test_filtered = processed_test.copy()
for idx, (_, row) in enumerate(processed_test_filtered.iterrows()):
	if '#' in row["text"]:
		row["text"] = row["text"][row["text"].find("#"):]

In [173]:
count_vectorizer_processed=CountVectorizer(analyzer='word', ngram_range=(1, 1))
vectors = count_vectorizer_processed.fit_transform(processed_train_filtered["text"])

In [174]:
for algorithm_under_test in algorithms:
    pipe = Pipeline([('vectorizer', count_vectorizer_processed),
                    ('algo', algorithm_under_test)])
    pipe.fit(processed_train_filtered["text"], y_train["label"])
    predicted = pipe.predict(processed_test_filtered["text"])
    calculate_accuracy(predicted)
    try:
        get_strongest_words('happy', pipe, count_vectorizer_processed)
    except AttributeError:
        pass
    get_confusion_matrix(predicted)
    get_sentences('happy', 'sadness', 5, predicted, processed_test_filtered)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy=0.42658930373360243
      index       val         word
1758   1758  1.803762     talented
1859   1859  1.733294   triumphant
339     339  1.720999      content
971     971  1.696127  invigorated
316     316  1.568278   complacent
1413   1413  1.550632   productive
12       12  1.486480     accepted
556     556  1.454419     ecstatic
1492   1492  1.448381    respected
871     871  1.397202      honored
[[ 51.   3.   3.   3.   1.]
 [  3.  28.   1.   1.   4.]
 [280. 248. 995. 616.  94.]
 [267. 229. 435. 614.  79.]
 [  0.   5.   1.   0.   3.]]
['#fear #fear #surprise']
['#happy #sadness #sadness #sadness #anger #fear #surprise #surprise']
['#sadness']
['#sadness #sadness #fear #fear #surprise #surprise #surprise']
['#happy #sadness #fear #fear #surprise #surprise #surprise #surprise #surprise #surprise']
