# Meaning & Computation - Final Project

## Imports

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#### Import stop-words

In [15]:
stop_words = set(stopwords.words('english'))


## Base Run

#### Prepare data

#### Import data csv's to Pandas data-frame

In [16]:
all_data = pd.read_csv("./data/Emotion_final.csv")
andbrain_data =pd.read_csv("./data/Andbrain_DataSet.csv")

#### Shuffle data

In [17]:
shuffled_data = all_data.sample(frac=1).reset_index(drop=True)

#### Divide in to train & test 

In [18]:
y = shuffled_data["label"]
x = shuffled_data

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

y_train = y_train.to_frame()
y_test = y_test.to_frame()


#### Create algorithms list

In [19]:
algorithms = [LogisticRegression()]
#algorithms = [LogisticRegression(), RandomForestClassifier(), SGDClassifier()]

#### Create and fit count vectorizer

In [20]:
count_vectorizer=CountVectorizer(analyzer='word', ngram_range=(1, 1))
x = count_vectorizer.fit_transform(x_train["text"])
print(x.shape)

(15854, 16304)


#### Calculate accuracy of data function

In [21]:
def calculate_accuracy(predicted_data):
    count_equals = 0
    for idx, (_, row) in enumerate(x_test.iterrows()):
        if row["label"] == predicted_data[idx]:
            count_equals += 1
    print(f"accuracy={count_equals/len(predicted_data)}")

#### Get confusion matrix of predictions data-frame function

In [22]:
def get_confusion_matrix(predictions):
    matrix = np.zeros(25).reshape(5, 5)

    for a,b in zip(predictions, x_test.label):
        matrix[emotions[a]][emotions[b]] += 1
        
    print(matrix)

#### Get sentences function

In [23]:
def get_sentences(true_label, predicted_label, num_of_results, predictions, test_collection):
  counter = 0
  for i, (a, b) in enumerate(zip (test_collection.label, predictions)):
    if counter == num_of_results:
      break
    if a == true_label and b == predicted_label:
      print(test_collection.iloc[[i]]["text"].values)
      counter += 1

#### Get strongest words function

In [24]:
def get_strongest_words(label, pipe, count_vectorizer):
    inverse_dict={count_vectorizer.vocabulary_[w]:w for w in count_vectorizer.vocabulary_.keys()}
    cur_coef=pipe["algo"].coef_[emotions[label]]
    word_df=pd.DataFrame({"val":cur_coef}).reset_index().sort_values(["val"],ascending=[False])
    word_df.loc[:, "word"]=word_df["index"].apply(lambda v:inverse_dict[v])
    print(word_df.head(10))

## Run Base

In [25]:
for algorithm_under_test in algorithms:
    pipe = Pipeline([('vectorizer', count_vectorizer),
                    ('algo', algorithm_under_test)])
    pipe.fit(x_train["text"], y_train["label"])
    predicted = pipe.predict(x_test["text"])
    calculate_accuracy(predicted)
    try:
        get_strongest_words('happy', pipe, count_vectorizer)
    except AttributeError:
        pass
    get_confusion_matrix(predicted)
    get_sentences('happy', 'sadness', 3, predicted, x_test)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy=0.908678102926337
       index       val         word
12404  12404  2.714255    satisfied
13999  13999  2.551725     superior
11945  11945  2.547231     resolved
2984    2984  2.493536    contented
12973  12973  2.474030      sincere
4496    4496  2.466922     ecstatic
11954  11954  2.466773    respected
3043    3043  2.460642    convinced
10738  10738  2.447442      pleased
278      278  2.428949  adventurous
[[ 480.   17.   11.   21.    3.]
 [  13.  466.    9.   23.   33.]
 [  28.   17. 1346.   33.   21.]
 [  41.   29.   28. 1196.    8.]
 [   0.   14.    6.    7.  114.]]
['i lost him i realized that i really didnt have anything to fear and that in reality he was the one person that was helping me to trust again because i would tell him how i felt and he would give me back the same and it was starting to feel safe']
['i feel sure the donation would have been rejected']
['But now she looked exci']


## Experiment

### Prepare

#### Read wikipedia data

In [26]:
train_data_list, cur_sent = list(), list()
with open('./data/wackypedia_en1.words10.20Mwords') as f:
 for line in f:
    line = line.strip()
    if line == '</s>':
        train_data_list.append(cur_sent)
        cur_sent = list()
    elif line != '<s>' and not line.startswith('<text') and not line.startswith('</text'):
        cur_sent.append(line.split('\t')[0])

#### Create model using word2Vec

In [27]:
word2vec_model = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=10, vector_size=500)

#### Filter stopwords from sentences

In [28]:
def get_without_stopwords(sentence):
	word_tokens = word_tokenize(sentence)
 
	filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
	
	filtered_sentence = []
	
	for w in word_tokens:
		if w not in stop_words:
			filtered_sentence.append(w)
	return filtered_sentence

#### Create emotions dictionary

In [29]:
emotions = dict()
emotions["anger"] = 0
emotions["fear"] = 1
emotions["happy"] = 2
emotions["sadness"] = 3
emotions["surprise"] = 4

#### Get word score function

In [30]:
def get_word_score(word):
	scores = [0, 0, 0, 0, 0]
	similar_words = word2vec_model.wv.most_similar(word)[:3]
	similar_words.append((word, 1.0))
	for word_candidate in similar_words:
		try:
			word_score_row = andbrain_data.loc[andbrain_data['word'] == word_candidate[0] + " "]
			if not word_score_row.empty:
				columns = list(word_score_row)
				for column in columns:
					if column != 'word':
						scores[emotions[column]] += word_score_row[column].values[0]
		except KeyError as e:
			continue
	return scores


#### Function that get if all emotions are equal to zero

In [31]:
def is_all_emotions_zero(word_score):
	for score in word_score:
		if score != 0:
			return False
	return True

#### Get sentence score function

In [32]:
def get_sentence_score(word_tokens):
	sentence_score = [0, 0, 0, 0, 0]
	for word in word_tokens:
		try:
			word_score = get_word_score(word)
			if not is_all_emotions_zero(word_score):
				sentence_score[word_score.index(max(word_score))] += 1
		except KeyError:
			pass
	return sentence_score

#### Get score for raw sentence function

In [33]:
def get_score(raw_sentence):
	non_stopwords = get_without_stopwords(raw_sentence)
	sentence_score = np.array(get_sentence_score(non_stopwords))
	return sentence_score

#### Get sentence with score tokens function

In [34]:
def get_sentence_with_score_tokens(sentence, score):
	for emotion in emotions.keys():
		for i in range(score[emotions[emotion]]):
			sentence += " #" + emotion
	return sentence

#### Get process data function

In [35]:
def get_processed_data(data_to_process):
	processed_data = pd.DataFrame(columns=data_to_process.columns)
	for idx, (_, row) in enumerate(data_to_process.iterrows()):
		print(str(idx) + " OUT OF " + str(data_to_process.shape[0]))
		raw_sentence = row["text"]
		sentence_score = get_score(raw_sentence)
		sentence_with_score = get_sentence_with_score_tokens(raw_sentence, sentence_score)
		data_to_add = {'text': str(sentence_with_score), 'label': row["label"]}
		processed_data = processed_data.append(data_to_add, ignore_index=True)
	return processed_data

### Run Experiment

#### Get processed train and test data

In [36]:
processed_train = get_processed_data(x_train)
processed_test = get_processed_data(x_test)

0 OUT OF 15854


TypeError: can only concatenate tuple (not "str") to tuple

#### Get process train data with only the meaning tags

In [None]:
processed_train_filtered = processed_train.copy()
for idx, (_, row) in enumerate(processed_train_filtered.iterrows()):
	if '#' in row["text"]:
		row["text"] = row["text"][row["text"].find("#"):]
print(processed_train_filtered.shape)

                                                    text    label
0                                  #fear #fear #surprise     fear
1      #happy #happy #happy #happy #happy #sadness #a...  sadness
2               #happy #happy #happy #anger #anger #fear    happy
3              #happy #sadness #fear #surprise #surprise    happy
4                   i feel weird a href http bondmusings     fear
...                                                  ...      ...
15849                   i feel like im a gorgeous person    happy
15850                               #happy #happy #happy    happy
15851        i feel like i have been rather unkind to it    anger
15852  im sure everyone is starting to feel the chris...    happy
15853                #happy #sadness #surprise #surprise     fear

[15854 rows x 2 columns]


#### Get process test data with only the meaning tags

In [None]:
processed_test_filtered = processed_test.copy()
for idx, (_, row) in enumerate(processed_test_filtered.iterrows()):
	if '#' in row["text"]:
		row["text"] = row["text"][row["text"].find("#"):]

#### Vectorized processed data

In [None]:
count_vectorizer_processed=CountVectorizer(analyzer='word', ngram_range=(1, 1))
vectors = count_vectorizer_processed.fit_transform(processed_train_filtered["text"])

#### Run pipeline using the precessed data after the change

In [None]:
for algorithm_under_test in algorithms:
    pipe = Pipeline([('vectorizer', count_vectorizer_processed),
                    ('algo', algorithm_under_test)])
    pipe.fit(processed_train_filtered["text"], y_train["label"])
    predicted = pipe.predict(processed_test_filtered["text"])
    calculate_accuracy(predicted)
    try:
        get_strongest_words('happy', pipe, count_vectorizer_processed)
    except AttributeError:
        pass
    get_confusion_matrix(predicted)
    get_sentences('happy', 'sadness', 5, predicted, processed_test_filtered)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy=0.42658930373360243
      index       val         word
1758   1758  1.803762     talented
1859   1859  1.733294   triumphant
339     339  1.720999      content
971     971  1.696127  invigorated
316     316  1.568278   complacent
1413   1413  1.550632   productive
12       12  1.486480     accepted
556     556  1.454419     ecstatic
1492   1492  1.448381    respected
871     871  1.397202      honored
[[ 51.   3.   3.   3.   1.]
 [  3.  28.   1.   1.   4.]
 [280. 248. 995. 616.  94.]
 [267. 229. 435. 614.  79.]
 [  0.   5.   1.   0.   3.]]
['#fear #fear #surprise']
['#happy #sadness #sadness #sadness #anger #fear #surprise #surprise']
['#sadness']
['#sadness #sadness #fear #fear #surprise #surprise #surprise']
['#happy #sadness #fear #fear #surprise #surprise #surprise #surprise #surprise #surprise']
