In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
import scipy as sp
from sklearn.model_selection import train_test_split
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



In [3]:
stop_words = set(stopwords.words('english'))

# Base Run

In [4]:
all_data = pd.read_csv("./data/Emotion_final.csv")
andbrain_data =pd.read_csv("./data/Andbrain_DataSet.csv")

In [5]:
shuffled_data = all_data.sample(frac=1).reset_index(drop=True)

In [6]:
y = shuffled_data["label"]
x = shuffled_data

x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=42)

y_train = y_train.to_frame()
y_test = y_test.to_frame()

In [7]:
algorithms = [RandomForestClassifier(), LogisticRegression()]

In [129]:
count_vectorizer=CountVectorizer(analyzer='word', ngram_range=(1, 1))
x = count_vectorizer.fit_transform(x_train["text"])
print(x.shape)

(15854, 16307)


In [9]:
def calculate_accuracy(predicted_data):
    count_equals = 0
    for idx, (_, row) in enumerate(x_test.iterrows()):
        if row["label"] == predicted_data[idx]:
            count_equals += 1
    print(f"accuracy={count_equals/len(predicted_data)}")

In [130]:
for algorithm_under_test in algorithms:
    pipe = Pipeline([('vectorizer', count_vectorizer),
                    ('algo', algorithm_under_test)])
    pipe.fit(x_train["text"], y_train["label"])
    predicted = pipe.predict(x_test["text"])
    calculate_accuracy(predicted)

accuracy=0.8915237134207871


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


accuracy=0.9132189707366297


# Data Processing

In [11]:
train_data_list, cur_sent = list(), list()
with open('./data/wackypedia_en1.words10.20Mwords') as f:
 for line in f:
    line = line.strip()
    if line == '</s>':
        train_data_list.append(cur_sent)
        cur_sent = list()
    elif line != '<s>' and not line.startswith('<text') and not line.startswith('</text'):
        cur_sent.append(line.split('\t')[0])

In [12]:
# Builds the models

# model_big_ten = gensim.models.Word2Vec(
#     train_data_list, min_count=5, window=1, vector_size=10)
# model_small_ten = gensim.models.Word2Vec(
#     train_data_list, min_count=5, window=10, vector_size=10)

# model_big_five = gensim.models.Word2Vec(
#     train_data_list, min_count=5, window=1, vector_size=500)
model_small_five = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=10, vector_size=500)

models = [model_small_five]

In [13]:
def get_without_stopwords(sentence):
	word_tokens = word_tokenize(sentence)
 
	filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
	
	filtered_sentence = []
	
	for w in word_tokens:
		if w not in stop_words:
			filtered_sentence.append(w)
	return filtered_sentence

In [14]:
emotions = dict()
emotions["happy"] = 0
emotions["sadness"] = 1
emotions["anger"] = 2
emotions["fear"] = 3
emotions["surprise"] = 4

In [15]:
def get_word_score(word):
	scores = [0, 0, 0, 0, 0]
	similar_words = models[0].wv.most_similar(word)[:3]
	similar_words.append((word, 1.0))
	for word_candidate, correlation in similar_words:
		try:
			word_score_row = andbrain_data.loc[andbrain_data['word'] == word_candidate + " "]
			if not word_score_row.empty:
				columns = list(word_score_row)
				for column in columns:
					if column != 'word':
						scores[emotions[column]] += word_score_row[column].values[0]
		except KeyError as e:
			continue
	return scores


In [16]:
def is_all_emotions_zero(word_score):
	result = True
	for score in word_score:
		if score != 0:
			result = False
			break
	return result

In [17]:
def get_sentence_score(word_tokens):
	sentence_score = [0, 0, 0, 0, 0]
	for word in word_tokens:
		try:
			word_score = get_word_score(word)
			if not is_all_emotions_zero(word_score):
				sentence_score[word_score.index(max(word_score))] += 1
			# word_score_row = andbrain_data.loc[andbrain_data['word'] == word + " "]
			# if not word_score_row.empty:
			# 	print(models[0].wv.most_similar(word)[:3])
		except KeyError:
			continue
	return sentence_score


In [18]:
def get_vector_with_score(vector, raw_sentence):
	non_stopwords = get_without_stopwords(raw_sentence)
	sentence_score = np.array(get_sentence_score(non_stopwords))
	return np.concatenate((vector, sentence_score))

In [127]:
def get_processed_data(data_to_process, title):
	count_vectorizer=CountVectorizer(analyzer='word', ngram_range=(1, 1))
	vectors = count_vectorizer.fit_transform(data_to_process["text"])
	# new_count_vector = pd.DataFrame(columns=["vector"])
	new_count_vector = np.zeros(shape=(data_to_process.shape[0], vectors.shape[1] + 5))
	print("WORKING ON " + title)
	for idx, vector in enumerate(vectors):
		print(str(idx) + " OUT OF " + str(vectors.shape[0]))
		raw_sentence = data_to_process.iloc[[idx]]	
		new_vector_data = get_vector_with_score(vector.toarray()[0], raw_sentence.values[0][0])
		new_count_vector[idx] = new_vector_data
		print(new_count_vector[idx])
		# new_count_vector = np.append(new_count_vector, new_vector_data, axis=1)
		# new_count_vector.append(new_vector_data)
	print("DONE WITH " + title, end="\n")
	return new_count_vector

In [128]:
processed_train = get_processed_data(x_train, "TRAIN")
processed_test = get_processed_data(x_test, "TEST")

MemoryError: Unable to allocate 1.93 GiB for an array with shape (15854, 16312) and data type float64

In [None]:
print(processed_test)

[]


In [None]:

for algorithm_under_test in algorithms:
    pipe = Pipeline([('algo', algorithm_under_test)])
    pipe.fit(processed_train_shaped, y_train["label"])
    predicted = pipe.predict(processed_test)
    calculate_accuracy(predicted)

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.