In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
import scipy as sp

# Base Run

In [3]:
train_data = pd.read_csv("./data/emotion-labels-train.csv")
test_data = pd.read_csv("./data/emotion-labels-test.csv")

In [4]:
algorithms = [RandomForestClassifier(), LogisticRegression()]

In [5]:
count_vectorizer=CountVectorizer(analyzer='word', ngram_range=(1, 2))
x = count_vectorizer.fit_transform(train_data["text"])
print(x.shape)

(3613, 43357)


In [6]:
def calculate_accuracy(predicted_data):
    count_equals = 0
    for idx, row in test_data.iterrows():
        if row["label"] == predicted_data[idx]:
            count_equals += 1
    print(f"accuracy={count_equals/len(predicted_data)}")

In [7]:
for algorithm_under_test in algorithms:
    pipe = Pipeline([('vectorizer', count_vectorizer),
                    ('algo', algorithm_under_test)])
    pipe.fit(train_data["text"], train_data["label"])
    predicted = pipe.predict(test_data["text"])
    calculate_accuracy(predicted)

accuracy=0.7972628898790579
accuracy=0.7721196690006366


# Data Processing

In [8]:
train_data_list = list()
for index, row in train_data.iterrows():
    train_data_list.append(row["text"])

In [9]:
# Builds the models

model_big_ten = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=1, vector_size=10)
model_small_ten = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=10, vector_size=10)

model_big_five = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=1, vector_size=500)
model_small_five = gensim.models.Word2Vec(
    train_data_list, min_count=5, window=10, vector_size=500)

models = [model_big_ten, model_big_five, model_small_ten, model_small_five]

In [10]:
# Makes the similarity lists
words = pd.read_csv('./data/SimLex-999/SimLex-999.txt', delimiter='\t')
pairs = list(zip(words['word1'], words['word2']))
golden_sim = words['SimLex999']
results = [[], [], [], []]
for j in range(len(models)):
    for i in range(len(pairs)):
        try:
            similarity = models[j].wv.similarity(pairs[i][0], pairs[i][1])
        except KeyError:
            similarity = 0
        finally:
            results[j].append(similarity)

In [11]:
cor_results = []
golden_sim = words['SimLex999']
for result in results:
    sim_res = sp.stats.spearmanr(result, golden_sim)
    cor_results.append(sim_res)
print(cor_results)

[SpearmanrResult(correlation=nan, pvalue=nan), SpearmanrResult(correlation=nan, pvalue=nan), SpearmanrResult(correlation=nan, pvalue=nan), SpearmanrResult(correlation=nan, pvalue=nan)]


