# Prepare Labelled Data

This notebook prepares the labelled data used to train the linear regression and neural network models, which predict continuous sentiment scores from word2vec and fastText word embeddings.

### Dependencies

In [15]:
import re, os
import numpy as np
import pandas as pd
import itertools
from gensim.models import KeyedVectors, FastText

### Sentiment Lexicon: Sentida2

Below, the Sentida2 sentiment lexicon is loaded and inspected. It was obtained from [GitHub](https://github.com/Guscode/Sentida2).

In [2]:
# Read the Sentida2 lexicon (one row per scored word) into a DataFrame.
lexicon_path = "../../lexicons/sentida2_lexicon.csv"
sentiments = pd.read_csv(lexicon_path)
# Report how many scored entries the lexicon contains ...
print(len(sentiments))
# ... and preview the first five rows.
sentiments.head(5)

6592


Unnamed: 0,word,score
0,abe,-1.0
1,abort,-0.333333
2,absolut,0.333333
3,abstrakt,0.666667
4,absurd,-2.333333


### Word Embeddings: Word2Vec

In [3]:
# Load the word2vec embeddings stored in binary word2vec format.
word2vec = KeyedVectors.load_word2vec_format("../../embeddings/semantic_model_DAGW_cbow.wv.bin", binary=True)
# Map every vocabulary word to its embedding vector so later cells can use
# plain dict lookups. A comprehension replaces the manual fill loop, whose
# `enumerate` index was never used.
w2v_dict = {key: word2vec[key] for key in word2vec.key_to_index}

__Matching sentida2 sentiment scores with corresponding word embeddings__

In [4]:
X = []  # word2vec embeddings (model inputs)
y = []  # Sentida2 sentiment scores (regression targets)
not_found = []  # lowercased lexicon words that have no word2vec embedding

# Walk the lexicon once, pairing each score with the embedding of its word.
for raw_word, sent in zip(sentiments["word"], sentiments["score"]):
    word = raw_word.lower()  # lookups use the lowercased form
    if word not in w2v_dict:
        # no embedding available: remember the word and move on
        not_found.append(word)
        continue
    X.append(w2v_dict[word])
    y.append(sent)

In [6]:
# Save the lexicon words that have no word2vec embedding, one word per line.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps non-ASCII characters (e.g. Danish æ, ø, å) intact
# regardless of the platform's default encoding.
with open("../../appendix/not_found_w2v_sentida2.txt", "w", encoding="utf-8") as textfile:
    textfile.writelines(element + "\n" for element in not_found)

In [5]:
# number of lexicon words for which no word2vec embedding was found
print(len(not_found))

172


In [7]:
# Turn the collected embeddings and scores into NumPy arrays.
X_array, y_array = np.array(X), np.array(y)

print("Shape of X:", X_array.shape, "; Shape of y:", y_array.shape)

# Persist features and targets next to each other for later model training.
for out_path, out_array in (
    ("../../data/labelled_data/X_w2v_asent.npy", X_array),
    ("../../data/labelled_data/y_w2v_asent.npy", y_array),
):
    np.save(out_path, out_array)

Shape of X: (6420, 300) ; Shape of y: (6420,)


### Word Embeddings: FastText

In [17]:
# Load the saved gensim FastText embedding model from disk.
# NOTE(review): this path climbs four directory levels, whereas every other
# input in this notebook lives under ../../ — confirm the model location
# before re-running on another machine.
ft_model = FastText.load("../../../../dagw_fasttext_embeddings/fasttext_model/fasttext.model")

In [18]:
ft_X = []  # FastText embeddings (model inputs)
ft_y = []  # Sentida2 sentiment scores (regression targets)
ft_not_found = []  # lowercased lexicon words rejected by the membership test

# NOTE(review): membership on `ft_model.wv` appears to succeed for
# out-of-vocabulary words as well (the recorded run found 0 misses, and the
# vocabulary check at the end of the notebook uses `key_to_index` instead) —
# confirm against the gensim FastText documentation.
for raw_word, sent in zip(sentiments["word"], sentiments["score"]):
    word = raw_word.lower()  # lookups use the lowercased form
    if word not in ft_model.wv:
        ft_not_found.append(word)
        continue
    ft_X.append(ft_model.wv[word])
    ft_y.append(sent)

In [19]:
# count of lexicon words that failed the `word in ft_model.wv` check above
len(ft_not_found)

0

In [20]:
# Turn the collected FastText embeddings and scores into NumPy arrays.
X_ft_array, y_ft_array = np.array(ft_X), np.array(ft_y)

print("Shape of X:", X_ft_array.shape, "; Shape of y:", y_ft_array.shape)

# Persist features and targets next to each other for later model training.
for out_path, out_array in (
    ("../../data/labelled_data/X_ft_asent.npy", X_ft_array),
    ("../../data/labelled_data/y_ft_asent.npy", y_ft_array),
):
    np.save(out_path, out_array)

Shape of X: (6592, 300) ; Shape of y: (6592,)


In [20]:
# Extra: some lexicon words are not in the FastText vocabulary itself; their
# vectors are estimated from trained subword embeddings. Here we count them.
# `key_to_index` holds only the words in the model's vocabulary, so
# membership in it separates the two groups.
in_ft_voc = [
    index
    for index, raw_word in sentiments["word"].items()
    if raw_word.lower() in ft_model.wv.key_to_index
]

n_in_vocab = len(in_ft_voc)
print(n_in_vocab)  # those are in the vocabulary
# Derive the total from the lexicon itself rather than hard-coding its size,
# so the count stays correct if the lexicon changes.
print(len(sentiments) - n_in_vocab)  # not in the vocabulary, estimated from subword embeddings

6500
92
