# Prepare Unlabelled Data for Lexicon Expansion

This script prepares the unlabelled data of lemmas and their corresponding word2vec and fastText embeddings, which will be used for lexicon expansion.

### Dependencies

In [1]:
from gensim.models import KeyedVectors, FastText
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Prepare list of frequent lemmas (10000-20000)

In [4]:
# load lemmas
lemmas = pd.read_csv('../../lemmas/lemma-30k-2017.txt', sep = "\t", header = None)
lemmas = lemmas.rename(columns={0: "POS", 1: "word", 2: "freq"})
lemmas = lemmas.sort_values(["freq"], ascending = False)
lemmas = lemmas[10000:20000]

In [5]:
lemmas

Unnamed: 0,POS,word,freq
10000,NC,slankekur,0.000004
10001,A,autoriseret,0.000004
10002,V,falme,0.000004
10003,NC,smørrebrød,0.000004
10004,NC,øjekast,0.000004
...,...,...,...
19993,I,jovist,0.000001
19992,V,gennemprøve,0.000001
19991,NC,erstatningsansvar,0.000001
19990,NC,brændselscelle,0.000001


In [8]:
# only keep relevant pos tags
pos_tags = ["A", "D", "NC", "V", "I"]
lemmas_relevant = lemmas[lemmas['POS'].isin(pos_tags)]
len(lemmas_relevant)

9937

In [10]:
# load sentida2 base lexicon
sentiments = pd.read_csv("../../lexicons/sentida2_lexicon.csv")
print(len(sentiments))

6592


In [11]:
# get lemmas without that don't have a sentiment score already
lemmas_nosent = lemmas_relevant[~lemmas_relevant['word'].isin(sentiments["word"].tolist())]
len(lemmas_nosent)

9349

### Match with word2vec Embeddings

In [12]:
# load word embeddings and save to dictionary
word2vec = KeyedVectors.load_word2vec_format("../../embeddings/semantic_model_DAGW_cbow.wv.bin", binary=True)
w2v_dict = dict({})
for idx, key in enumerate(word2vec.key_to_index):
    w2v_dict[key] = word2vec[key]

In [13]:
# find embeddings for lemmas with which we want to extend the dictionary
embeddings_to_predict = []
words_to_predict = []

for idx, row in lemmas_nosent.iterrows():
    # get word from lemma df
    word = row["word"]
    # get embedding of corresponding word
    if word in w2v_dict:
        embedding = w2v_dict[word]
        # append word to word list
        words_to_predict.append(word)
        # append embedding to embedding list
        embeddings_to_predict.append(embedding)

In [15]:
# save the embeddings and word to predict to numpy arrays
embeddings_array = np.array(embeddings_to_predict)
words_array = np.array(words_to_predict)

print(embeddings_array.shape, words_array.shape)

np.save("../../data/unlabelled_data/w2v_embeds_to_predict.npy", embeddings_array)
np.save("../../data/unlabelled_data/w2v_words_to_predict.npy", words_array)

(8639, 300) (8639,)


### Match with fastText Embeddings

In [16]:
# load fastText model
ft_model = FastText.load("../../../../dagw_fasttext_embeddings/fasttext_model/fasttext.model")

In [18]:
# find embeddings for lemmas with which we want to extend the dictionary
embeddings_to_predict_ft = []
words_to_predict_ft = []

for idx, row in lemmas_nosent.iterrows():
    # get word from lemma df
    word = row["word"]
    # get embedding of corresponding word
    if word in ft_model.wv:
        embedding = ft_model.wv[word]
        # append word to word list
        words_to_predict_ft.append(word)
        # append embedding to embedding list
        embeddings_to_predict_ft.append(embedding)

In [19]:
# save the embeddings and word to predict to numpy arrays
embeddings_array_ft = np.array(embeddings_to_predict_ft)
words_array_ft = np.array(words_to_predict_ft)

print(embeddings_array_ft.shape, words_array_ft.shape)

np.save("../../data/unlabelled_data/ft_embeds_to_predict.npy", embeddings_array_ft)
np.save("../../data/unlabelled_data/ft_words_to_predict.npy", words_array_ft)

(9349, 300) (9349,)


In [22]:
# extra: some lemmas are not in the vocabulary of fasttext, but estimated using the trained-subwords, here we find them
in_ft_voc=[]

for index, row in sentiments.iterrows():
    raw_word = row["word"]
    word = raw_word.lower()
    if word in ft_model.wv.key_to_index:
        in_ft_voc.append(index)
        
print(len(in_ft_voc)) # those are in the vocabulary
print(9349-len(in_ft_voc)) # those are not in the vocabulary, but estimated from subword embeddings

6500
2849
