In [1]:
import numpy as np
import re
from gensim.models import KeyedVectors
import pandas as pd
from sklearn.model_selection import train_test_split

## Sentiments

In [17]:
# Load the sentiment lexicon CSV (path is relative to the notebook's working directory).
# The preview of this frame shows an `Unnamed: 0` column next to `word` and `score`,
# so the CSV appears to carry a saved row index.
# NOTE(review): consider `index_col=0` if that column is never needed.
sentiments = pd.read_csv("asent/asent/lexicons/da_lexicon_v1.csv")

In [18]:
sentiments[:5]

Unnamed: 0,word,score
0,abe,-1.0
1,abort,-0.333333
2,absolut,0.333333
3,abstrakt,0.666667
4,absurd,-2.333333


## WORD2VEC

### Loading embeddings and sentiments

In [11]:
# Load pre-trained word vectors stored in the binary word2vec format (binary=True).
# NOTE(review): the file name suggests a CBOW model trained on DAGW — confirm provenance.
word2vec = KeyedVectors.load_word2vec_format('embeddings/semantic_model_DAGW_cbow.wv.bin', binary=True)

In [12]:
# Materialise a plain {word: vector} dict from the loaded KeyedVectors so that
# later cells can use ordinary dict membership tests and lookups.
# A comprehension replaces the manual loop; the enumerate() index was never used.
my_dict = {key: word2vec[key] for key in word2vec.key_to_index}

In [14]:
my_dict["glemte"]

array([-5.44577390e-02, -1.95206292e-02, -5.21334074e-02, -3.63027714e-02,
        3.16028632e-02, -4.82380986e-02, -2.97096968e-02,  5.71238063e-02,
       -8.20544083e-03,  1.41782770e-02, -2.00094413e-02,  2.07450497e-03,
        5.30896522e-02, -3.18971500e-02,  5.52097224e-02, -1.45272305e-02,
       -2.79464107e-02, -7.88782090e-02, -8.78869146e-02,  2.97148898e-02,
       -6.26806915e-02, -6.75774515e-02, -1.64186843e-02,  2.36200634e-03,
       -6.16080016e-02,  1.70091186e-02,  2.11043395e-02, -1.25539757e-03,
       -3.77942692e-03,  2.05034167e-02, -3.28624691e-03, -3.09932865e-02,
       -5.50933294e-02,  8.24128240e-02,  6.55494183e-02,  2.80603599e-02,
        1.18187163e-02,  2.51543950e-02,  1.99077446e-02, -6.44484302e-03,
       -6.17770851e-02,  5.34106158e-02, -5.26156500e-02,  5.00716530e-02,
       -8.78605247e-02,  3.96079272e-02,  4.51625548e-02,  5.23860045e-02,
       -7.07231611e-02,  5.22112995e-02, -5.58102131e-02,  5.78433312e-02,
        6.42338991e-02, -

### Combining embeddings and sentiments

In [61]:
# Pair every lexicon word that has a word2vec vector with its sentiment score.
#   X         -> embedding vectors
#   y         -> sentiment scores, in the same order as X
#   not_found -> lower-cased lexicon words that are missing from my_dict
X, y, not_found = [], [], []

for raw_word, sent in zip(sentiments["word"], sentiments["score"]):
    word = raw_word.lower()  # lookups are done on the lower-cased form
    if word not in my_dict:
        not_found.append(word)
        continue
    X.append(my_dict[word])
    y.append(sent)

In [63]:
print("embeddings:", len(X), "; sentiment labels:", len(y), "; not embedding found:", len(not_found))

embeddings: 6420 ; sentiment labels: 6420 ; not embedding found: 172


In [64]:
# Save the lexicon words that had no word2vec embedding, one word per line.
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding keeps non-ASCII characters intact regardless of the
# platform's default encoding.
with open("appendix/not_found_words.txt", "w", encoding="utf-8") as textfile:
    textfile.writelines(element + "\n" for element in not_found)

In [65]:
# Turn the collected word2vec features and scores into NumPy arrays,
# report their shapes, then persist both under output/ for later cells.
X_array, y_array = np.array(X), np.array(y)

print("Shape of X:", X_array.shape, "; Shape of y:", y_array.shape)

for npy_path, npy_data in (("output/X_array.npy", X_array),
                           ("output/y_array.npy", y_array)):
    np.save(npy_path, npy_data)

Shape of X: (6420, 300) ; Shape of y: (6420,)


In [58]:
set(y)

{-5.0,
 -4.66666666666667,
 -4.33333333333333,
 -4.16666666666667,
 -4.0,
 -3.66666666666667,
 -3.6666666666666665,
 -3.5,
 -3.33333333333333,
 -3.16666666666667,
 -3.0,
 -2.83333333333333,
 -2.66666666666667,
 -2.5,
 -2.33333333333333,
 -2.16666666666667,
 -2.0,
 -1.66666666666667,
 -1.5,
 -1.33333333333333,
 -1.16666666666667,
 -1.0,
 -0.833333333333333,
 -0.666666666666667,
 -0.5,
 -0.444444444444444,
 -0.333333333333333,
 -0.166666666666667,
 0.0,
 0.111111111111111,
 0.166666666666667,
 0.333333333333333,
 0.444444444444444,
 0.5,
 0.666666666666667,
 0.777777777777778,
 0.833333333333333,
 1.0,
 1.11111111111111,
 1.16666666666667,
 1.33333333333333,
 1.5,
 1.66666666666667,
 1.83333333333333,
 2.0,
 2.33333333333333,
 2.333333333333333,
 2.5,
 2.66666666666667,
 2.83333333333333,
 3.0,
 3.16666666666667,
 3.33333333333333,
 3.66666666666667,
 4.0,
 4.33333333333333,
 4.66666666666667,
 5.0}

In [None]:
# Split the data into train / validation / test (64 % / 16 % / 20 %).
#
# The sentiment scores are continuous: set(y) lists dozens of distinct values,
# some differing only by float rounding, so some "classes" are presumably very
# rare. train_test_split raises when a stratification class has fewer than two
# members, so the raw scores cannot be used for `stratify`. Stratify on
# polarity (score > 0) instead — the same cut used when the labels are
# binarised later in this notebook.
#
# The first split goes into *_trainval names so that the second split does not
# overwrite its own inputs; re-running the cell therefore gives the same
# result instead of shrinking the training set again.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, train_size=0.8,
    random_state=3, shuffle=True, stratify=np.asarray(y) > 0)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, train_size=0.8,
    random_state=3, shuffle=True, stratify=np.asarray(y_trainval) > 0)

## Model

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import plot_model

import pydot, graphviz

2021-12-10 14:49:23.135094: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-10 14:49:23.135134: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [14]:
def nn_model(optimizer, loss="mse", metrics="accuracy", input_dim=300, hidden_units=100):
    """Build and compile a feed-forward network with one hidden layer.

    Architecture: Dense(hidden_units, relu) -> Dense(1) with no output
    activation. With the default "mse" loss the model is a regressor.

    Parameters
    ----------
    optimizer
        Passed unchanged to ``model.compile`` (e.g. ``"adam"``).
    loss : default "mse"
        Passed unchanged to ``model.compile``.
    metrics : str or list/tuple/dict, default "accuracy"
        A bare string is wrapped in a list before compiling, because
        ``compile`` documents ``metrics`` as a list.
        NOTE(review): "accuracy" is not informative for a linear output
        trained with MSE — consider "mae"; default kept for compatibility.
    input_dim : int, default 300
        Length of each input vector.
    hidden_units : int, default 100
        Width of the hidden layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Normalise a single metric name to the list form expected by compile().
    if isinstance(metrics, str):
        metrics = [metrics]

    model = Sequential()
    model.add(Dense(hidden_units, input_shape=(input_dim,), activation="relu"))
    model.add(Dense(1))  # single linear output unit

    # Loss, optimizer and metrics all come from the function arguments.
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=metrics)

    return model

In [15]:
model = nn_model(optimizer="adam")

In [16]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 100)               30100     
                                                                 
 dense_8 (Dense)             (None, 1)                 101       
                                                                 
Total params: 30,201
Trainable params: 30,201
Non-trainable params: 0
_________________________________________________________________


In [10]:
plot_model(model)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [19]:
y = np.load("output/y_array.npy")

In [20]:
y

array([-1.        , -0.33333333,  0.33333333, ..., -1.        ,
       -2.        , -3.        ])

In [21]:
min(y)

-5.0

In [22]:
max(y)

5.0

In [24]:
y.mean()

-0.19286085150571133

In [27]:
np.median(y)

0.333333333333333

## FASTTEXT

In [19]:
# pip install gensim==4.1.2
from gensim.models import FastText
import os

In [20]:
ft_model = FastText.load("../../dagw_fasttext_embeddings/fasttext_model/fasttext.model")

In [22]:
# Pair every lexicon word with its FastText vector and sentiment score.
#   ft_X        -> embedding vectors
#   ft_y        -> sentiment scores, in the same order as ft_X
#   ft_notfound -> lower-cased words for which `word in ft_model.wv` is False
# NOTE(review): in the recorded run every word passed this membership test
# (not found: 0). Presumably FastText composes vectors for unseen words from
# sub-word units, so this check may never fail — confirm whether true
# vocabulary membership (ft_model.wv.key_to_index) was intended.
ft_X, ft_y, ft_notfound = [], [], []
ft_vectors = ft_model.wv

for raw_word, sent in zip(sentiments["word"], sentiments["score"]):
    word = raw_word.lower()
    if word in ft_vectors:
        ft_X.append(ft_vectors[word])
        ft_y.append(sent)
    else:
        ft_notfound.append(word)

In [24]:
print("y:", len(ft_y))
print("X:", len(ft_X))
print("not found:", len(ft_notfound))

y: 6592
X: 6592
not found: 0


In [25]:
# Convert the FastText features/scores to NumPy arrays, print their shapes,
# and write them to output/ so they can be reloaded without redoing the lookup.
X_ft_array = np.array(ft_X)
y_ft_array = np.array(ft_y)

shape_X, shape_y = X_ft_array.shape, y_ft_array.shape
print("Shape of X:", shape_X, "; Shape of y:", shape_y)

np.save("output/X_ft_array.npy", X_ft_array)
np.save("output/y_ft_array.npy", y_ft_array)

Shape of X: (6592, 300) ; Shape of y: (6592,)


In [2]:
X_fast = np.load("output/X_ft_array.npy")

In [3]:
X_fast.shape

(6592, 300)

In [4]:
X_fast[:10]

array([[ 9.0589165e-06, -5.4946134e-04, -1.0366088e-03, ...,
         6.6615443e-04, -6.1250117e-04, -9.8689939e-05],
       [-4.1444917e-04, -4.5197987e-04, -7.7520940e-04, ...,
        -3.5094662e-04,  5.7898625e-04,  3.0868297e-04],
       [-3.8131647e-04, -2.1416892e-05,  2.0031886e-04, ...,
        -2.5782618e-04,  9.4951414e-05,  4.1081013e-05],
       ...,
       [ 6.8663503e-04, -2.5554580e-04,  2.8461125e-04, ...,
         1.6999872e-04,  2.7255900e-04, -2.2275893e-04],
       [ 8.8844029e-04, -3.2385800e-04,  4.2867288e-04, ...,
        -2.5822101e-05,  9.4575505e-04,  3.8522884e-04],
       [ 6.3543417e-04,  4.4755335e-04,  5.5332790e-04, ...,
        -3.3969097e-04, -5.0948677e-04,  1.1221757e-04]], dtype=float32)

In [5]:
y_fast = np.load("output/y_ft_array.npy")

In [6]:
y_fast[:10]

array([-1.        , -0.33333333,  0.33333333,  0.66666667, -2.33333333,
        1.33333333,  1.33333333,  1.66666667,  1.66666667,  0.66666667])

In [7]:
max(y_fast)

5.0

In [8]:
min(y_fast)

-5.0

In [10]:
y_fast.mean()

-0.1989406351132686

In [16]:
# Reload the FastText features and scores that were saved to output/ earlier.
# NOTE(review): X and y were bound to the word2vec data earlier in this notebook;
# rebinding them here means every later cell operates on the FastText arrays.
X = np.load("output/X_ft_array.npy")
y = np.load("output/y_ft_array.npy")

In [17]:
# Collapse the continuous scores to polarity labels in one expression:
# scores > 0 become 1, scores <= 0 become -1, then cast to integers.
y = np.where(y > 0, 1, np.where(y <= 0, -1, y)).astype(int)

In [19]:
y[:10]

array([-1, -1,  1,  1, -1,  1,  1,  1,  1,  1])

In [20]:
# Re-encode the -1/1 polarity labels with LabelBinarizer; the preview of the
# result shows a single column of 0/1 values, one row per sample.
# NOTE(review): this import is separate from the notebook's main import cells.
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
y = lb.fit_transform(y)

In [21]:
y[:10]

array([[0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [24]:
# List the scorer names accepted by the `scoring=` argument (e.g. GridSearchCV).
# Importing the submodule explicitly guarantees `sklearn.metrics` is loaded and
# still binds the top-level `sklearn` name.
import sklearn.metrics

# `SCORERS` was deprecated and later removed from scikit-learn in favour of
# `get_scorer_names()`; use the new accessor when it exists and fall back to
# the old mapping on older installations.
if hasattr(sklearn.metrics, "get_scorer_names"):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()
sorted(scorer_names)

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_we