In [33]:
import pandas as pd
import numpy as np
import flair
from flair.embeddings import FlairEmbeddings
from tqdm import tqdm
import torch
from flair.data import Sentence
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import preprocessing

## tremo dataset

In [2]:
# Load the preprocessed TREMO CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; consider making the
# data directory configurable so the notebook runs on other machines.
tremopath = '/Users/pinarayaz/Jupyter/NLP/data/tremo_preprocessed.csv'
tremo_df = pd.read_csv(filepath_or_buffer=tremopath)
tremo_df.head(n=5)

Unnamed: 0,Entry,ValidatedEmotion
0,yeni gün bir mutlu,Happy
1,gece ol sokak geç kork,Fear
2,gerçek hayal,Sadness
3,arkadaş kaybet üz,Sadness
4,insan çıkar ol tiksin,Disgust


In [4]:
# Plain Python list holding the text of every row's 'Entry' column, in row order.
txt = tremo_df['Entry'].tolist()

In [7]:
# Build the Flair embedding object from the language-model checkpoint at this path.
# NOTE(review): the path is relative — presumably resolved against the notebook's
# working directory; confirm where best-lm.pt is expected to live.
embeddings = FlairEmbeddings('resources/taggers/language_model/best-lm.pt')

In [13]:
# Sanity check: embed a single example sentence and inspect the result.
sentence = Sentence('yeni gün bir mutlu')
embeddings.embed(sentence)

# Show the embedding vector attached to each token.
for token in sentence:
    print(token.embedding)

# After the loop `token` still refers to the last token of the sentence;
# use it to report the embedding's Python type ...
print(type(token.embedding))
# ... and to record the embedding length (size of the first dimension) in `z`.
z = token.embedding.size(0)

tensor([ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         0.0000, -0.7616, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000, -0.0000,  0.0000,  0.0000, -0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0000,
        -0.0000, -0.0000,  0.0000,  0.0000,  0.0000, -0.0000, -0.0000, -0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
        -0.0000, -0.0000, -0.0000,  0.0000,  0.0000, -0.0000, -0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000, -0.0000,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000, -0.7616,
         0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.00

In [14]:
# Build one z-dimensional vector per entry of `txt` by averaging the
# embeddings of its tokens; the result `s` has shape (len(txt), z).
#
# Rows are collected in a list and stacked once at the end. Growing a tensor
# with torch.cat inside the loop copies everything accumulated so far on every
# iteration, which is quadratic in the number of entries.
sentence_vectors = []

#iterating entries (tqdm tracks progress)
for entry in tqdm(txt):
    sentence = Sentence(entry)
    embeddings.embed(sentence)
    # one embedding per token; detach + move to CPU so the stacked result can
    # be converted with .numpy() regardless of the device flair runs on
    token_vectors = [token.embedding.detach().cpu() for token in sentence]
    if token_vectors:
        # sentence embedding = mean of the embeddings of all its words
        sentence_vectors.append(torch.stack(token_vectors).mean(dim=0))
    else:
        # No tokens: keep a placeholder row so rows stay aligned with `txt`.
        # NaN matches what the mean over an empty (0, z) tensor produces.
        sentence_vectors.append(torch.full((z,), float('nan')))

# torch.stack rejects an empty list, so fall back to an empty (0, z) tensor.
s = torch.stack(sentence_vectors) if sentence_vectors else torch.zeros(0, z)

100%|██████████| 25989/25989 [02:13<00:00, 194.00it/s]


In [30]:
# Rows that actually carry a label. The same mask is applied to the features
# and to the labels so both stay aligned row-for-row; filtering only the
# labels would leave `train` and `target` with different lengths as soon as a
# single label is missing.
labelled = tremo_df['ValidatedEmotion'].notnull().values

#tensor to numpy array (one row per entry), keeping only labelled rows
train = s.numpy()[labelled]
#extracting labels of the training set
target = tremo_df.loc[labelled, 'ValidatedEmotion'].values

#encode string labels as consecutive integers (mapping kept in le.classes_)
le = preprocessing.LabelEncoder()
target = le.fit_transform(target)

# features and labels must describe the same rows
assert len(train) == len(target), "feature/label row count mismatch"

In [48]:
#Splitting training set (80% train / 20% validation, fixed seed for repeatability)
x_train, x_valid, y_train, y_valid = train_test_split(train, target, random_state=42, test_size=0.2)

#XGBoost compatible data (labels passed by keyword for both matrices)
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

#defining parameters
params = {
          # 'colsample' was dropped: it is not an XGBoost parameter (the
          # column-sampling knobs are colsample_bytree/bylevel/bynode).
          'colsample_bytree': 0.5,
          'eta': 0.1,
          'max_depth': 8,
          'min_child_weight': 6,
          'objective': 'multi:softmax',
          # number of classes taken from the encoded labels instead of a literal
          'num_class': int(np.unique(target).size),
          'subsample': 0.9,
          # pin the metric used for early stopping; the default for
          # multi:softmax differs between XGBoost versions
          'eval_metric': 'merror',
          }

#Training the model
# merror is an error rate (lower is better), so early stopping must minimise
# it. With maximize=True the first, worst round was treated as the best one
# and training stopped after 30 rounds without any real selection.
# If a custom score where higher is better is supplied instead (e.g. an F1
# based feval), switch maximize back to True.
xgb_model = xgb.train(
                      params,
                      dtrain,
                      num_boost_round=1000,
                      maximize=False,
                      evals=[(dvalid, "Validation")],
                      early_stopping_rounds=30
                      )

[0]	Validation-merror:0.799923
Will train until Validation-merror hasn't improved in 30 rounds.
[1]	Validation-merror:0.79319
[2]	Validation-merror:0.792997
[3]	Validation-merror:0.792997
[4]	Validation-merror:0.792613
[5]	Validation-merror:0.792997
[6]	Validation-merror:0.792805
[7]	Validation-merror:0.792613
[8]	Validation-merror:0.792805
[9]	Validation-merror:0.792613
[10]	Validation-merror:0.792805
[11]	Validation-merror:0.792805
[12]	Validation-merror:0.792805
[13]	Validation-merror:0.792805
[14]	Validation-merror:0.79242
[15]	Validation-merror:0.792997
[16]	Validation-merror:0.79319
[17]	Validation-merror:0.79319
[18]	Validation-merror:0.792805
[19]	Validation-merror:0.792805
[20]	Validation-merror:0.792997
[21]	Validation-merror:0.792805
[22]	Validation-merror:0.792805
[23]	Validation-merror:0.792997
[24]	Validation-merror:0.79319
[25]	Validation-merror:0.79319
[26]	Validation-merror:0.792997
[27]	Validation-merror:0.792997
[28]	Validation-merror:0.792613
[29]	Validation-merror: