In [1]:
import torch
import flair
import bert_embedding
from flair.data import Sentence

  from ._conv import register_converters as _register_converters


In [2]:
import io
import pandas as pd
# Read the tweet CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='Testing_Emotions_corrected_indexed.csv')
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,i feel so enraged but helpless at the same time
1,2,2,i just dont know why i am feeling so determine...
2,3,3,i feel just terrible a dirigonzo
3,4,3,i feel that i missed a bunch of names this mor...
4,5,4,i still count that as one of the most well wri...


In [3]:
# Keep the tweet column both as a Series (`text`) and as a plain Python
# list (`txt`), then peek at the first ten entries.
text = data.loc[:, 'tweet']
txt = text.tolist()
print(txt[0:10])

['i feel so enraged but helpless at the same time', 'i just dont know why i am feeling so determined but i am', 'i feel just terrible a   dirigonzo', 'i feel that i missed a bunch of names this morning the group looked so large', 'i still count that as one of the most well written books i ve ever read but it feels weird to enjoy this person s work', 'i feel honoured to be ranked among them', 'i was feeling determined i figured i could overcome my 1 of heights for the sake of another tick on the bucket list', 'i like it there because well i guess i feel welcomed', 'i feel i should say talented yet again', 'i feel like im less afraid of doing a natural type look now but at the same time i dont think its my favourite thing to try']


In [8]:
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import ELMoEmbeddings


### Embedding models (add further entries to the stack below to combine them) ###

# Word-level GloVe vectors, character-level features and the two
# directional Flair language models. They are instantiated here, but only
# BERT is placed in the stack further down.
glove_embedding = WordEmbeddings('glove')
character_embeddings = CharacterEmbeddings()
flair_forward = FlairEmbeddings('news-forward-fast')
flair_backward = FlairEmbeddings('news-backward-fast')

# NOTE: this rebinds the name `bert_embedding`, which the first cell
# imported as a module.
bert_embedding = BertEmbeddings()

# `elmo_embedding` is not created in this cell, so it stays out of the stack.
stacked_embeddings = StackedEmbeddings(embeddings=[bert_embedding])

In [10]:
from flair.data import Sentence
# Embed a short probe sentence to inspect what the stack yields per token.
sentence = Sentence('This is BERT Embedding')
stacked_embeddings.embed(sentence)

for token in sentence:
    print(token.embedding)

# `token` still refers to the last token visited by the loop above.
# data type of a single token embedding #
print(type(token.embedding))
# length of a single token vector; later cells read it as `z` #
z = token.embedding.shape[0]

tensor([-0.6327, -0.4104,  0.1828,  ..., -0.5955, -0.1193,  0.6265])
tensor([-0.3797, -0.4740,  0.5282,  ..., -0.4815, -0.0980,  1.0802])
tensor([ 0.4525,  0.0918,  0.4873,  ...,  0.6933, -0.1054, -0.0516])
tensor([ 0.1616,  0.0039, -0.3333,  ...,  0.0916,  0.0540,  0.1815])
<class 'torch.Tensor'>


In [None]:
from tqdm import tqdm ## tracks progress of loop ##

# Build `s`: one row per tweet, holding the mean of that tweet's token
# embeddings (row i of `s` corresponds to txt[i]).
#
# The per-tweet rows are collected in a list and concatenated once at the
# end. Appending to `s` with torch.cat on every step re-copies the whole
# tensor each time, and appending inside the token loop would add one
# running-mean row per *token* instead of one row per tweet.
pooled_rows = []

# iterating tweets (tqdm tracks progress) #
for q in tqdm(txt):
    sentence = Sentence(q)
    stacked_embeddings.embed(sentence)

    token_vectors = [token.embedding.view(-1, z) for token in sentence]
    if token_vectors:
        # (n_tokens, z) matrix of this tweet's token embeddings
        w = torch.cat(token_vectors, 0)
        pooled_rows.append(w.mean(dim=0).view(-1, z))
    else:
        # A tweet without tokens still gets a (zero) row so that `s`
        # stays aligned with `txt` / `data`.
        w = torch.zeros(0, z)
        pooled_rows.append(torch.zeros(1, z))

# sentence-embedding matrix of shape (len(txt), z) #
s = torch.cat(pooled_rows, 0) if pooled_rows else torch.zeros(0, z)

 41%|█████████████████████████████▌                                          | 4113/10001 [9:17:37<26:02:24, 15.92s/it]

In [None]:
from flair.embeddings import DocumentPoolEmbeddings

# Pool BERT token embeddings into one vector per tweet with flair's
# document-level embedding.
document_embeddings = DocumentPoolEmbeddings([bert_embedding])

# Take the vector width from the embedding object itself. Reading it from
# a leftover `sentence` of an earlier cell depends on hidden kernel state:
# that sentence was never embedded by `document_embeddings`.
z = document_embeddings.embedding_length

# Collect one (1, z) row per tweet and concatenate once at the end;
# growing `s` with torch.cat inside the loop re-copies it every iteration.
doc_rows = []

for tweet in tqdm(txt):
    sentence = Sentence(tweet)
    document_embeddings.embed(sentence)
    doc_rows.append(sentence.embedding.view(-1, z))

# document-embedding matrix, one row per tweet #
s = torch.cat(doc_rows, 0) if doc_rows else torch.zeros(0, z)

In [None]:
## tensor to numpy array ##
# detach() / cpu() keep the conversion working when `s` tracks gradients
# or lives on a GPU; for a plain CPU tensor they are no-ops.
X = s.detach().cpu().numpy()

## Train / test split by row position ##
# Rows before `n_train` form the training set, the remaining rows the
# test set. The boundary is written once so both slices always agree.
n_train = 3962
train = X[:n_train, :]
test = X[n_train:, :]

# extracting labels of the training set (rows whose label is present) #
target = data['label'][data['label'].notnull()].values

In [None]:
def custom_eval(preds, dtrain):
    """F1 evaluation metric in the shape expected by ``xgb.train(feval=...)``.

    Parameters
    ----------
    preds : array-like
        Scores produced by the booster for the rows of ``dtrain``.
        * One score per row: treated as the positive-class probability and
          turned into a 0/1 prediction with a 0.3 threshold.
        * Several scores per row (2-D, or flattened row by row): treated as
          per-class probabilities; the predicted class is the arg-max.
    dtrain : object
        Anything exposing ``get_label()`` (an ``xgb.DMatrix`` in practice)
        that returns the true labels.

    Returns
    -------
    list of (str, float)
        ``[('f1_score', value)]``. Binary F1 for the thresholded case,
        macro-averaged F1 for the multi-class case.
    """
    # Local import: the notebook never imports numpy at top level.
    import numpy as np

    # Built-in `int` replaces the `np.int` alias, which no longer exists in
    # current NumPy releases.
    labels = np.asarray(dtrain.get_label()).astype(int)
    scores = np.asarray(preds)

    if scores.size != labels.size:
        # More than one score per row -> per-class probabilities.
        predicted_class = scores.reshape(labels.size, -1).argmax(axis=1)
        return [('f1_score', f1_score(labels, predicted_class, average='macro'))]

    hard_preds = (scores.reshape(-1) >= 0.3).astype(int)
    return [('f1_score', f1_score(labels, hard_preds))]

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

### Splitting training set ###
# 70 % of the labelled rows are used for fitting, 30 % for validation /
# early stopping. The fixed random_state keeps the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    train,
    target,
    random_state=42,
    test_size=0.3,
)

### XGBoost compatible data ###
# Labels are passed by keyword so they cannot be mistaken for another
# positional DMatrix argument.
dtrain = xgb.DMatrix(x_train, label=y_train)
dvalid = xgb.DMatrix(x_valid, label=y_valid)

### defining parameters ###
# The former 'colsample' entry is dropped: it is not an XGBoost parameter
# (the column-sampling knobs are colsample_bytree / _bylevel / _bynode),
# so it never influenced training.
params = {
    'colsample_bytree': 0.5,
    'eta': 0.1,
    'max_depth': 8,
    'min_child_weight': 6,
    'objective': 'binary:logistic',
    'subsample': 0.9,
}

# binary:logistic only accepts labels in [0, 1]. The label column of this
# dataset holds more than two classes (the preview shows values 0-4), so
# switch to a multi-class objective whenever the labels require it.
# multi:softprob hands the evaluation function one probability per class
# for every row.
num_class = int(target.max()) + 1
if num_class > 2:
    params['objective'] = 'multi:softprob'
    params['num_class'] = num_class

### Training the model ###
# Training stops once the validation F1 (maximised) has not improved for
# 30 consecutive rounds, or after 1000 rounds at the latest.
xgb_model = xgb.train(
    params,
    dtrain,
    feval=custom_eval,
    num_boost_round=1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=30,
)

In [None]:
### Wrapping the test features for XGBoost ###
dtest = xgb.DMatrix(data=test)

### Scoring the test rows with the trained booster ###
predict = xgb_model.predict(data=dtest)