In [None]:
# Install flair straight from its GitHub repository using the python3.7 interpreter's pip.
# NOTE(review): the install is not pinned to a tag or commit, so the flair API used below may differ between runs — consider pinning.
! python3.7 -m pip install git+https://github.com/zalandoresearch/flair.git

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Seed NumPy's global random number generator so that runs are repeatable.
np.random.seed(42)

In [None]:
# Load the semicolon-separated dataset from the working directory.
source = pd.read_csv("refined_for_bert.csv", sep=";")

In [None]:
# Keep only the two columns used downstream: "subs" (written out as the label) and "text".
source_r = source[["subs", "text"]]

In [None]:
# source_r_pruned = source_r[source_r['subs'].isin(["neutrality", "happiness", "anger", "sadness"])]

In [None]:
# len(source_r_pruned)

In [None]:
# Shuffle the dataframe and split in three parts. 80% train, 10% dev and 10% test.
# random_state pins the shuffle to this cell, so re-running it reproduces the same
# split no matter how much of NumPy's global random stream was consumed before.
shuffled = source_r.sample(frac=1, random_state=42)
train_end = int(.8 * len(shuffled))
dev_end = int(.9 * len(shuffled))
# Contiguous, non-overlapping slices of the shuffled frame.
train = shuffled.iloc[:train_end]
dev = shuffled.iloc[train_end:dev_end]
test = shuffled.iloc[dev_end:]

In [None]:
def convert_to_fasttext(df, part="train"):
    """Write ``df`` to ``./{part}.txt`` in fastText classification format.

    Every row becomes exactly one line of the form ``__label__<subs> <text>``.

    Args:
        df: DataFrame with a ``subs`` column (the label) and a ``text`` column.
        part: Name of the split; used as the stem of the output file name.

    The output file is overwritten on each call, so re-running the export
    does not append duplicate examples to an existing file.
    """
    # 'w' (not 'a') keeps the export idempotent; UTF-8 is set explicitly so the
    # result does not depend on the platform's default encoding.
    with open(f'./{part}.txt', 'w', encoding='utf-8') as fil:
        for label, text in zip(df["subs"], df["text"]):
            # The format is one example per line, so line breaks embedded in
            # the text are replaced by spaces.
            flat_text = " ".join(str(text).splitlines())
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep here would yield "\r\r\n" on Windows.
            fil.write(f"__label__{label} {flat_text}\n")
    

In [None]:
# Export each split to its own fastText-formatted text file.
for split_name, split_df in (("train", train), ("dev", dev), ("test", test)):
    convert_to_fasttext(split_df, split_name)

In [None]:
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path
from flair.data import TaggedCorpus

In [None]:
# Directory that must contain train.txt, dev.txt and test.txt (use your own data path).
# NOTE(review): convert_to_fasttext writes its files to the working directory ("./"),
# not to ./refined_subs/ — confirm the files are placed here before loading.
data_folder = Path('./refined_subs/')

# Build the corpus from the three split files.
corpus: TaggedCorpus = NLPTaskDataFetcher.load_classification_corpus(
    data_folder,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

In [None]:
# print(corpus.obtain_statistics())

In [None]:
# Peek at the first five training sentences.
corpus.train[:5]

In [None]:
# Build the label dictionary from the corpus; it is handed to the classifier and
# later used to translate between label names and integer indices.
label_dict = corpus.make_label_dictionary()

In [None]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, BertEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [None]:
# flair_forward_embedding = FlairEmbeddings('dutch-forward')
# flair_backward_embedding = FlairEmbeddings('dutch-backward')
# Use the 'bert-base-multilingual-cased' BERT model as the only embedding.
bert_embeddings = BertEmbeddings('bert-base-multilingual-cased')
# List of embeddings that is handed to the document-level pooling embedding.
embeddings = [bert_embeddings]

In [None]:
# Wrap the embeddings in a pooled document embedding; it is used both by the
# classifier and for the feature extraction further down.
document_embeddings: DocumentPoolEmbeddings = DocumentPoolEmbeddings(embeddings)

In [None]:
# from hyperopt import hp
# from flair.hyperparameter.param_selection import SearchSpace, Parameter

# # define your search space
# search_space = SearchSpace()
# search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
#     embeddings
# ])

# search_space.add(Parameter.)
# search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.03, 0.07, 0.09, 0.1])
# search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[8, 16, 20])

In [None]:
# from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue

# # create the parameter selector
# param_selector = TextClassifierParamSelector(
#     corpus, 
#     multi_label=False, 
#     base_path='optimization/results', 
#     document_embedding_type='mean',
#     max_epochs=50, 
#     training_runs=3,
#     optimization_value=OptimizationValue.DEV_SCORE
# )

# # start the optimization
# param_selector.optimize(search_space, max_evals=100)

In [None]:
from flair.optim import AdamW, SGDW
from torch.optim.adam import Adam

# 5. create the text classifier on top of the document embeddings (single-label: multi_label=False)
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer with torch's Adam as the optimizer
# NOTE(review): AdamW and SGDW are imported in this cell but not used here.
trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

In [None]:
# 7. start the training; the first argument is the path the trainer is given for this run
# NOTE(review): learning_rate=0.1 is unusually high for Adam — confirm this is intended.
trainer.train('./train_results_pool_bert_200epochs_0.1lr_adam/',
              learning_rate=0.1,
              anneal_factor=0.7,
              mini_batch_size=32,
              max_epochs=200)

In [None]:
from flair.data import Sentence

# Replace the in-memory classifier with one loaded from a saved model file.
# NOTE(review): this path ("..._50epochs_0.01lr") differs from the directory passed to
# trainer.train in this notebook ("..._200epochs_0.1lr_adam") — confirm which model is intended.
classifier = TextClassifier.load_from_file('./train_results_pool_bert_50epochs_0.01lr/best-model.pt')

In [None]:
# create example sentence (Dutch input text)
sentence = Sentence('Dit is toch ongelooflijk, prachtig deze service.')

# predict labels for the sentence; the predictions are attached to the sentence object
# (multi_class_prob=True presumably requests a score for every class — TODO confirm)
classifier.predict(sentence, multi_class_prob=True)

# show the labels attached by the prediction
print(sentence.labels)

# Let's extract embeddings now and use them as features for a simple linear model.

In [None]:
# Keep a handle on the training sentences for the feature extraction below.
all_train_sentences = corpus.train

# Compute a document embedding for every training sentence (stored on the sentence).
for s in corpus.train:
    document_embeddings.embed(s)

In [None]:
# Collect one feature vector per training sentence. The embedding is read from the
# comprehension's own variable (not from a leftover global of an earlier loop), so
# each row holds the embedding of its own sentence.
all_embeddings = [sentence.get_embedding().numpy() for sentence in all_train_sentences]

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Encode each training sentence's first label name as its integer index in label_dict.
all_labels = [label_dict.get_idx_for_item(s.get_label_names()[0]) for s in corpus.train]

In [None]:
# Fit a linear SVM (one-vs-rest, class-balanced weights) on the sentence embeddings.
clf = LinearSVC(class_weight="balanced", multi_class="ovr", max_iter=10000)
clf.fit(all_embeddings, all_labels) 

In [None]:
# Pick a single test sentence (index 69) and display it for inspection.
test_sentence = corpus.test[69]
test_sentence

In [None]:
# Embed the chosen test sentence first; the SVM works on the embedding vector.
document_embeddings.embed(test_sentence)

# Predict the label index for this one sentence (predict expects a list of feature vectors).
clf.predict([test_sentence.get_embedding().numpy()])

In [None]:
# Embed every test sentence and print the label name the SVM assigns to it.
for test_doc in corpus.test:
    document_embeddings.embed(test_doc)
    features = test_doc.get_embedding().numpy()
    predicted_idx = clf.predict([features])[0]
    print(label_dict.get_item_for_index(predicted_idx))

In [None]:
# Ideas from Robin:
# - BOW might work better for emotions, because the emotion of a sentence is often determined by a few keywords. So more of a lexicon-based sentiment analysis.
# - Maybe predicting emotion from text alone is not a linear problem at all: we humans already have trouble doing that without video and audio context.
# - Add a non-linear layer to address the above?
# - We could try adding a softmax layer at the end of the linear model. (already part of TextClassifier)
# - It is also a valid outcome if it turns out that text alone is not informative enough to determine emotion! Nothing wrong with that.

In [None]:
# lr 0.0001