In [1]:
import pandas as pd
import os

def _labelled_subset(frame, flag_column, label):
    """Return the rows usable as training examples for one class.

    Keeps rows whose ``available`` flag and ``flag_column`` both equal 1,
    adds a constant ``label`` column and a ``text`` column built as
    ``image_concept + ' ' + message``, and drops rows whose ``text`` is
    null (i.e. either source column was missing).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``available``, ``flag_column``, ``image_concept`` and
        ``message`` columns.
    flag_column : str
        Name of the 0/1 column selecting the class (``'published'`` or
        ``'disabled'``).
    label : str
        Value written to the ``label`` column.

    Returns
    -------
    pandas.DataFrame
        An independent copy; ``frame`` itself is not modified.
    """
    selected = (frame['available'] == 1) & (frame[flag_column] == 1)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of `frame` (avoids SettingWithCopyWarning).
    subset = frame.loc[selected].copy()
    subset['label'] = label
    subset['text'] = subset['image_concept'] + ' ' + subset['message']
    return subset.loc[subset['text'].notnull()]


# The CSV is read without a header row; column names are supplied here.
df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])

# Flag rows whose image '<_id>.jpg' exists in the image folder (1) or not (0).
all_images_path = 'data/all_images'
df['available'] = (
    df['_id']
    .map(lambda image_id: os.path.isfile(os.path.join(all_images_path, image_id + '.jpg')))
    .astype(int)
)

df_published = _labelled_subset(df, 'published', '__label__published')
published_count = len(df_published)

# Cap the disabled class at the size of the published class.
df_disabled = _labelled_subset(df, 'disabled', '__label__disabled').head(published_count)

# Keep only the two columns needed downstream, with a fresh 0..n-1 index.
df_all = pd.concat([df_published, df_disabled], ignore_index=True)[['label', 'text']]

df_all

Unnamed: 0,label,text
0,__label__published,seascape water shoal sea turquoise sun tropica...
1,__label__published,tree travel vacation seashore water hotel isla...
2,__label__published,relaxation beach sea vacation sand recreation ...
3,__label__published,nature travel diving water sea underwater ocea...
4,__label__published,outdoors landscape beach sky nature rural nope...
...,...,...
1435,__label__disabled,sky water seashore sea travel winter ship land...
1436,__label__disabled,travel golf ocean grass water sand nature sea ...
1437,__label__disabled,watercraft water people noperson recreation se...
1438,__label__disabled,adult people class girl grouptogether portrait...


In [2]:
from sklearn.model_selection import train_test_split

def _write_split(split_df, csv_path):
    """Write one split as tab-separated ``label<TAB>text`` lines, no header.

    Runs of whitespace inside ``text`` (including tabs and newlines) are
    collapsed to a single space first, so every example occupies exactly
    one line and contains no stray separator. ``split_df`` is not modified.
    """
    single_line_text = split_df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
    split_df.assign(text=single_line_text).to_csv(csv_path, sep='\t', index=False, header=False)


# 60 % of the rows go to training; the remaining 40 % is split again
# 60/40 into validation (dev) and test. Fixed seed for reproducibility.
train_df, validation_df = train_test_split(df_all, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(validation_df, test_size=0.4, random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_csv = 'flair_classification_data/train.csv'
dev_csv = 'flair_classification_data/dev.csv'
test_csv = 'flair_classification_data/test.csv'

# Create the output folder if needed so the cell also works on a fresh
# checkout (to_csv does not create missing directories).
os.makedirs(os.path.dirname(train_csv), exist_ok=True)

_write_split(train_df, train_csv)
_write_split(validation_df, dev_csv)
_write_split(test_df, test_csv)


In [3]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# Build the Flair corpus from the three split files inside the data folder.
corpus_files = {
    'train_file': 'train.csv',
    'dev_file': 'dev.csv',
    'test_file': 'test.csv',
}
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('flair_classification_data'), **corpus_files
)

2020-03-31 13:32:40,656 Reading data from flair_classification_data
2020-03-31 13:32:40,657 Train: flair_classification_data/train.csv
2020-03-31 13:32:40,658 Dev: flair_classification_data/dev.csv
2020-03-31 13:32:40,660 Test: flair_classification_data/test.csv


  """
  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


In [5]:
# Token-level embeddings fed to the document RNN. Only the 'twitter' word
# vectors are active.
# Optional extras, currently disabled:
#     FlairEmbeddings('news-forward'), FlairEmbeddings('news-backward')
token_embeddings = [WordEmbeddings('twitter')]

# RNN over the token embeddings producing one vector per document.
document_embeddings = DocumentRNNEmbeddings(token_embeddings, hidden_size=128)

In [6]:
from flair.embeddings import Sentence

# Two sample inputs of very different length.
sentence1 = Sentence('The grass is green . And the sky is blue .')
sentence2 = Sentence("""It accounts for virtually all discussion in the media, enjoying priority over such topics as the 2020 US presidential election or the UK finally leaving the EU for good in less than 9 months. People are flooding social media with COVID information, which can only mean one thing: data. Fresh data waiting to be analysed. And analyse it we will.""")

# Embed each sample with the document embedding, then print the shape of
# the resulting embedding.
for sample in (sentence1, sentence2):
    document_embeddings.embed(sample)
    print(sample.get_embedding().shape)



torch.Size([128])
torch.Size([128])


In [7]:
# Label dictionary derived from the corpus.
label_dictionary = corpus.make_label_dictionary()

# Single-label classifier on top of the document embeddings.
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)

# Trainer binding the classifier to the corpus.
trainer = ModelTrainer(classifier, corpus)

2020-03-31 13:32:47,731 Computing label dictionary. Progress:


100%|██████████| 864/864 [00:00<00:00, 185801.82it/s]

2020-03-31 13:32:47,739 [b'published', b'disabled']





In [8]:
# Train the classifier for at most 20 epochs, passing './' (the current
# directory) as the first argument -- presumably the output location, since
# the run log later loads 'best-model.pt' for the final test evaluation.
# The call is the cell's last expression, so its return value is displayed;
# in the recorded run it is a dict with 'test_score', 'dev_score_history',
# 'train_loss_history' and 'dev_loss_history'.
trainer.train('./', max_epochs=20)

2020-03-31 13:32:47,748 ----------------------------------------------------------------------------------------------------
2020-03-31 13:32:47,749 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('twitter')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=100, bias=True)
    (rnn): GRU(100, 128, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=128, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-03-31 13:32:47,750 ----------------------------------------------------------------------------------------------------
2020-03-31 13:32:47,750 Corpus: "Corpus: 864 train + 345 dev + 231 test sentences"
2020-03-31 13:32:47,751 ----------------------------------------------------------------------------------------------------
2020-03-31 13:32:

2020-03-31 13:34:05,098 epoch 5 - iter 2/27 - loss 0.68643901 - samples/sec: 301.78
2020-03-31 13:34:05,269 epoch 5 - iter 4/27 - loss 0.68374841 - samples/sec: 392.04
2020-03-31 13:34:05,520 epoch 5 - iter 6/27 - loss 0.70374759 - samples/sec: 264.43
2020-03-31 13:34:05,774 epoch 5 - iter 8/27 - loss 0.69756867 - samples/sec: 259.66
2020-03-31 13:34:05,991 epoch 5 - iter 10/27 - loss 0.69820455 - samples/sec: 306.20
2020-03-31 13:34:06,171 epoch 5 - iter 12/27 - loss 0.69945021 - samples/sec: 370.70
2020-03-31 13:34:06,408 epoch 5 - iter 14/27 - loss 0.70096400 - samples/sec: 279.63
2020-03-31 13:34:06,624 epoch 5 - iter 16/27 - loss 0.70179895 - samples/sec: 308.04
2020-03-31 13:34:06,819 epoch 5 - iter 18/27 - loss 0.69946518 - samples/sec: 341.16
2020-03-31 13:34:07,016 epoch 5 - iter 20/27 - loss 0.69420349 - samples/sec: 336.80
2020-03-31 13:34:07,231 epoch 5 - iter 22/27 - loss 0.69622840 - samples/sec: 308.57
2020-03-31 13:34:07,401 epoch 5 - iter 24/27 - loss 0.69487832 - samp

2020-03-31 13:34:24,343 epoch 10 - iter 14/27 - loss 0.68988609 - samples/sec: 333.93
2020-03-31 13:34:24,598 epoch 10 - iter 16/27 - loss 0.68628854 - samples/sec: 257.99
2020-03-31 13:34:24,789 epoch 10 - iter 18/27 - loss 0.68433243 - samples/sec: 350.53
2020-03-31 13:34:25,047 epoch 10 - iter 20/27 - loss 0.68454681 - samples/sec: 254.86
2020-03-31 13:34:25,247 epoch 10 - iter 22/27 - loss 0.68216915 - samples/sec: 332.38
2020-03-31 13:34:25,432 epoch 10 - iter 24/27 - loss 0.68329974 - samples/sec: 359.87
2020-03-31 13:34:25,616 epoch 10 - iter 26/27 - loss 0.68269935 - samples/sec: 361.99
2020-03-31 13:34:25,737 ----------------------------------------------------------------------------------------------------
2020-03-31 13:34:25,738 EPOCH 10 done: loss 0.6809 - lr 0.0050
2020-03-31 13:34:26,528 DEV : loss 0.6661758422851562 - score 0.6174
2020-03-31 13:34:26,572 BAD EPOCHS (no improvement): 2
2020-03-31 13:34:26,573 --------------------------------------------------------------

2020-03-31 13:34:43,560 epoch 15 - iter 24/27 - loss 0.68855658 - samples/sec: 299.20
2020-03-31 13:34:43,815 epoch 15 - iter 26/27 - loss 0.68933688 - samples/sec: 259.25
2020-03-31 13:34:43,942 ----------------------------------------------------------------------------------------------------
2020-03-31 13:34:43,943 EPOCH 15 done: loss 0.6908 - lr 0.0025
2020-03-31 13:34:44,728 DEV : loss 0.6645771861076355 - score 0.629
2020-03-31 13:34:44,771 BAD EPOCHS (no improvement): 0
2020-03-31 13:34:59,458 ----------------------------------------------------------------------------------------------------
2020-03-31 13:34:59,660 epoch 16 - iter 2/27 - loss 0.69688779 - samples/sec: 320.26
2020-03-31 13:34:59,912 epoch 16 - iter 4/27 - loss 0.67707573 - samples/sec: 262.32
2020-03-31 13:35:00,140 epoch 16 - iter 6/27 - loss 0.68184748 - samples/sec: 289.96
2020-03-31 13:35:00,333 epoch 16 - iter 8/27 - loss 0.68635160 - samples/sec: 344.91
2020-03-31 13:35:00,561 epoch 16 - iter 10/27 - loss

2020-03-31 13:36:16,786 ----------------------------------------------------------------------------------------------------
2020-03-31 13:36:16,787 Testing using best model ...
2020-03-31 13:36:16,788 loading file best-model.pt
2020-03-31 13:36:20,648 0.619	0.619	0.619
2020-03-31 13:36:20,649 
MICRO_AVG: acc 0.4483 - f1-score 0.619
MACRO_AVG: acc 0.4481 - f1-score 0.6189
disabled   tp: 69 - fp: 53 - fn: 35 - tn: 74 - precision: 0.5656 - recall: 0.6635 - accuracy: 0.4395 - f1-score: 0.6107
published  tp: 74 - fp: 35 - fn: 53 - tn: 69 - precision: 0.6789 - recall: 0.5827 - accuracy: 0.4568 - f1-score: 0.6271
2020-03-31 13:36:20,650 ----------------------------------------------------------------------------------------------------


{'test_score': 0.619,
 'dev_score_history': [0.571,
  0.5768,
  0.5855,
  0.6261,
  0.6116,
  0.6174,
  0.5942,
  0.6087,
  0.6,
  0.6174,
  0.6116,
  0.6,
  0.6058,
  0.6145,
  0.629,
  0.6261,
  0.629,
  0.6319,
  0.6319,
  0.629],
 'train_loss_history': [0.7100120517942641,
  0.6948796687302766,
  0.7066304617457919,
  0.6875062430346454,
  0.6896996520183705,
  0.69245242189478,
  0.6914506179315073,
  0.6796734200583564,
  0.675808087543205,
  0.6808763654143722,
  0.6863869296179878,
  0.6831044554710388,
  0.6859326428837247,
  0.6822632087601556,
  0.6908169079709936,
  0.6807656817966037,
  0.6742793785201179,
  0.6861562993791368,
  0.6731926732593112,
  0.6744583050409952],
 'dev_loss_history': [tensor(0.6817, device='cuda:0'),
  tensor(0.6780, device='cuda:0'),
  tensor(0.6740, device='cuda:0'),
  tensor(0.6720, device='cuda:0'),
  tensor(0.6704, device='cuda:0'),
  tensor(0.6693, device='cuda:0'),
  tensor(0.6699, device='cuda:0'),
  tensor(0.6676, device='cuda:0'),
  tens