In [19]:
import pandas as pd
import os

df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])
df['available'] = 0

all_images_path = 'data/all_images'
for i, row in df.iterrows():
    if os.path.isfile(os.path.join(all_images_path, row['_id'] + '.jpg')):
        df.at[i, 'available']= 1    
        
df_published = df.loc[df.query('available == 1 and published == 1').index]
df_published['label'] = '__label__published'
df_published['text'] = df_published['image_concept'] + ' ' + df_published['message']
df_published = df_published.loc[df_published['text'].notnull()]
published_count = len(df_published)


df_disabled = df.loc[df.query('available == 1 and disabled == 1').index]
df_disabled['label'] = '__label__disabled'
df_disabled['text'] = df_disabled['image_concept'] + ' ' + df_disabled['message']
df_disabled = df_disabled.loc[df_disabled['text'].notnull()]
df_disabled = df_disabled[:published_count]



df_all = pd.concat([df_published, df_disabled], ignore_index=True)


df_all = df_all.reset_index(drop=True)

df_all

Unnamed: 0,_id,message,image_concept,published,disabled,available,label,text
0,5e5836fee917e8d9a8a7b277,endless blues greatbarrierreef australia whits...,seascape water shoal sea turquoise sun tropica...,1,0,1,__label__published,seascape water shoal sea turquoise sun tropica...
1,5e58343ded065ad79e312f3d,hamiltonisland,tree travel vacation seashore water hotel isla...,1,0,1,__label__published,tree travel vacation seashore water hotel isla...
2,5e57dc939e88b6be2ac42800,we are going coconuts for hamiltonisland here ...,relaxation beach sea vacation sand recreation ...,1,0,1,__label__published,relaxation beach sea vacation sand recreation ...
3,5e55dca437fa5927dcdf02f3,en route to gbr embrace the elevation in luxur...,nature travel diving water sea underwater ocea...,1,0,1,__label__published,nature travel diving water sea underwater ocea...
4,5e55d69eb9e5b725cd7ba02f,golf course views hamiltonislandgolfcourse whi...,outdoors landscape beach sky nature rural nope...,1,0,1,__label__published,outdoors landscape beach sky nature rural nope...
...,...,...,...,...,...,...,...,...
1435,5e4e3124497f22be9069f067,golf trips with the boys are always wicked and...,sky water seashore sea travel winter ship land...,0,1,1,__label__disabled,sky water seashore sea travel winter ship land...
1436,5e4e3124ffb21abead202386,golf trips with the boys are always wicked and...,travel golf ocean grass water sand nature sea ...,0,1,1,__label__disabled,travel golf ocean grass water sand nature sea ...
1437,5e4e3124ffb21abead202385,throwback to that time i was warm and tanned q...,watercraft water people noperson recreation se...,0,1,1,__label__disabled,watercraft water people noperson recreation se...
1438,5e4e3123164d73be9b8cd43e,golf trips with the boys are always wicked and...,adult people class girl grouptogether portrait...,0,1,1,__label__disabled,adult people class girl grouptogether portrait...


In [20]:
df_all[['label', 'text']]

Unnamed: 0,label,text
0,__label__published,seascape water shoal sea turquoise sun tropica...
1,__label__published,tree travel vacation seashore water hotel isla...
2,__label__published,relaxation beach sea vacation sand recreation ...
3,__label__published,nature travel diving water sea underwater ocea...
4,__label__published,outdoors landscape beach sky nature rural nope...
...,...,...
1435,__label__disabled,sky water seashore sea travel winter ship land...
1436,__label__disabled,travel golf ocean grass water sand nature sea ...
1437,__label__disabled,watercraft water people noperson recreation se...
1438,__label__disabled,adult people class girl grouptogether portrait...


In [21]:
from sklearn.model_selection import train_test_split

train_df, validation_df = train_test_split(df_all, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(validation_df, test_size=0.4, random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_csv = 'flair_classification_data/train.csv'
dev_csv = 'flair_classification_data/dev.csv'
test_csv = 'flair_classification_data/test.csv'

train_df[['label', 'text']].to_csv(train_csv, sep='\t', index=False, header=False)
validation_df[['label', 'text']].to_csv(dev_csv, sep='\t', index=False, header=False)
test_df[['label', 'text']].to_csv(test_csv, sep='\t', index=False, header=False)


In [22]:
train_df2 = pd.read_csv('flair_classification_data/train.csv', sep='\t', names=['label', 'text'])
train_df2

Unnamed: 0,label,text
0,__label__published,outdoors nature land water ocean sea shoreline...
1,__label__disabled,exotic hotel noperson beach palm swimming vaca...
2,__label__disabled,zipup noperson science conceptual fashion insu...
3,__label__disabled,sky water vacation sea desktop pattern nature ...
4,__label__disabled,portrait people beautiful sunglasses woman man...
...,...,...
859,__label__disabled,couple woman boat people water sea luxury summ...
860,__label__disabled,sunglasses watersports man fun paddle oar wate...
861,__label__disabled,sand water leisure beach sea tropical vacation...
862,__label__disabled,vacation beach resort sun palm swimmingpool wa...


In [23]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path


In [24]:
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('flair_classification_data'),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv'
)

2020-04-06 12:51:04,492 Reading data from flair_classification_data
2020-04-06 12:51:04,493 Train: flair_classification_data/train.csv
2020-04-06 12:51:04,494 Dev: flair_classification_data/dev.csv
2020-04-06 12:51:04,495 Test: flair_classification_data/test.csv


  """
  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


In [59]:
label_dict = corpus.make_label_dictionary()

2020-04-06 14:03:19,483 Computing label dictionary. Progress:


100%|██████████| 864/864 [00:00<00:00, 167989.92it/s]

2020-04-06 14:03:19,492 [b'published', b'disabled']





In [65]:
corpus.make_vocab_dictionary().get_items()

['<unk>',
 'hamiltonisland',
 'the',
 'water',
 'beach',
 'island',
 'travel',
 'sea',
 'ocean',
 'summer',
 'nature',
 'outdoors',
 'to',
 'australia',
 'and',
 'vacation',
 'a',
 'tropical',
 'in',
 'of',
 'sky',
 'noperson',
 'person',
 'people',
 'human',
 'whitsundays',
 'hamilton',
 'seashore',
 'queensland',
 'vehicle',
 'tree',
 'sun',
 'landscape',
 'リ',
 'boat',
 'ト',
 'recreation',
 'ン',
 'sunset',
 'on',
 'land',
 'sand',
 'we',
 'leisure',
 'coast',
 'i',
 'shoreline',
 'is',
 'transportation',
 'for',
 'this',
 'you',
 'it',
 'plant',
 'with',
 'woman',
 'た',
 'beautiful',
 'relaxation',
 'watercraft',
 'resort',
 's',
 'greatbarrierreef',
 'one',
 'い',
 'fun',
 'our',
 'girl',
 'love',
 'の',
 'ス',
 'portrait',
 'hotel',
 'day',
 'ア',
 'at',
 'man',
 'seascape',
 'reef',
 'vessel',
 'lake',
 'building',
 'ラ',
 'scenery',
 'exotic',
 'な',
 'promontory',
 'っ',
 'に',
 'yacht',
 '行',
 'turquoise',
 'family',
 'オ',
 'bay',
 'so',
 'clothing',
 'whitehavenbeach',
 'apparel',
 '

In [25]:
document_embeddings = DocumentRNNEmbeddings([
        WordEmbeddings('twitter'),
    #     FlairEmbeddings('news-forward'),
    #     FlairEmbeddings('news-backward')
    ], 
    hidden_size=128,
    reproject_words=True,
    reproject_words_dimension=128
)

In [26]:
from flair.embeddings import Sentence

sentence1 = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence1)

# now check out the embedded sentence.
print(sentence1.get_embedding().shape)

sentence2 = Sentence("""It accounts for virtually all discussion in the media, enjoying priority over such topics as the 2020 US presidential election or the UK finally leaving the EU for good in less than 9 months. People are flooding social media with COVID information, which can only mean one thing: data. Fresh data waiting to be analysed. And analyse it we will.""")

# embed the sentence with our document embedding
document_embeddings.embed(sentence2)

# now check out the embedded sentence.
print(sentence2.get_embedding().shape)



torch.Size([128])
torch.Size([128])


In [27]:
classifier = TextClassifier(
    document_embeddings, 
    label_dictionary=corpus.make_label_dictionary(),
    multi_label=False
)

print(classifier)

trainer = ModelTrainer(classifier, corpus)

2020-04-06 12:51:21,221 Computing label dictionary. Progress:


100%|██████████| 864/864 [00:00<00:00, 23923.15it/s]

2020-04-06 12:51:21,261 [b'published', b'disabled']
TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('twitter')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=128, bias=True)
    (rnn): GRU(128, 128, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=128, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)





In [14]:
trainer.train('./', max_epochs=20, learning_rate=0.1)

2020-04-06 12:33:08,182 ----------------------------------------------------------------------------------------------------
2020-04-06 12:33:08,183 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('twitter')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=128, bias=True)
    (rnn): GRU(128, 128, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=128, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-04-06 12:33:08,184 ----------------------------------------------------------------------------------------------------
2020-04-06 12:33:08,185 Corpus: "Corpus: 864 train + 345 dev + 231 test sentences"
2020-04-06 12:33:08,186 ----------------------------------------------------------------------------------------------------
2020-04-06 12:33:

2020-04-06 12:33:54,321 epoch 5 - iter 2/27 - loss 0.64334485 - samples/sec: 331.33
2020-04-06 12:33:54,520 epoch 5 - iter 4/27 - loss 0.66364813 - samples/sec: 335.90
2020-04-06 12:33:54,760 epoch 5 - iter 6/27 - loss 0.66396772 - samples/sec: 275.61
2020-04-06 12:33:54,969 epoch 5 - iter 8/27 - loss 0.67268697 - samples/sec: 317.43
2020-04-06 12:33:55,218 epoch 5 - iter 10/27 - loss 0.67805111 - samples/sec: 265.83
2020-04-06 12:33:55,422 epoch 5 - iter 12/27 - loss 0.67275711 - samples/sec: 327.65
2020-04-06 12:33:55,686 epoch 5 - iter 14/27 - loss 0.66785309 - samples/sec: 250.46
2020-04-06 12:33:55,856 epoch 5 - iter 16/27 - loss 0.67033946 - samples/sec: 395.71
2020-04-06 12:33:56,071 epoch 5 - iter 18/27 - loss 0.66519839 - samples/sec: 309.93
2020-04-06 12:33:56,291 epoch 5 - iter 20/27 - loss 0.67047878 - samples/sec: 302.43
2020-04-06 12:33:56,495 epoch 5 - iter 22/27 - loss 0.66634789 - samples/sec: 326.47
2020-04-06 12:33:56,703 epoch 5 - iter 24/27 - loss 0.66875681 - samp

2020-04-06 12:34:45,118 epoch 10 - iter 14/27 - loss 0.64778780 - samples/sec: 303.12
2020-04-06 12:34:45,304 epoch 10 - iter 16/27 - loss 0.64652052 - samples/sec: 361.39
2020-04-06 12:34:45,501 epoch 10 - iter 18/27 - loss 0.64474945 - samples/sec: 336.85
2020-04-06 12:34:45,712 epoch 10 - iter 20/27 - loss 0.64191903 - samples/sec: 316.46
2020-04-06 12:34:45,934 epoch 10 - iter 22/27 - loss 0.64240784 - samples/sec: 300.67
2020-04-06 12:34:46,187 epoch 10 - iter 24/27 - loss 0.64137957 - samples/sec: 261.14
2020-04-06 12:34:46,443 epoch 10 - iter 26/27 - loss 0.64735124 - samples/sec: 257.45
2020-04-06 12:34:46,559 ----------------------------------------------------------------------------------------------------
2020-04-06 12:34:46,560 EPOCH 10 done: loss 0.6490 - lr 0.1000
2020-04-06 12:34:47,424 DEV : loss 0.6518052816390991 - score 0.6754
2020-04-06 12:34:47,468 BAD EPOCHS (no improvement): 0
2020-04-06 12:35:02,696 --------------------------------------------------------------

2020-04-06 12:35:36,084 epoch 15 - iter 26/27 - loss 0.62285865 - samples/sec: 284.12
2020-04-06 12:35:36,211 ----------------------------------------------------------------------------------------------------
2020-04-06 12:35:36,212 EPOCH 15 done: loss 0.6244 - lr 0.1000
2020-04-06 12:35:37,074 DEV : loss 0.6487848162651062 - score 0.6232
2020-04-06 12:35:37,120 BAD EPOCHS (no improvement): 3
2020-04-06 12:35:37,121 ----------------------------------------------------------------------------------------------------
2020-04-06 12:35:37,338 epoch 16 - iter 2/27 - loss 0.62236482 - samples/sec: 297.75
2020-04-06 12:35:37,537 epoch 16 - iter 4/27 - loss 0.61676866 - samples/sec: 335.52
2020-04-06 12:35:37,755 epoch 16 - iter 6/27 - loss 0.65586740 - samples/sec: 304.20
2020-04-06 12:35:37,974 epoch 16 - iter 8/27 - loss 0.65788950 - samples/sec: 303.05
2020-04-06 12:35:38,144 epoch 16 - iter 10/27 - loss 0.65378333 - samples/sec: 394.20
2020-04-06 12:35:38,386 epoch 16 - iter 12/27 - los

2020-04-06 12:36:26,396 ----------------------------------------------------------------------------------------------------
2020-04-06 12:36:26,397 Testing using best model ...
2020-04-06 12:36:26,398 loading file best-model.pt
2020-04-06 12:36:30,398 0.6537	0.6537	0.6537
2020-04-06 12:36:30,399 
MICRO_AVG: acc 0.4855 - f1-score 0.6537
MACRO_AVG: acc 0.4851 - f1-score 0.6531499999999999
disabled   tp: 71 - fp: 47 - fn: 33 - tn: 80 - precision: 0.6017 - recall: 0.6827 - accuracy: 0.4702 - f1-score: 0.6396
published  tp: 80 - fp: 33 - fn: 47 - tn: 71 - precision: 0.7080 - recall: 0.6299 - accuracy: 0.5000 - f1-score: 0.6667
2020-04-06 12:36:30,399 ----------------------------------------------------------------------------------------------------


{'test_score': 0.6537,
 'dev_score_history': [0.6029,
  0.6348,
  0.6145,
  0.5478,
  0.6638,
  0.6551,
  0.6725,
  0.6638,
  0.6696,
  0.6754,
  0.6319,
  0.6986,
  0.6725,
  0.6667,
  0.6232,
  0.6812,
  0.6928,
  0.687,
  0.658,
  0.7159],
 'train_loss_history': [0.711567136976454,
  0.7023470710825037,
  0.6828084786732992,
  0.6863388772364016,
  0.6666182544496324,
  0.6654118961758084,
  0.6695940538688943,
  0.6611989913163362,
  0.6567812650292008,
  0.6489906333110951,
  0.6326575323387429,
  0.6398089947523894,
  0.6390212575594584,
  0.6334975207293475,
  0.6243826393727903,
  0.6459086482171659,
  0.6180060196805883,
  0.6185051004091898,
  0.6089737724374842,
  0.6159107243573224],
 'dev_loss_history': [tensor(0.6664, device='cuda:0'),
  tensor(0.6800, device='cuda:0'),
  tensor(0.6675, device='cuda:0'),
  tensor(0.6816, device='cuda:0'),
  tensor(0.6610, device='cuda:0'),
  tensor(0.6535, device='cuda:0'),
  tensor(0.6652, device='cuda:0'),
  tensor(0.6579, device='cuda:

In [17]:
classifier([Sentence("this is a sentence")])



tensor([[ 0.6935, -0.8708]], device='cuda:0', grad_fn=<AddmmBackward>)

In [50]:
import torch
import torch.nn as nn
import numpy as np
import random
from flair.training_utils import store_embeddings
train_on_gpu = torch.cuda.is_available()
torch.from_numpy(train_df[:10][['published', 'disabled']].values).shape

torch.Size([10, 2])

In [51]:
def get_batches(df, batch_size=16):
    df = df.sample(frac=1).reset_index(drop=True)
    n_batches = len(df)//batch_size    
    for i in range(0, len(df), batch_size):
        x = []        
        for txt in df[i:i+batch_size]['image_concept']:
            words = txt.split()            
            random.shuffle(words)
            txt = ' '.join(words)
            x.append(Sentence(txt))
        
        # disabled 0, published 1
        y = [1 if is_published else 0 for is_published in df[i:i+batch_size]['published']]
        yield x, torch.tensor(y, dtype=torch.long)


def train_model(model, epochs, lr, train_df, val_df, checkpoint_file, early_stopping=5):        
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = np.inf
    no_improvement = 0

    if train_on_gpu:
        model = model.cuda()
        
    for epoch in range(epochs):        
        total_train_loss = 0
        total_val_loss = 0
        train_loss = 0
        val_loss = 0
        
        # Train
        model.train()        
        for i, (sentences, labels) in enumerate(get_batches(train_df)):         
            if train_on_gpu:
                labels = labels.cuda()
            
            optimizer.zero_grad()
            
            out = model(sentences)            
            
            loss = criterion(out, labels)
            loss.backward()
            
            nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            
            total_train_loss += loss.item()
            
            store_embeddings(sentences, 'cpu')
            
            if i % 10 == 0:
                print(f"Epoch {epoch}, Batch {i}, train loss {loss.item()/labels.size(0)}")
            
            
        train_loss = total_train_loss/len(train_df)
        print(f"> Epoch {epoch}, train loss {train_loss}")
        
        # Eval
        model.eval()
        for sentences, labels in get_batches(val_df):
            if train_on_gpu:
                labels = labels.cuda()
            
            out = model(sentences)
            loss = criterion(out, labels)
            total_val_loss += loss.item()
            
            store_embeddings(sentences, 'cpu')
            
            
        val_loss = total_val_loss / len(val_df)
        
        print(f"> Epoch {epoch}, val loss {val_loss}")
        
        if val_loss < best_loss:
            best_loss = val_loss
            no_improvement = 0
            torch.save(model.state_dict(), checkpoint_file)
            print("Saved model.")
        else:
            no_improvement += 1
            print("No improvement.")
            if no_improvement >= early_stopping:
                print(f"Early Stopping")
                break
            
                                              
checkpoint_file = 'flair_text_model_2.pt'      
lr = 0.005
epochs = 5            

train_model(classifier, epochs, lr, train_df, validation_df, checkpoint_file)

Epoch 0, Batch 0, train loss 0.06326187402009964
Epoch 0, Batch 10, train loss 0.06166648492217064
Epoch 0, Batch 20, train loss 0.0313153900206089
Epoch 0, Batch 30, train loss 0.03767106682062149
Epoch 0, Batch 40, train loss 0.04703482985496521
Epoch 0, Batch 50, train loss 0.0449230782687664
> Epoch 0, train loss 0.039944981759483064
> Epoch 0, val loss 0.018333641817604288
Saved model.
Epoch 1, Batch 0, train loss 0.03845386579632759
Epoch 1, Batch 10, train loss 0.020252227783203125
Epoch 1, Batch 20, train loss 0.012711026705801487
Epoch 1, Batch 30, train loss 0.019124727696180344
Epoch 1, Batch 40, train loss 0.056158095598220825
Epoch 1, Batch 50, train loss 0.024085968732833862
> Epoch 1, train loss 0.024453740704942634
> Epoch 1, val loss 0.014631336430708568
Saved model.
Epoch 2, Batch 0, train loss 0.020150186493992805
Epoch 2, Batch 10, train loss 0.013921372592449188
Epoch 2, Batch 20, train loss 0.023955384269356728
Epoch 2, Batch 30, train loss 0.017117992043495178
Ep

In [58]:
def eval_model(model, test_df):
    if train_on_gpu:
        model = model.cuda()
        
    model.eval()
    num_correct = 0
    total = 0
    for i, (sentences, labels) in enumerate(get_batches(test_df)):
        if train_on_gpu:
            labels = labels.cuda()
            
        out = model(sentences)
        _, pred = torch.max(out, 1)
                
        correct = (labels == pred)
        correct = correct.cpu().numpy() if train_on_gpu else correct.numpy()
        
        num_correct += np.sum(correct)
        total += labels.size(0)
        
        store_embeddings(sentences, 'cpu')
        

    print(f"{num_correct}/{total} correct. Accuracy: {num_correct*100/total} %")
    
    
eval_model(classifier, test_df)

216/231 correct. Accuracy: 93.50649350649351 %
