In [1]:
import pandas as pd
import os

df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])
df['available'] = 0

all_images_path = 'data/all_images'
for i, row in df.iterrows():
    if os.path.isfile(os.path.join(all_images_path, row['_id'] + '.jpg')):
        df.at[i, 'available']= 1    
        
df_published = df.loc[df.query('available == 1 and published == 1').index]
df_published['text'] = df_published['image_concept'] + ' ' + df_published['message']
df_published = df_published.loc[df_published['text'].notnull()]

published_count = len(df_published)

df_disabled = df.loc[df.query('available == 1 and disabled == 1').index]
df_disabled['text'] = df_disabled['image_concept'] + ' ' + df_disabled['message']
df_disabled = df_disabled.loc[df_disabled['text'].notnull()]


df_disabled = df_disabled[:published_count]

print(f"published {len(df_published)}, disabled {len(df_disabled)}")

df_all = pd.concat([df_published, df_disabled], ignore_index=True)

df_all = df_all.reset_index(drop=True)

df_all

published 720, disabled 720


Unnamed: 0,_id,message,image_concept,published,disabled,available,text
0,5e5836fee917e8d9a8a7b277,endless blues greatbarrierreef australia whits...,seascape water shoal sea turquoise sun tropica...,1,0,1,seascape water shoal sea turquoise sun tropica...
1,5e58343ded065ad79e312f3d,hamiltonisland,tree travel vacation seashore water hotel isla...,1,0,1,tree travel vacation seashore water hotel isla...
2,5e57dc939e88b6be2ac42800,we are going coconuts for hamiltonisland here ...,relaxation beach sea vacation sand recreation ...,1,0,1,relaxation beach sea vacation sand recreation ...
3,5e55dca437fa5927dcdf02f3,en route to gbr embrace the elevation in luxur...,nature travel diving water sea underwater ocea...,1,0,1,nature travel diving water sea underwater ocea...
4,5e55d69eb9e5b725cd7ba02f,golf course views hamiltonislandgolfcourse whi...,outdoors landscape beach sky nature rural nope...,1,0,1,outdoors landscape beach sky nature rural nope...
...,...,...,...,...,...,...,...
1435,5e4e3124497f22be9069f067,golf trips with the boys are always wicked and...,sky water seashore sea travel winter ship land...,0,1,1,sky water seashore sea travel winter ship land...
1436,5e4e3124ffb21abead202386,golf trips with the boys are always wicked and...,travel golf ocean grass water sand nature sea ...,0,1,1,travel golf ocean grass water sand nature sea ...
1437,5e4e3124ffb21abead202385,throwback to that time i was warm and tanned q...,watercraft water people noperson recreation se...,0,1,1,watercraft water people noperson recreation se...
1438,5e4e3123164d73be9b8cd43e,golf trips with the boys are always wicked and...,adult people class girl grouptogether portrait...,0,1,1,adult people class girl grouptogether portrait...


In [2]:
from sklearn.model_selection import train_test_split

train_df, validation_df = train_test_split(df_all, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(validation_df, test_size=0.4, random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"train {len(train_df)}, val {len(validation_df)}, test {len(test_df)}")

train 864, val 345, test 231


In [3]:
from flair.embeddings import (
    Sentence, 
    WordEmbeddings, 
    FlairEmbeddings, 
    StackedEmbeddings, 
    DocumentRNNEmbeddings,
    BytePairEmbeddings
)
from flair.training_utils import store_embeddings


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np

train_on_gpu = torch.cuda.is_available()

def get_batches(df, batch_size=16):
    n_batches = len(df)//batch_size    
    for i in range(0, len(df), batch_size):
        x = [Sentence(txt) for txt in df[i:i+batch_size]['text']]
        y = [1 if label else 0 for label in df[i:i+batch_size]['published']]
        yield x, torch.FloatTensor(y)
    
    
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.df = df
        self.document_embeddings = DocumentRNNEmbeddings([
            WordEmbeddings('twitter'),
#            BytePairEmbeddings('en')
#             FlairEmbeddings('news-forward'),
#             FlairEmbeddings('news-backward')
        ], hidden_size=128)
        self.fc = nn.Linear(128, 1)
        self.sig = nn.Sigmoid()
        
    def forward(self, sentences):
        self.document_embeddings.embed(sentences)
        
        text_embedding_list = [
            s.embedding.unsqueeze(0) for s in sentences
        ]
        
        text_embedding_tensor = torch.cat(text_embedding_list, 0).cuda()
        
        out = self.sig(self.fc(text_embedding_tensor))
        
        return out
    

batch_size = 16

model = MyModel()
print(model)

MyModel(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('twitter')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=100, bias=True)
    (rnn): GRU(100, 128, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [5]:
def train_model(model, epochs, lr, train_df, val_df, checkpoint_file, early_stopping=5):        
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = np.inf
    no_improvement = 0

    if train_on_gpu:
        model = model.cuda()
        
    for epoch in range(epochs):        
        total_train_loss = 0
        total_val_loss = 0
        train_loss = 0
        val_loss = 0
        
        # Train
        model.train()        
        for i, (sentences, labels) in enumerate(get_batches(train_df)):         
            if train_on_gpu:
                labels = labels.cuda()
            
            optimizer.zero_grad()
            
            out = model(sentences)
            loss = criterion(out.squeeze(), labels)
            loss.backward()
            
            nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
            
            total_train_loss += loss.item()
            
            store_embeddings(sentences, 'cpu')
            
            if i % 10 == 0:
                print(f"Epoch {epoch}, Batch {i}, train loss {loss.item()/labels.size(0)}")
            
            
        train_loss = total_train_loss/len(train_df)
        print(f"> Epoch {epoch}, train loss {train_loss}")
        
        # Eval
        model.eval()
        for sentences, labels in get_batches(val_df):
            if train_on_gpu:
                labels = labels.cuda()
            
            out = model(sentences)
            loss = criterion(out.squeeze(), labels)
            total_val_loss += loss.item()
            
            store_embeddings(sentences, 'cpu')
            
            
        val_loss = total_val_loss / len(val_df)
        
        print(f"> Epoch {epoch}, val loss {val_loss}")
        
        if val_loss < best_loss:
            best_loss = val_loss
            no_improvement = 0
            torch.save(model.state_dict(), checkpoint_file)
            print("Saved model.")
        else:
            no_improvement += 1
            print("No improvement.")
            if no_improvement >= early_stopping:
                print(f"Early Stopping")
                break
            
                                              
checkpoint_file = 'flair_text_model_custom.pt'      
lr = 0.005
epochs = 5            

train_model(model, epochs, lr, train_df, validation_df, checkpoint_file)



Epoch 0, Batch 0, train loss 0.041444361209869385
Epoch 0, Batch 10, train loss 0.052553366869688034
Epoch 0, Batch 20, train loss 0.04126397520303726
Epoch 0, Batch 30, train loss 0.04194075986742973
Epoch 0, Batch 40, train loss 0.05388794094324112
Epoch 0, Batch 50, train loss 0.04926455020904541
> Epoch 0, train loss 0.04331002122274152
> Epoch 0, val loss 0.04157462310099947
Saved model.
Epoch 1, Batch 0, train loss 0.04449247568845749
Epoch 1, Batch 10, train loss 0.04590047150850296
Epoch 1, Batch 20, train loss 0.04555549472570419
Epoch 1, Batch 30, train loss 0.045813318341970444
Epoch 1, Batch 40, train loss 0.043537646532058716
Epoch 1, Batch 50, train loss 0.040530726313591
> Epoch 1, train loss 0.04139460802630142
> Epoch 1, val loss 0.040946247612220654
Saved model.
Epoch 2, Batch 0, train loss 0.042425207793712616
Epoch 2, Batch 10, train loss 0.0397714227437973
Epoch 2, Batch 20, train loss 0.03969942778348923
Epoch 2, Batch 30, train loss 0.050785575062036514
Epoch 2, 

In [6]:
def eval_model(model, test_df):
    if train_on_gpu:
        model = model.cuda()
        
    model.eval()
    num_correct = 0
    for i, (sentences, labels) in enumerate(get_batches(test_df)):
        if train_on_gpu:
            labels = labels.cuda()
            
        out = model(sentences)
        pred = torch.round(out.squeeze())
        correct = (pred == labels)
        correct = correct.cpu().numpy() if train_on_gpu else correct.numpy()
        
        num_correct += np.sum(correct)
        
        store_embeddings(sentences, 'cpu')
        
    total = len(test_df)
    print(f"{num_correct}/{total} correct. Accuracy: {num_correct*100/total} %")
    
    
best_model = MyModel()
best_model.load_state_dict(torch.load(checkpoint_file))
eval_model(best_model, test_df)


208/231 correct. Accuracy: 90.04329004329004 %
