In [None]:
import pandas as pd
import os

# Column names are supplied explicitly via `names`, so the first line of the
# file is read as data, not as a header.
df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])

# Flag each row with 1 when '<_id>.jpg' exists in the image folder, else 0.
# Formatting the id (instead of `_id + '.jpg'`) also tolerates non-string ids.
all_images_path = 'data/all_images'
df['available'] = [
    int(os.path.isfile(os.path.join(all_images_path, f'{image_id}.jpg')))
    for image_id in df['_id']
]


def _labelled_subset(frame, flag_column, label):
    """Return the available rows of `frame` whose `flag_column` equals 1.

    The result is a copy with two extra columns: `label` (the given constant)
    and `text` (`image_concept`, a space, then `message`). Rows whose `text`
    is null - i.e. either source column is missing - are dropped.
    """
    subset = frame.query(f'available == 1 and {flag_column} == 1').copy()
    subset['label'] = label
    subset['text'] = subset['image_concept'] + ' ' + subset['message']
    return subset.loc[subset['text'].notnull()]


df_published = _labelled_subset(df, 'published', '__label__published')
published_count = len(df_published)

# Keep at most as many disabled rows as there are published rows
# (the first ones in file order).
df_disabled = _labelled_subset(df, 'disabled', '__label__disabled')[:published_count]

# ignore_index=True already produces a fresh 0..n-1 index, so no extra
# reset_index is needed afterwards.
df_all = pd.concat([df_published, df_disabled], ignore_index=True)

df_all

In [None]:
# Preview only the two columns that are later written to the split files.
df_all[['label', 'text']]

In [None]:
#df_all.to_csv('docker/local_test/data.csv', header=False, index=False)

In [None]:
# df_docker = pd.read_csv('docker/local_test/data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled', 'available', 'label', 'text'])
# df_docker

In [None]:
from sklearn.model_selection import train_test_split

# First split: 60 % train / 40 % held out. The held-out part is then split
# again 60 / 40 into validation (dev) and test. Fixed seeds keep the splits
# reproducible.
train_df, validation_df = train_test_split(df_all, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(validation_df, test_size=0.4, random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_csv = 'flair_classification_data/train.csv'
dev_csv = 'flair_classification_data/dev.csv'
test_csv = 'flair_classification_data/test.csv'

# DataFrame.to_csv does not create missing directories; make sure the target
# folder exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(train_csv), exist_ok=True)

# Write each split as header-less, tab-separated "label<TAB>text" lines.
for split_df, split_csv in ((train_df, train_csv), (validation_df, dev_csv), (test_df, test_csv)):
    split_df[['label', 'text']].to_csv(split_csv, sep='\t', index=False, header=False)


In [None]:
# Read the exported training split back in to inspect what was written.
train_df2 = pd.read_csv(
    'flair_classification_data/train.csv',
    names=['label', 'text'],
    sep='\t',
)
train_df2

In [None]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path


In [None]:
# Load the three exported split files from the data folder as one flair corpus.
data_folder = Path('flair_classification_data')
corpus = NLPTaskDataFetcher.load_classification_corpus(
    data_folder,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

In [None]:
# Build the label dictionary from the corpus (only inspected in the next cell).
label_dict = corpus.make_label_dictionary()

In [None]:
# Display the label dictionary derived from the corpus.
label_dict

In [None]:
#corpus.make_vocab_dictionary().get_items()

In [None]:
# Token-level embeddings fed into the document RNN. Only the 'twitter' word
# vectors are active; the two Flair language-model embeddings stay disabled.
token_embeddings = [
    WordEmbeddings('twitter'),
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

document_embeddings = DocumentRNNEmbeddings(
    token_embeddings,
    hidden_size=128,
    reproject_words=True,
    reproject_words_dimension=128,
)

In [None]:
# Sentence is defined in flair.data; importing it from there does not depend
# on flair.embeddings re-exporting the name.
from flair.data import Sentence

sentence1 = Sentence('The grass is green . And the sky is blue .')

sentence2 = Sentence("""It accounts for virtually all discussion in the media, enjoying priority over such topics as the 2020 US presidential election or the UK finally leaving the EU for good in less than 9 months. People are flooding social media with COVID information, which can only mean one thing: data. Fresh data waiting to be analysed. And analyse it we will.""")

# Embed a short and a long sentence with the document embedding and print the
# shape of each resulting embedding.
for sentence in (sentence1, sentence2):
    document_embeddings.embed(sentence)
    print(sentence.get_embedding().shape)



In [None]:
from flair.data import Dictionary

# TextClassifier expects a flair Dictionary for its labels, not a plain list.
# Items are added in the same order as the target columns used by the custom
# training loop, so output index 0 is 'published' and index 1 is 'disabled'.
label_dictionary = Dictionary(add_unk=False)
for label_name in ['published', 'disabled']:
    label_dictionary.add_item(label_name)

classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=True
)

print(classifier)

trainer = ModelTrainer(classifier, corpus)

In [None]:
#trainer.train('./', max_epochs=20, learning_rate=0.1)

In [None]:
# Smoke test: call the classifier directly on a single sentence and display
# whatever it returns.
classifier([Sentence("this is a sentence")])



In [None]:
import torch
import torch.nn as nn
import numpy as np
import random
from flair.training_utils import store_embeddings
# True when torch reports an available CUDA device; the training and
# evaluation functions below branch on this flag.
train_on_gpu = torch.cuda.is_available()

# Sanity check: shape of the tensor built from the target columns of the
# first ten training rows.
target_preview = train_df[:10][['published', 'disabled']]
torch.from_numpy(target_preview.values).shape

In [None]:
def get_batches(df, target_names, batch_size=16):
    """Yield shuffled mini-batches of sentences and their target values.

    The rows of `df` are shuffled once per call. For every row the words of
    `image_concept` are shuffled as well, then `message` and the shuffled
    concepts are joined with a space and wrapped in a `Sentence`. Missing
    `message` / `image_concept` values are treated as empty strings.

    Args:
        df: DataFrame with `message` and `image_concept` columns plus every
            column named in `target_names`.
        target_names: column names whose values form the target vector of a
            row, in that order.
        batch_size: maximum number of rows per batch; the last batch may be
            smaller.

    Yields:
        Tuples `(sentences, targets)` where `sentences` is a list of
        `Sentence` objects and `targets` is a `torch.long` tensor with one
        row per sentence and one column per target name.
    """
    df = df.sample(frac=1).reset_index(drop=True)
    for start in range(0, len(df), batch_size):
        x = []
        y = []
        # DataFrame.iterrows() yields (index, row) pairs; only the row is used.
        for _, row in df[start:start + batch_size].iterrows():
            image_concept = '' if pd.isna(row['image_concept']) else str(row['image_concept'])
            message = '' if pd.isna(row['message']) else str(row['message'])

            # shuffle image concepts
            words = image_concept.split()
            random.shuffle(words)
            image_concept = ' '.join(words)

            # join message and image_concept together
            txt = ' '.join([message, image_concept])
            x.append(Sentence(txt))
            y.append([row[t] for t in target_names])

        yield x, torch.tensor(y, dtype=torch.long)


def train_model(model, epochs, lr, train_df, val_df, target_names, checkpoint_file, early_stopping=5):
    """Train `model` with Adam and BCE-with-logits, checkpointing the best epoch.

    Each epoch runs one pass over `train_df` and one pass over `val_df`, both
    batched by `get_batches`. The model's state dict is written to
    `checkpoint_file` whenever the validation loss improves; training stops
    early after `early_stopping` consecutive epochs without improvement.
    Progress is reported with `print`.

    Args:
        model: module called as `model(sentences)` and expected to return one
            logit per target for every sentence.
        epochs: maximum number of epochs.
        lr: learning rate for the Adam optimizer.
        train_df: DataFrame with the training rows.
        val_df: DataFrame with the validation rows.
        target_names: target column names, forwarded to `get_batches`.
        checkpoint_file: path the best state dict is saved to.
        early_stopping: number of non-improving epochs tolerated before
            stopping.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = np.inf
    no_improvement = 0

    if train_on_gpu:
        model = model.cuda()

    for epoch in range(epochs):
        total_train_loss = 0.0
        total_val_loss = 0.0

        # Train
        model.train()
        for i, (sentences, labels) in enumerate(get_batches(train_df, target_names)):
            # BCEWithLogitsLoss needs floating-point targets; the batches
            # carry integer labels.
            labels = labels.float()
            if train_on_gpu:
                labels = labels.cuda()

            optimizer.zero_grad()

            out = model(sentences)

            loss = criterion(out, labels)
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

            # The criterion already averages over the batch, so weight it by
            # the batch size; dividing by the row count below then gives the
            # true mean loss of the epoch.
            total_train_loss += loss.item() * labels.size(0)

            store_embeddings(sentences, 'cpu')

            if i % 10 == 0:
                print(f"Epoch {epoch}, Batch {i}, train loss {loss.item()}")

        train_loss = total_train_loss / max(len(train_df), 1)
        print(f"> Epoch {epoch}, train loss {train_loss}")

        # Eval - no gradients are needed for the validation pass.
        model.eval()
        with torch.no_grad():
            for sentences, labels in get_batches(val_df, target_names):
                labels = labels.float()
                if train_on_gpu:
                    labels = labels.cuda()

                out = model(sentences)
                loss = criterion(out, labels)
                total_val_loss += loss.item() * labels.size(0)

                store_embeddings(sentences, 'cpu')

        val_loss = total_val_loss / max(len(val_df), 1)

        print(f"> Epoch {epoch}, val loss {val_loss}")

        if val_loss < best_loss:
            best_loss = val_loss
            no_improvement = 0
            torch.save(model.state_dict(), checkpoint_file)
            print("Saved model.")
        else:
            no_improvement += 1
            print("No improvement.")
            if no_improvement >= early_stopping:
                print(f"Early Stopping")
                break
            
                                              
# Settings for the custom training loop.
checkpoint_file = 'flair_text_model_2.pt'
epochs = 5
lr = 0.005

# Target columns, in the order the classifier outputs are interpreted.
target_names = ['published', 'disabled']

train_model(
    classifier,
    epochs,
    lr,
    train_df,
    validation_df,
    target_names,
    checkpoint_file,
)

In [None]:
def eval_model(model, test_df, target_names=('published', 'disabled')):
    """Print the classification accuracy of `model` on `test_df`.

    The predicted class of a sentence is the index of its largest model
    output; the true class is the index of the largest value among the
    sentence's target columns.

    Args:
        model: module called as `model(sentences)` and expected to return one
            score per target for every sentence.
        test_df: DataFrame with the evaluation rows.
        target_names: target column names forwarded to `get_batches`; the
            default matches the targets used for training in this notebook.
    """
    if train_on_gpu:
        model = model.cuda()

    model.eval()
    num_correct = 0
    total = 0
    # Evaluation needs no gradients.
    with torch.no_grad():
        for sentences, labels in get_batches(test_df, list(target_names)):
            if train_on_gpu:
                labels = labels.cuda()

            out = model(sentences)
            _, pred = torch.max(out, 1)

            # `labels` has one column per target while `pred` holds one class
            # index per sentence, so reduce the labels to class indices first.
            # NOTE(review): assumes exactly one target is set per row - confirm.
            truth = labels.argmax(dim=1)

            num_correct += (pred == truth).sum().item()
            total += labels.size(0)

            store_embeddings(sentences, 'cpu')

    if total == 0:
        # Avoid a division by zero when there is nothing to evaluate.
        print("0/0 correct. Accuracy: n/a (no test samples)")
        return

    print(f"{num_correct}/{total} correct. Accuracy: {num_correct*100/total} %")
    
    
# Print the classifier's accuracy on the held-out test split.
eval_model(classifier, test_df)