In [5]:
import pandas as pd
import os

df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])
df['available'] = 0

all_images_path = 'data/all_images'
for i, row in df.iterrows():
    if os.path.isfile(os.path.join(all_images_path, row['_id'] + '.jpg')):
        df.at[i, 'available']= 1    
        
df_published = df.loc[df.query('available == 1 and published == 1').index]
published_count = len(df_published)

print(f'published_count {published_count}')
df_disabled = df.loc[df.query('available == 1 and disabled == 1').index]
df_disabled = df_disabled[:published_count]

df_all = pd.concat([df_published, df_disabled], ignore_index=True)
df_all = df_all.reset_index(drop=True)
df_all

published_count 2658


Unnamed: 0,_id,message,image_concept,published,disabled,available
0,5e5836fee917e8d9a8a7b277,endless blues greatbarrierreef australia whits...,seascape water shoal sea turquoise sun tropica...,1,0,1
1,5e58343ded065ad79e312f3d,hamiltonisland,tree travel vacation seashore water hotel isla...,1,0,1
2,5e57dc939e88b6be2ac42800,we are going coconuts for hamiltonisland here ...,relaxation beach sea vacation sand recreation ...,1,0,1
3,5e55dca437fa5927dcdf02f3,en route to gbr embrace the elevation in luxur...,nature travel diving water sea underwater ocea...,1,0,1
4,5e55d69eb9e5b725cd7ba02f,golf course views hamiltonislandgolfcourse whi...,outdoors landscape beach sky nature rural nope...,1,0,1
...,...,...,...,...,...,...
5311,5e3eb4abf0227ce8e168c7d4,insane bonuses with fm fast start bonuses up t...,car automobile vehicle transportation human pe...,0,1,1
5312,5e3ea825870465e4bf3a1deb,ciropicariello chibevefianovasanoevalontano gr...,apparel clothing water human person outdoors v...,0,1,1
5313,5e3ea823ee8283e4d618f6f4,ciropicariello chibevefianovasanoevalontano gr...,human person clothing apparel vessel transport...,0,1,1
5314,5e3ea503ee8283e4d618f1fc,y u me bigfella 006 mrandmrsbond husbandandwif...,person face human smile female girl teen blond...,0,1,1


In [6]:
from sklearn.model_selection import train_test_split

train_df, validation_df = train_test_split(df_all, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(validation_df, test_size=0.4, random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


In [7]:
print(f"train df: {len(train_df)}, published {len(train_df.loc[train_df['published'] == 1])}")

print(f"val df: {len(validation_df)}, published {len(validation_df.loc[validation_df['published'] == 1])}")

print(f"test df: {len(test_df)}, published {len(test_df.loc[test_df['published'] == 1])}")


train df: 3189, published 1568
val df: 1276, published 662
test df: 851, published 428


In [10]:
import pickle
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

train_on_gpu = torch.cuda.is_available()


class MyDataset(Dataset):
    def __init__(self, df, id_to_text_features, id_to_image_features):
        super().__init__()
        self.df = df
        self.id_to_text_features = id_to_text_features
        self.id_to_image_features = id_to_image_features
        
    def __getitem__(self, index):
        _id = self.df.loc[index]['_id']
        label = 1 if self.df.loc[index]['published'] else 0
        text_features = self.id_to_text_features[_id]
        image_features = self.id_to_image_features[_id]
        
        features = np.concatenate([text_features, image_features])
        
        return features, label
    
    def __len__(self):
        return len(self.df)
        

id_to_image_features_file = 'flair_vgg16_image_features.pkl'
id_to_text_features_file = 'flair_vgg16_text_features_twitter.pkl'
id_to_image_features = None
id_to_text_features = None
with open(id_to_image_features_file, 'rb') as f:
    id_to_image_features = pickle.load(f)    
with open(id_to_text_features_file, 'rb') as f:
    id_to_text_features = pickle.load(f)


        
batch_size = 16
train_dataset = MyDataset(train_df, id_to_text_features, id_to_image_features)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = MyDataset(validation_df, id_to_text_features, id_to_image_features)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True )

test_dataset = MyDataset(test_df, id_to_text_features, id_to_image_features)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True )

                    
class Simple(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(29284, 1)
        #self.fc2 = nn.Linear(4096, 1)
        self.dropout = nn.Dropout(p=0.2)
        self.sig = nn.Sigmoid()

        nn.init.xavier_uniform(self.fc1.weight)
        #nn.init.xavier_uniform(self.fc2.weight)
        
        
    def forward(self, x):
        x = self.dropout(x)        
        x = self.sig(self.fc1(x))        
#         x = self.dropout(x)        
#         x = self.sig(self.fc2(x))
        return x

def train_model(model, epochs, lr, train_dataloader, val_dataloader, checkpoint_file, early_stopping=5,):        
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = np.inf
    no_improvement = 0

    if train_on_gpu:
        model = model.cuda()
        
    for epoch in range(epochs):        
        total_train_loss = 0
        total_val_loss = 0
        train_loss = 0
        val_loss = 0
        
        # Train
        model.train()        
        for i, (features, labels) in enumerate(train_dataloader):
            if train_on_gpu:
                features, labels = features.cuda(), labels.cuda()

            optimizer.zero_grad()
            
            out = model(features)
            loss = criterion(out.squeeze(), labels.float())
            loss.backward()
            
            optimizer.step()
            
            total_train_loss += loss.item()
            
            if i % 100 == 0:
                print(f"Epoch {epoch}, Batch {i}, train loss {loss.item()/labels.size(0)}")
            
            
        train_loss = total_train_loss/len(train_dataloader.dataset)
        print(f"> Epoch {epoch}, train loss {train_loss}")
        
        # Eval
        model.eval()
        for features, labels in val_dataloader:
            if train_on_gpu:
                features, labels = features.cuda(), labels.cuda()
                
            out = model(features)
            loss = criterion(out.squeeze(), labels.float())
            total_val_loss += loss.item()
            
            
        val_loss = total_val_loss / len(val_dataloader.dataset)
        
        print(f"> Epoch {epoch}, val loss {val_loss}")
        
        if val_loss < best_loss:
            best_loss = val_loss
            no_improvement = 0
            torch.save(model.state_dict(), checkpoint_file)
            print("Saved model.")
        else:
            no_improvement += 1
            print("No improvement.")
            if no_improvement >= early_stopping:
                print(f"Early Stopping")
                break
            
                                              
checkpoint_file = 'flair_vgg16_simple.pt'      
lr = 0.0001
epochs = 100            
model = Simple()
print(model)

train_model(model, epochs, lr, train_dataloader, val_dataloader, checkpoint_file)
        



Simple(
  (fc1): Linear(in_features=29284, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sig): Sigmoid()
)
Epoch 0, Batch 0, train loss 0.049432702362537384
Epoch 0, Batch 100, train loss 0.055464982986450195
> Epoch 0, train loss 0.04782920747799124
> Epoch 0, val loss 0.0419798557147337
Saved model.
Epoch 1, Batch 0, train loss 0.04510163143277168
Epoch 1, Batch 100, train loss 0.04410645738244057
> Epoch 1, train loss 0.03275674217526312
> Epoch 1, val loss 0.04148116056167967
Saved model.
Epoch 2, Batch 0, train loss 0.021382298320531845
Epoch 2, Batch 100, train loss 0.021304506808519363
> Epoch 2, train loss 0.027314025888849704
> Epoch 2, val loss 0.04057943519660298
Saved model.
Epoch 3, Batch 0, train loss 0.023289240896701813
Epoch 3, Batch 100, train loss 0.03841153532266617
> Epoch 3, train loss 0.022103662986054845
> Epoch 3, val loss 0.04133205258267053
No improvement.
Epoch 4, Batch 0, train loss 0.0075067440047860146
Epoch 4, Batch 100, train 

In [11]:
def eval_model(model, test_dataloader):
    if train_on_gpu:
        model = model.cuda()
        
    model.eval()
    num_correct = 0
    for i, (features, labels) in enumerate(test_dataloader):
        if train_on_gpu:
            features, labels = features.cuda(), labels.cuda()
            
        out = model(features)
        pred = torch.round(out.squeeze())
        correct = (pred == labels)
        correct = correct.cpu().numpy() if train_on_gpu else correct.numpy()
        
        num_correct += np.sum(correct)
        
    total = len(test_dataloader.dataset)
    print(f"{num_correct}/{total} correct. Accuracy: {num_correct*100/total} %")
    
    
best_model = Simple()
best_model.load_state_dict(torch.load(checkpoint_file))
eval_model(best_model, test_dataloader)
            



605/851 correct. Accuracy: 71.09283196239718 %
