In [1]:
import pandas as pd
import os

df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])
df['available'] = 0

all_images_path = 'data/all_images'
for i, row in df.iterrows():
    if os.path.isfile(os.path.join(all_images_path, row['_id'] + '.jpg')):
        df.at[i, 'available']= 1    
        
df_published = df.loc[df.query('available == 1 and published == 1').index]
df_disabled = df.loc[df.query('available == 1 and disabled == 1').index]

df_all = pd.concat([df_published, df_disabled], ignore_index=True)
df_all

Unnamed: 0,_id,message,image_concept,published,disabled,available
0,5e5836fee917e8d9a8a7b277,endless blues greatbarrierreef australia whits...,seascape water shoal sea turquoise sun tropica...,1,0,1
1,5e58343ded065ad79e312f3d,hamiltonisland,tree travel vacation seashore water hotel isla...,1,0,1
2,5e57dc939e88b6be2ac42800,we are going coconuts for hamiltonisland here ...,relaxation beach sea vacation sand recreation ...,1,0,1
3,5e55dca437fa5927dcdf02f3,en route to gbr embrace the elevation in luxur...,nature travel diving water sea underwater ocea...,1,0,1
4,5e55d69eb9e5b725cd7ba02f,golf course views hamiltonislandgolfcourse whi...,outdoors landscape beach sky nature rural nope...,1,0,1
...,...,...,...,...,...,...
8052,5e253779f1b8d48ba5de7d32,colours so bright they hurt your eyes tropical...,outdoors nature scenery landscape water land o...,0,1,1
8053,5e252d334610948976f731e5,호 주 학 생 비 자 치 료 마 사 지 과 정 치 료 마 사 지 과 정 은 마 사 ...,human person patient therapy massage heel spa,0,1,1
8054,5e252d334610948976f731e6,호 주 학 생 비 자 치 료 마 사 지 과 정 치 료 마 사 지 과 정 은 마 사 ...,plant paper text flower blossom,0,1,1
8055,5e252d3342307c89757703c0,호 주 학 생 비 자 치 료 마 사 지 과 정 치 료 마 사 지 과 정 은 마 사 ...,person human finger hand dating face arm,0,1,1


In [2]:
from sklearn.model_selection import train_test_split

train_df, validation_df = train_test_split(df_all, test_size=0.4, random_state=42)
validation_df, test_df = train_test_split(validation_df, test_size=0.4, random_state=42)

train_df = train_df.reset_index(drop=True)
validation_df = validation_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)


In [3]:
print(f"train df: {len(train_df)}, published {len(train_df.loc[train_df['published'] == 1])}")

print(f"val df: {len(validation_df)}, published {len(validation_df.loc[validation_df['published'] == 1])}")

print(f"test df: {len(test_df)}, published {len(test_df.loc[test_df['published'] == 1])}")


train df: 4834, published 1580
val df: 1933, published 647
test df: 1290, published 431


In [4]:
import pickle
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

train_on_gpu = torch.cuda.is_available()


class MyDataset(Dataset):
    def __init__(self, df, id_to_text_features, id_to_image_features):
        super().__init__()
        self.df = df
        self.id_to_text_features = id_to_text_features
        self.id_to_image_features = id_to_image_features
        
    def __getitem__(self, index):
        _id = self.df.loc[index]['_id']
        label = 1 if self.df.loc[index]['published'] else 0
        text_features = self.id_to_text_features[_id]
        image_features = self.id_to_image_features[_id]
        
        features = np.concatenate([text_features, image_features])
        
        return features, label
    
    def __len__(self):
        return len(self.df)
        

id_to_image_features_file = 'flair_vgg16_image_features.pkl'
id_to_text_features_file = 'flair_vgg16_text_features_twitter.pkl'
id_to_image_features = None
id_to_text_features = None
with open(id_to_image_features_file, 'rb') as f:
    id_to_image_features = pickle.load(f)    
with open(id_to_text_features_file, 'rb') as f:
    id_to_text_features = pickle.load(f)


        
batch_size = 16
train_dataset = MyDataset(train_df, id_to_text_features, id_to_image_features)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = MyDataset(validation_df, id_to_text_features, id_to_image_features)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True )

test_dataset = MyDataset(test_df, id_to_text_features, id_to_image_features)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True )

                    
class Simple(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(29284, 1)
        #self.fc2 = nn.Linear(4096, 1)
        self.dropout = nn.Dropout(p=0.2)
        self.sig = nn.Sigmoid()

        nn.init.xavier_uniform(self.fc1.weight)
        #nn.init.xavier_uniform(self.fc2.weight)
        
        
    def forward(self, x):
        x = self.dropout(x)        
        x = self.sig(self.fc1(x))        
#         x = self.dropout(x)        
#         x = self.sig(self.fc2(x))
        return x

def train_model(model, epochs, lr, train_dataloader, val_dataloader, checkpoint_file, early_stopping=5,):        
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_loss = np.inf
    no_improvement = 0

    if train_on_gpu:
        model = model.cuda()
        
    for epoch in range(epochs):        
        total_train_loss = 0
        total_val_loss = 0
        train_loss = 0
        val_loss = 0
        
        # Train
        model.train()        
        for i, (features, labels) in enumerate(train_dataloader):
            if train_on_gpu:
                features, labels = features.cuda(), labels.cuda()

            optimizer.zero_grad()
            
            out = model(features)
            loss = criterion(out.squeeze(), labels.float())
            loss.backward()
            
            optimizer.step()
            
            total_train_loss += loss.item()
            
            if i % 100 == 0:
                print(f"Epoch {epoch}, Batch {i}, train loss {loss.item()/labels.size(0)}")
            
            
        train_loss = total_train_loss/len(train_dataloader.dataset)
        print(f"> Epoch {epoch}, train loss {train_loss}")
        
        # Eval
        model.eval()
        for features, labels in val_dataloader:
            if train_on_gpu:
                features, labels = features.cuda(), labels.cuda()
                
            out = model(features)
            loss = criterion(out.squeeze(), labels.float())
            total_val_loss += loss.item()
            
            
        val_loss = total_val_loss / len(val_dataloader.dataset)
        
        print(f"> Epoch {epoch}, val loss {val_loss}")
        
        if val_loss < best_loss:
            best_loss = val_loss
            no_improvement = 0
            torch.save(model.state_dict(), checkpoint_file)
            print("Saved model.")
        else:
            no_improvement += 1
            print("No improvement.")
            if no_improvement >= early_stopping:
                print(f"Early Stopping")
                break
            
                                              
checkpoint_file = 'flair_vgg16_simple.pt'      
lr = 0.00005
epochs = 100            
model = Simple()
print(model)

train_model(model, epochs, lr, train_dataloader, val_dataloader, checkpoint_file)
        



Simple(
  (fc1): Linear(in_features=29284, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (sig): Sigmoid()
)
Epoch 0, Batch 0, train loss 0.05642422288656235
Epoch 0, Batch 100, train loss 0.053428493440151215
Epoch 0, Batch 200, train loss 0.06526434421539307
Epoch 0, Batch 300, train loss 0.04304591938853264
> Epoch 0, train loss 0.04537547463886393
> Epoch 0, val loss 0.03918850165972485
Saved model.
Epoch 1, Batch 0, train loss 0.03552984818816185
Epoch 1, Batch 100, train loss 0.015599235892295837
Epoch 1, Batch 200, train loss 0.02722085639834404
Epoch 1, Batch 300, train loss 0.030021648854017258
> Epoch 1, train loss 0.03623483657997208
> Epoch 1, val loss 0.03765777837286573
Saved model.
Epoch 2, Batch 0, train loss 0.024481691420078278
Epoch 2, Batch 100, train loss 0.017380662262439728
Epoch 2, Batch 200, train loss 0.019952483475208282
Epoch 2, Batch 300, train loss 0.040174953639507294
> Epoch 2, train loss 0.030815064604981752
> Epoch 2, val loss 

In [7]:
def eval_model(model, test_dataloader):
    if train_on_gpu:
        model = model.cuda()
        
    model.eval()
    num_correct = 0
    for i, (features, labels) in enumerate(test_dataloader):
        if train_on_gpu:
            features, labels = features.cuda(), labels.cuda()
            
        out = model(features)
        pred = torch.round(out.squeeze())
        correct = (pred == labels)
        correct = correct.cpu().numpy() if train_on_gpu else correct.numpy()
        
        num_correct += np.sum(correct)
        
    total = len(test_dataloader.dataset)
    print(f"{num_correct}/{total} correct. Accuracy: {num_correct*100/total} %")
    
    
best_model = Simple()
best_model.load_state_dict(torch.load(checkpoint_file))
eval_model(best_model, test_dataloader)
            



936/1290 correct. Accuracy: 72.55813953488372 %
