In [1]:
import pandas as pd
import os

# Load the post metadata; the column names are supplied explicitly via `names`.
df = pd.read_csv('flair-vgg16-data.csv', names=['_id', 'message', 'image_concept', 'published', 'disabled'])

# Directory expected to contain one '<_id>.jpg' file per row of `df`.
all_images_path = 'data/all_images'

# Flag every row with 1 when its image file exists on disk and 0 otherwise.
# A single vectorised `map` replaces the previous iterrows()/df.at loop, which
# performed one slow scalar write per row for the same result.
df['available'] = (
    df['_id']
    .map(lambda _id: os.path.isfile(os.path.join(all_images_path, _id + '.jpg')))
    .astype(int)
)

In [2]:
# Keep only rows whose image is on disk, split by the `published` and
# `disabled` flags. `query` already returns the matching rows, so there is no
# need to go back through `df.loc[...index]` (which would also return extra
# rows if the index ever contained duplicate labels).
df_published = df.query('available == 1 and published == 1')
df_disabled = df.query('available == 1 and disabled == 1')

# Stack published rows first, then disabled rows, under a fresh 0..n-1 index.
# NOTE(review): a row with both published == 1 and disabled == 1 would appear
# twice in df_all -- confirm the two flags are mutually exclusive in the data.
df_all = pd.concat([df_published, df_disabled], ignore_index=True)
df_all

Unnamed: 0,_id,message,image_concept,published,disabled,available
0,5e5836fee917e8d9a8a7b277,endless blues greatbarrierreef australia whits...,seascape water shoal sea turquoise sun tropica...,1,0,1
1,5e58343ded065ad79e312f3d,hamiltonisland,tree travel vacation seashore water hotel isla...,1,0,1
2,5e57dc939e88b6be2ac42800,we are going coconuts for hamiltonisland here ...,relaxation beach sea vacation sand recreation ...,1,0,1
3,5e55dca437fa5927dcdf02f3,en route to gbr embrace the elevation in luxur...,nature travel diving water sea underwater ocea...,1,0,1
4,5e55d69eb9e5b725cd7ba02f,golf course views hamiltonislandgolfcourse whi...,outdoors landscape beach sky nature rural nope...,1,0,1
...,...,...,...,...,...,...
8052,5e253779f1b8d48ba5de7d32,colours so bright they hurt your eyes tropical...,outdoors nature scenery landscape water land o...,0,1,1
8053,5e252d334610948976f731e5,호 주 학 생 비 자 치 료 마 사 지 과 정 치 료 마 사 지 과 정 은 마 사 ...,human person patient therapy massage heel spa,0,1,1
8054,5e252d334610948976f731e6,호 주 학 생 비 자 치 료 마 사 지 과 정 치 료 마 사 지 과 정 은 마 사 ...,plant paper text flower blossom,0,1,1
8055,5e252d3342307c89757703c0,호 주 학 생 비 자 치 료 마 사 지 과 정 치 료 마 사 지 과 정 은 마 사 ...,person human finger hand dating face arm,0,1,1


# Feature extraction

In [3]:
import torchvision
import torch
from torch import nn
import tqdm
from torchvision import transforms
import matplotlib.image as mpimg
from PIL import Image
import pickle

# True when PyTorch reports that CUDA is available. extract_image_features
# reads this flag to decide whether the model and inputs are moved to the GPU.
train_on_gpu = torch.cuda.is_available()

def get_pretrained_image_model():
    """Build a headless VGG16 to be used as an image feature extractor.

    Loads torchvision's VGG16 with pretrained weights and re-wraps all of its
    top-level child modules except the last one in an ``nn.Sequential``.
    NOTE(review): the dropped child is presumably the classifier head --
    confirm against the installed torchvision version.

    Returns:
        nn.Sequential: the truncated network.
    """
    vgg = torchvision.models.vgg16(pretrained=True)
    # Everything but the final top-level child module is kept.
    kept_modules = list(vgg.children())[:-1]
    return nn.Sequential(*kept_modules)


def extract_image_features(df_all):
    """Compute a flat feature vector for every image referenced in ``df_all``.

    For each ``_id`` the file ``<all_images_path>/<_id>.jpg`` is opened,
    converted to RGB, resized to 224x224, turned into a tensor, normalised and
    passed (as a batch of one) through the network returned by
    ``get_pretrained_image_model()``. The model output is flattened to 1-D.

    Relies on the notebook-level names ``all_images_path`` and ``train_on_gpu``.

    Args:
        df_all: DataFrame with an ``_id`` column of string identifiers.

    Returns:
        dict: maps each ``_id`` to a 1-D NumPy array with the flattened model
        output for that image. Duplicate ids keep the last computed vector.

    Raises:
        Any exception raised by ``PIL.Image.open`` when an image file is
        missing or unreadable is propagated unchanged.
    """
    device = torch.device('cuda' if train_on_gpu else 'cpu')

    pretrained_model = get_pretrained_image_model()
    # The network is used for inference only: freeze the weights and switch
    # layers such as dropout to evaluation behaviour.
    for param in pretrained_model.parameters():
        param.requires_grad = False
    pretrained_model.eval()
    pretrained_model = pretrained_model.to(device)

    # NOTE(review): these mean/std values are presumably the ImageNet channel
    # statistics expected by torchvision's pretrained models -- confirm.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transformer = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        normalize,
    ])

    id_to_features = {}
    # no_grad makes the "inference only" intent explicit and guarantees no
    # autograd bookkeeping is kept for any forward pass.
    with torch.no_grad():
        for _id in tqdm.tqdm(df_all['_id'], total=len(df_all)):
            image_path = os.path.join(all_images_path, _id + '.jpg')

            # The context manager closes the underlying file handle as soon as
            # the pixel data has been converted.
            with Image.open(image_path) as raw_img:
                img = transformer(raw_img.convert("RGB"))

            # unsqueeze(0) adds the batch dimension the model expects.
            img_rep = pretrained_model(img.unsqueeze(0).to(device))

            # flatten() alone yields the 1-D vector; a prior squeeze() is redundant.
            id_to_features[_id] = img_rep.cpu().numpy().flatten()

    return id_to_features
        
        
# Pickle file used to cache the dict returned by extract_image_features.
id_to_image_features_file = 'flair_vgg16_image_features.pkl'

# Disabled: compute the image features for df_all and write them to the cache
# file. Uncomment to (re)generate the pickle.
# id_to_image_features = extract_image_features(df_all)
# with open(id_to_image_features_file, 'wb') as f:
#     pickle.dump(id_to_image_features, f)


# Disabled: reload previously cached image features from the pickle file.
# Only unpickle files you generated yourself (pickle can execute arbitrary code).
# id_to_image_features = None
# with open(id_to_image_features_file, 'rb') as f:
#     id_to_image_features = pickle.load(f)

In [4]:
# Spot-check the message text of the row at position 171 in the combined frame.
df_all['message'].iloc[171]

'from where i would rather be again what an epic spot hamiltonisland such a beautiful eye snack around every corner and up every hill i will be back if you are looking for paradise in our own back yard i couldn t recommend this place more getting around the island on golf buggies snorkling eating amazing seafood getting friendly with the cockatoos it s all here thanks for an awesome holiday darreng 18 shaes amandabaker 0419 paradise hamiltonisland holiday australia whitsundays beauty takemeback'

In [5]:
import flair
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings, Sentence, WordEmbeddings

def extract_text_features(df_all):
    """Embed every row's ``message`` into a single flat vector.

    Three embedders are created ('news-forward' and 'news-backward' Flair
    embeddings plus 'twitter' word embeddings) and combined through
    ``DocumentPoolEmbeddings``. Each message is cast to ``str``, wrapped in a
    ``Sentence``, embedded, and the resulting document embedding is converted
    to a flattened NumPy array.

    Args:
        df_all: DataFrame with ``_id`` and ``message`` columns.

    Returns:
        dict: maps each ``_id`` to the 1-D NumPy embedding of its message.
    """
    # Instantiation order (forward, backward, twitter) is kept as-is; the order
    # inside the pooled list below is twitter, forward, backward.
    forward_lm = FlairEmbeddings('news-forward')
    backward_lm = FlairEmbeddings('news-backward')
    twitter_vectors = WordEmbeddings('twitter')
    pooled_embedder = DocumentPoolEmbeddings([twitter_vectors, forward_lm, backward_lm])

    features_by_id = {}
    progress = tqdm.tqdm(df_all.iterrows(), total=len(df_all))
    for _, record in progress:
        text = record['message']
        post_id = record['_id']

        # str() keeps non-string cells from breaking Sentence.
        doc = Sentence(str(text))
        pooled_embedder.embed(doc)

        vector = doc.get_embedding()
        features_by_id[post_id] = vector.cpu().detach().numpy().flatten()

    return features_by_id


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


2020-03-27 13:24:46,889 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim.vectors.npy not found in cache, downloading to /tmp/tmps6tx6hzr


100%|██████████| 477405728/477405728 [00:51<00:00, 9252807.53B/s] 

2020-03-27 13:25:39,295 copying /tmp/tmps6tx6hzr to cache at /home/ec2-user/.flair/embeddings/twitter.gensim.vectors.npy





2020-03-27 13:25:39,722 removing temp file /tmp/tmps6tx6hzr
2020-03-27 13:25:40,452 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/twitter.gensim not found in cache, downloading to /tmp/tmpo6y6w4s0


100%|██████████| 68268001/68268001 [00:07<00:00, 8639669.83B/s]

2020-03-27 13:25:49,141 copying /tmp/tmpo6y6w4s0 to cache at /home/ec2-user/.flair/embeddings/twitter.gensim
2020-03-27 13:25:49,207 removing temp file /tmp/tmpo6y6w4s0





100%|██████████| 8057/8057 [30:41<00:00,  4.38it/s]  


In [10]:
# Pickle file caching the dict returned by extract_text_features(df_all).
id_to_text_features_twitter_file = 'f_text_features_twitter.pkl'

# `id_to_text_features` was previously dumped without ever being assigned in
# the notebook, so a fresh "Restart & Run All" failed with NameError. Load the
# cached features when the file exists; otherwise compute them (slow) and
# write the cache. Delete the file to force a recomputation.
if os.path.isfile(id_to_text_features_twitter_file):
    # Only unpickle files produced by this notebook (pickle can run arbitrary code).
    with open(id_to_text_features_twitter_file, 'rb') as f:
        id_to_text_features = pickle.load(f)
else:
    id_to_text_features = extract_text_features(df_all)
    with open(id_to_text_features_twitter_file, 'wb') as f:
        pickle.dump(id_to_text_features, f)