In [1]:
import numpy as np
import requests
from PIL import Image
import pandas as pd
from tqdm import tqdm
from efficientnet.keras import EfficientNetB6
from efficientnet.keras import center_crop_and_resize, preprocess_input
import keras

In [2]:
#preprocesses the images and runs through effecientNet
def image_embeddings_keras(image,model):
    image_size = model.input_shape[1]
    cursor = 0
    
    x = center_crop_and_resize(image, image_size=image_size)
    x = preprocess_input(x)
    x = np.expand_dims(x,axis = 0)
    if x.shape == (1, 528, 528, 3):
        embedding = model.predict(x)
    else:
        return(False)
    return(embedding)

#get the updated effecientNet model with the final 3 layers replaced with global pooling and dense layer
def get_image_model():
    source_model = EfficientNetB6(weights='imagenet')
    test_model = source_model.layers[-3].output
    predictions = keras.layers.Dense(2304)(test_model)
    image_embedding_model = keras.Model(inputs = source_model.input, outputs = predictions)
    return(image_embedding_model)

In [3]:
#read dataset
data = pd.read_csv('New_balance_valid_url.csv')
data.head()

Unnamed: 0,tweet,country_code,bounding_box,screen_name,favourites_count,followers_count,statuses_count,friends_count,listed_count,image_url,banner_url,centroid,country_name
0,"#USGS07343000 - N Sulphur Rv nr Cooper, TX \nH...",US,"[[[-106.645646, 25.837092], [-106.645646, 36.5...",USGS_TexasRain,0.0,1377.0,34433.0,1.0,69.0,https://pbs.twimg.com/profile_images/875796316...,https://pbs.twimg.com/profile_banners/71521146...,"(31.1688935, -100.0768885)",United States
1,https://t.co/ONEYjodpYc,US,"[[[-80.196625, 26.150489], [-80.196625, 26.209...",GetYoPrimeOn,2771.0,808.0,27208.0,680.0,3.0,https://pbs.twimg.com/profile_images/112552666...,https://pbs.twimg.com/profile_banners/32416638...,"(26.1799205, -80.156826)",United States
2,this still Denzels best performance,US,"[[[-76.22808, 36.672684], [-76.22808, 36.93239...",marcsuckstoes,225.0,117.0,1389.0,203.0,0.0,https://pbs.twimg.com/profile_images/130316925...,https://pbs.twimg.com/profile_banners/53844606...,"(36.8025415, -76.069946)",United States
3,@annkozma723 @sdtitmas @IanStuart66 @mtholfsen...,US,"[[[-74.041878, 40.570842], [-74.041878, 40.739...",seanmarnold,6518.0,3192.0,6096.0,1957.0,79.0,https://pbs.twimg.com/profile_images/121654772...,https://pbs.twimg.com/profile_banners/18243997...,"(40.655138, -73.9487755)",United States
4,@MatthewCappucci Don’t try. It’s not worth th...,US,"[[[-77.220557, 39.053158], [-77.220557, 39.120...",DRmetwatch,92284.0,1962.0,14132.0,889.0,119.0,https://pbs.twimg.com/profile_images/117405337...,https://pbs.twimg.com/profile_banners/32468285...,"(39.08705500000001, -77.16310250000001)",United States


In [25]:
effNet = get_image_model()

In [20]:
effNet.input_shape

(None, 528, 528, 3)

In [22]:
#extract embeddings from url

full_dict = {}
image_embeddings = {}
count = range(len(data))
for row in tqdm(count):
    embedding_list = []
    
    #extract the image urls
    profile_url =  data.iloc[row]['image_url']
    banner_url = data.iloc[row]['banner_url']
    try:
        #make request to get images, run images through preprocessing and embeddings model, add embeddings to dictionary
        x=np.asarray(Image.open(requests.get(profile_url, stream=True).raw).convert('RGB'))
        x = image_embeddings_keras(x,effNet)
        image_embeddings["profile_embedding"] = x.squeeze()
        
        x=np.asarray(Image.open(requests.get(banner_url, stream=True).raw).convert('RGB'))
        x = image_embeddings_keras(x,effNet)
        image_embeddings["banner_embedding"] = x.squeeze()
    except:
        pass
    
    #check if both profile and banner images were extracted
    if len(image_embeddings)==2:
        full_dict[data.iloc[row]['screen_name']] = image_embeddings
    else:
        pass
    
    #run only 10 times for trial purposes
    if row>10:
        break


  0%|                                                                             | 11/12650 [00:07<2:14:58,  1.56it/s]


In [23]:
full_dict

{'USGS_TexasRain': {'profile_embedding': array([ 0.1885559 ,  0.02216596, -0.09074964, ...,  0.17200744,
         -0.09153228,  0.34097442], dtype=float32),
  'banner_embedding': array([-0.04192941, -0.18187144, -0.2035078 , ...,  0.0423746 ,
         -0.1658499 ,  0.11950806], dtype=float32)},
 'GetYoPrimeOn': {'profile_embedding': array([ 0.1885559 ,  0.02216596, -0.09074964, ...,  0.17200744,
         -0.09153228,  0.34097442], dtype=float32),
  'banner_embedding': array([-0.04192941, -0.18187144, -0.2035078 , ...,  0.0423746 ,
         -0.1658499 ,  0.11950806], dtype=float32)},
 'marcsuckstoes': {'profile_embedding': array([ 0.1885559 ,  0.02216596, -0.09074964, ...,  0.17200744,
         -0.09153228,  0.34097442], dtype=float32),
  'banner_embedding': array([-0.04192941, -0.18187144, -0.2035078 , ...,  0.0423746 ,
         -0.1658499 ,  0.11950806], dtype=float32)},
 'seanmarnold': {'profile_embedding': array([ 0.1885559 ,  0.02216596, -0.09074964, ...,  0.17200744,
         -0.0

In [24]:
#output shape
full_dict['USGS_TexasRain']['profile_embedding'].shape

(2304,)