## The following code reads the database and extracts, for each movie, a poster image link and a synopsis.

## The images are then processed through the pretrained VGG16 deep neural network, from which the 1000 features of the last layer and the 4096 features of the second-to-last (fully connected) layer are stored in the database.

## Similarly, each word of the synopsis is processed through a Word2Vec neural network pretrained on Google News. 300 output features are obtained for each word, and are then averaged over the whole synopsis before being stored in the database.

### 1. Import libraries

In [1]:
import urllib
import requests
import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, MetaData, Table
import psycopg2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

### 2. Create pandas dataframe from PostgreSQL database table

In [7]:
##### Connection settings for the local PostgreSQL database
table_name = 'FilmTable'
dbname = 'DBName'
username = 'postgres'

##### Use the canonical 'postgresql://' dialect name: the 'postgres://' alias
##### was deprecated and is rejected by recent SQLAlchemy releases
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))

##### Reflect the existing tables. The engine is handed to reflect() only,
##### because binding it in the MetaData constructor is not supported anymore
##### by recent SQLAlchemy releases
metadata = MetaData()
metadata.reflect(bind=engine)

##### Create pandas df from sql database table #######
with engine.connect() as conn:
    select_statement = metadata.tables[table_name].select()
    pdfilm_table_from_sql = pd.read_sql_query(select_statement, conn)

##### Display the first rows as the cell output
pdfilm_table_from_sql.head()

Unnamed: 0,Id,imdb_id,adult,original_title,genres,genres_id,release_date,poster_path,overview,original_language,...,runtime,spoken_languages_iso_639_1,status,tagline,title,vote_average,vote_count,predictions,fc2,word2vec
0,2,tt0094675,False,Ariel,"[Drama, Crime]","[18, 80]",1988-10-21,/gZCJZOn4l0Zj5hAxsMbxoS6CL0u.jpg,Taisto Kasurinen is a Finnish coal miner whose...,fi,...,69.0,"[fi, de]",Released,,Ariel,7.1,42,,,
1,3,tt0092149,False,Varjoja paratiisissa,"[Drama, Comedy]","[18, 35]",1986-10-16,/7ad4iku8cYBuB08g9yAU7tHJik5.jpg,"An episode in the life of Nikander, a garbage ...",fi,...,76.0,"[en, fi, sv]",Released,,Shadows in Paradise,7.0,34,,,
2,5,tt0113101,False,Four Rooms,"[Crime, Comedy]","[80, 35]",1995-12-09,/eQs5hh9rxrk1m4xHsIz1w11Ngqb.jpg,It's Ted the Bellhop's first night on the job....,en,...,98.0,[en],Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,6.5,535,,,
3,6,tt0107286,False,Judgment Night,"[Action, Thriller, Crime]","[28, 53, 80]",1993-10-15,/lNXmgUrP6h1nD53gkFh4WDzT6RZ.jpg,"While racing to a boxing match, Frank, Mike, J...",en,...,110.0,[en],Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.4,77,,,
4,8,tt0825671,False,Life in Loops (A Megacities RMX),[Documentary],[99],2006-01-01,/8YyIjOAxwzD3fZMdmJrfiApod4l.jpg,Timo Novotny labels his new project an experim...,en,...,80.0,"[en, hi, ja, ru, es]",Released,A Megacities remix.,Life in Loops (A Megacities RMX),6.4,4,,,


### 3. Read synopsis of each movie and pass it through a Word2Vec neural network pretrained with Google News. After averaging the 300 features from each word, store them into the database

In [None]:
##### Import some nlp libraries and load Word2Vec model
import gensim
import re
import nltk
# nltk.download()
from nltk.corpus import stopwords

###### Load Google News pre-trained Word2Vec model
###### The vectors are read from a binary word2vec file given by a relative
###### path, so the file has to be reachable from the current working directory.
###### NOTE: the name `model` is bound again further down when the VGG16 models
###### are built, so the Word2Vec loop must run before that cell is executed.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [10]:
##### The stop-word list does not change between words or movies: build it
##### once, as a set, instead of calling stopwords.words() for every word
english_stopwords = set(stopwords.words('english'))

##### Reflected table object, used to build UPDATE statements with bound
##### parameters instead of formatting values into an SQL string
film_table = metadata.tables[table_name]


def mean_word_vector(text, w2v, stop_words):
    """Return the element-wise mean of the word vectors found in text.

    text       -- raw synopsis; every non-word character acts as a separator
    w2v        -- word-vector lookup supporting w2v[word] and raising KeyError
                  for words it does not know
    stop_words -- collection of words that are ignored

    Returns a list of floats, or an empty list when none of the words has a
    vector.
    """
    words = re.sub(r"[^\w]", " ", text).split()

    vectors = []
    for word in words:
        if word in stop_words:
            continue
        try:
            vectors.append(w2v[word])
        except KeyError:
            ##### Word is not part of the pretrained vocabulary: ignore it
            continue

    if not vectors:
        return []
    return np.mean(vectors, axis=0, dtype=np.float64).tolist()


##### Same row selection as the poster loop: poster_path must not contain "None"
has_poster = pdfilm_table_from_sql.poster_path.str.contains("None") == False

##### Loop over the movies and get synopsis. The dataframe index is carried
##### along so that every row is updated individually, even when two movies
##### share the same synopsis text
for row_index, oview in pdfilm_table_from_sql.loc[has_poster, 'overview'].items():

    ##### Average the features over the whole synopsis
    mean_vec = mean_word_vector("%s" % oview, model, english_stopwords)

    ##### Store the features in the pandas dataframe
    pdfilm_table_from_sql.at[row_index, 'word2vec'] = mean_vec

    ##### Convert word2vec features into string to store into DB
    word2vecDB = ' '.join(str(e) for e in mean_vec)

    ##### Plain int: the value read from the dataframe is a numpy integer
    movie_id = int(pdfilm_table_from_sql.at[row_index, 'Id'])

    ##### Store into DB. engine.begin() commits on success and rolls back on
    ##### error. A failed write is reported and the loop moves on to the next
    ##### movie, instead of being swallowed silently
    update_statement = (film_table.update()
                        .where(film_table.c.Id == movie_id)
                        .values(word2vec=word2vecDB))
    try:
        with engine.begin() as conn:
            conn.execute(update_statement)
    except sqlalchemy.exc.SQLAlchemyError as err:
        print("Could not store word2vec for movie Id %i: %s" % (movie_id, err))

### 3. Load image of each movie and pass it through a VGG16 pretrained neural network. The 1000 features from the last classification layer, and the 4096 features from the second-to-last layer are then stored in the DB

In [None]:
##### Import specific libraries and load VGG16 model
from PIL import Image
import requests
from io import BytesIO
from keras.models import Model
from quiver_engine.imagenet_utils import preprocess_input
from keras.preprocessing import image
from VGG16_PretrainedV1 import VGG16

##### Load VGG16 model
##### base_model is the VGG16 network created with the 'imagenet' weights; the
##### two models below share its input and differ only in the layer they expose.
##### NOTE: `model` is also the name the Word2Vec vectors were loaded under; from
##### this cell on it refers to the VGG16 'predictions' model instead.
##### NOTE(review): the Model(input=..., output=...) keyword names depend on the
##### installed Keras version -- confirm they are accepted before upgrading.
base_model = VGG16(weights='imagenet')
model = Model(input=base_model.input, output=base_model.get_layer('predictions').output) ### 1000 features
model1 = Model(input=base_model.input, output=base_model.get_layer('fc2').output) ### 4096 features

In [4]:
##### Reflected table object, used to build the UPDATE statement with bound
##### parameters instead of formatting values into an SQL string
film_table = metadata.tables[table_name]

##### Keep the rows whose poster_path does not contain "None"
has_poster = pdfilm_table_from_sql.poster_path.str.contains("None") == False

##### Loop over the pandas df and get the poster image link. The dataframe
##### index is carried along so that every row is updated individually, even
##### when two movies share the same poster path.
##### NOTE(review): only the first 15 posters are processed (head(15)) --
##### confirm whether this limit is intentional before a full run
for row_index, filein in pdfilm_table_from_sql.loc[has_poster, 'poster_path'].head(15).items():

    ##### Request poster image from url. A failed download skips the movie:
    ##### continuing would reuse the response of the previous poster (or fail
    ##### with a NameError on the first iteration)
    try:
        response = requests.get("http://image.tmdb.org/t/p/original%s" % filein,
                                timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Could not download poster %s: %s" % (filein, err))
        continue

    ##### Load image and resize it to 224x224 px
    img = image.load_img(BytesIO(response.content))
    if img.mode != 'RGB':
        ##### Convert every non-RGB mode, not only 'L' and 'RGBA'
        img = img.convert('RGB')
    ##### LANCZOS is the same filter as the ANTIALIAS alias, which recent
    ##### Pillow releases no longer provide
    img = img.resize((224, 224), Image.LANCZOS)

    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)

    ##### Get the 1000 features from last layer
    predictions_features = model.predict(x)

    ##### Get the 4096 features from second to last layer
    fc2_features = model1.predict(x)

    ##### Store predictions and fc2 features in the pandas dataframe
    pdfilm_table_from_sql.at[row_index, 'predictions'] = predictions_features
    pdfilm_table_from_sql.at[row_index, 'fc2'] = fc2_features

    ##### Convert predictions and fc2 into string to store in DB
    predictionDBstr = ' '.join(str(e) for e in predictions_features.ravel().tolist())
    fc2DBstr = ' '.join(str(e) for e in fc2_features.ravel().tolist())

    ##### Plain int: the value read from the dataframe is a numpy integer
    movie_id = int(pdfilm_table_from_sql.at[row_index, 'Id'])

    ##### Store into DB: both columns are written by a single UPDATE inside one
    ##### transaction, which engine.begin() commits on success
    update_statement = (film_table.update()
                        .where(film_table.c.Id == movie_id)
                        .values(predictions=predictionDBstr, fc2=fc2DBstr))
    with engine.begin() as conn:
        conn.execute(update_statement)

### 4. Verify the DB was filled correctly

In [7]:
########### Show PostgresSQL table using pandas ##########
##### Read the table back from the database so that the displayed values are
##### the ones actually stored. The table is selected through table_name, like
##### the initial load, instead of a second hard-coded copy of the name
with engine.connect() as conn:
    select_statement = metadata.tables[table_name].select()
    pdfilm_table_from_sql = pd.read_sql_query(select_statement, conn)

##### Display the first rows as the cell output
pdfilm_table_from_sql.head()

Unnamed: 0,Id,imdb_id,adult,original_title,genres,genres_id,release_date,poster_path,overview,original_language,...,runtime,spoken_languages_iso_639_1,status,tagline,title,vote_average,vote_count,predictions,fc2,word2vec
0,9,tt0425473,False,Sonntag im August,[Drama],[18],2004-09-02,,,de,...,15.0,[de],Released,,Sunday in August,5.3,2,,,
1,12,tt0266543,False,Finding Nemo,"[Animation, Family]","[16, 10751]",2003-05-30,/syPWyeeqzTQIxjIUaIFI7d0TyEY.jpg,"Nemo, an adventurous young clownfish, is unexp...",en,...,100.0,[en],Released,There are 3.7 trillion fish in the ocean. They...,Finding Nemo,7.6,6195,8.760883065406233e-05 0.0032413338776677847 0....,0.687143862247467 0.0 0.0 0.6218513250350952 3...,0.0167744954427 0.0753534105089 -0.10710652669...
2,6,tt0107286,False,Judgment Night,"[Action, Thriller, Crime]","[28, 53, 80]",1993-10-15,/lNXmgUrP6h1nD53gkFh4WDzT6RZ.jpg,"While racing to a boxing match, Frank, Mike, J...",en,...,110.0,[en],Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.4,77,2.3162847355706617e-05 0.0002617507998365909 2...,0.0 0.0 0.0 0.0 0.0 0.36906981468200684 0.0 0....,0.0359727647569 0.0639475504557 0.012349955240...
3,11,tt0076759,False,Star Wars,"[Adventure, Action, Science Fiction]","[12, 28, 878]",1977-05-25,/btTdmkgIvOi0FFip1sPuZI2oQG6.jpg,Princess Leia is captured and held hostage by ...,en,...,121.0,[en],Released,"A long time ago in a galaxy far, far away...",Star Wars,8.1,6693,2.362626901231124e-06 1.1240053026995156e-05 8...,0.0 0.0 0.6151365041732788 0.0 0.5548335909843...,0.0245936802455 0.067919921875 -0.046464865548...
4,13,tt0109830,False,Forrest Gump,"[Comedy, Drama, Romance]","[35, 18, 10749]",1994-07-06,/yE5d3BUhE8hCnkMUJOo1QDoOGNz.jpg,A man with a low IQ has accomplished great thi...,en,...,142.0,[en],Released,"The world will never be the same, once you've ...",Forrest Gump,8.2,8018,7.769698981974216e-07 1.1912162989347053e-07 8...,2.826995849609375 0.0 1.9792219400405884 0.0 0...,0.0516164679276 0.0572188527961 -0.01995367752...
