# The function ShowkNN returns the closest neighbors using a k-NN algorithm from 600 concatenated features of image and synopsis.

### The image poster features are first reduced to 300 features using a t-SVD algorithm. The features for both image and synopsis are normalized, and then concatenated before using them with a k-NN algorithm. 

### 1. Import libraries

In [None]:
import gensim

import numpy as np

import pandas as pd

import matplotlib

from PIL import Image
import requests
from io import BytesIO
from keras.models import Model
from quiver_engine.imagenet_utils import preprocess_input
from keras.preprocessing import image
from VGG16_PretrainedV1 import VGG16

from nltk.corpus import stopwords

from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

import sqlalchemy

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2

from sqlalchemy import Column, Integer, MetaData, Table

import string


import re


from scipy import spatial

from sklearn.neighbors import NearestNeighbors


import matplotlib.image as mpimg

In [None]:
##### Open and resize Image function 

def Resize(ImOri):
    """Load an image and turn it into a preprocessed single-image batch.

    The image is opened with PIL, converted to RGB when it is stored in any
    other mode, resized to 224x224, converted to an array with
    ``image.img_to_array``, given a leading batch axis and finally passed
    through ``preprocess_input``.

    Args:
        ImOri: Anything ``PIL.Image.open`` accepts (ShowkNN passes a path
            string).

    Returns:
        The array returned by ``preprocess_input`` for the one-image batch.
    """
    ##### The context manager closes the underlying file once we are done
    with Image.open(ImOri) as source:
        ##### Convert every non-RGB mode (grayscale, palette, alpha, CMYK, ...),
        ##### not only 'L' and 'RGBA', so the array always has three channels
        rgb = source if source.mode == 'RGB' else source.convert('RGB')
        ##### Image.LANCZOS is the filter that used to be exposed under the
        ##### alias Image.ANTIALIAS (the alias was removed in Pillow 10)
        resized = rgb.resize((224, 224), Image.LANCZOS)

    x = image.img_to_array(resized)
    x = np.expand_dims(x, axis=0)

    return preprocess_input(x)

### 2. Function ShowkNN which returns an array of indices corresponding to the nearest neighbor movies

In [None]:
'''Input of ShowkNN function: 
    - Oview: User's synopsis input
    - ImOri: User's Image input
    - Genr: User's genre filter input
    - Country: User's country filter input
    - DateRange1: User's minimum date
    - DateRange2: User's maximum date
    '''

def _synopsis_vector(Oview, word_vectors):
    """Return the mean word vector of the synopsis as a 1-D numpy array.

    The text is split on non-word characters, English stop words are dropped
    and every remaining word known to ``word_vectors`` contributes its vector.

    Raises:
        ValueError: If no word of the synopsis is found in ``word_vectors``.
    """
    tokens = re.sub(r"[^\w]", " ", Oview).split()

    ##### Build the stop-word set once instead of reloading the list per word
    stop_words = set(stopwords.words('english'))

    vectors = []
    for token in tokens:
        if token in stop_words:
            continue
        try:
            vectors.append(word_vectors[token])
        except KeyError:
            ##### Word is not part of the pre-trained vocabulary: ignore it
            continue

    if not vectors:
        raise ValueError(
            "The synopsis contains no word known to the word2vec model; "
            "cannot build its feature vector."
        )

    return np.mean(vectors, axis=0)


def _standardize_rows(matrix):
    """Return a copy of a 2-D array with each row shifted to mean 0 and scaled by its std."""
    row_mean = matrix.mean(axis=1, keepdims=True)
    row_std = matrix.std(axis=1, keepdims=True)
    return (matrix - row_mean) / row_std


def _load_film_table():
    """Read the whole 'FilmTable' table of the local database into a pandas DataFrame."""
    dbname = 'DBName'
    username = 'postgres'

    ##### 'postgresql' is the dialect name SQLAlchemy documents; the legacy
    ##### 'postgres' alias is rejected by SQLAlchemy >= 1.4
    engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))

    ##### Reflect with an explicit bind instead of a bound MetaData(engine),
    ##### which SQLAlchemy 2.0 no longer accepts
    metadata = MetaData()
    metadata.reflect(bind=engine)

    with engine.connect() as conn:
        select_statement = metadata.tables['FilmTable'].select()
        return pd.read_sql_query(select_statement, conn)


def _holds(values, wanted):
    """Return True when one element of ``values`` equals ``wanted``.

    A missing cell (None or a float NaN) counts as "no match".
    """
    if values is None or isinstance(values, float):
        return False
    return any(item == wanted for item in values)


def ShowkNN(Oview, ImOri, Genr, Country, DateRange1, DateRange2):
    """Return the poster links of the candidate movies and the indices of the nearest ones.

    The user's poster is run through VGG16 (output of layer 'fc2') and the
    user's synopsis is averaged into one word2vec vector. The movies stored in
    the 'FilmTable' table are filtered by genre, country and release year,
    their stored image features are reduced with a truncated SVD fitted on the
    filtered movies, image and text features are standardized and
    concatenated, and the closest movies are searched with a k-d tree.

    Args:
        Oview: User's synopsis.
        ImOri: User's image (forwarded to ``Resize``).
        Genr: Genre to keep, or "All" to disable the genre filter.
        Country: Production country to keep, or "All" to disable the filter.
        DateRange1: Minimum release year (inclusive).
        DateRange2: Maximum release year (inclusive).

    Returns:
        A tuple ``(posterpathTable2, index)``: the list of poster URLs of all
        movies that passed the filters, and an array of positions in that list
        for the nearest neighbours (at most 12), closest first.

    Raises:
        ValueError: If the synopsis has no word known to the word2vec model,
            or if no movie passes the filters.
    """
    Oview = str(Oview)
    ImOri = str(ImOri)
    Genr = str(Genr)
    Country = str(Country)
    DateRange1 = int(DateRange1)
    DateRange2 = int(DateRange2)

    ##### Load VGG16 base model with second to last layer
    ##### NOTE(review): the model is rebuilt on every call; cache it at module
    ##### level if this function is called repeatedly.
    ##### NOTE(review): Keras 2 renamed these keywords to inputs/outputs -
    ##### confirm against the installed Keras version.
    base_model = VGG16(weights='imagenet')
    model2 = Model(input=base_model.input, output=base_model.get_layer('fc2').output)

    ##### Load and resize input image, then get its fc2 features
    ##### (the preprocessed batch returned by Resize must be kept and used)
    fc2_features = model2.predict(Resize(ImOri))

    ##### Load Google's pre-trained Word2Vec model and embed the synopsis
    model1 = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    word2vec_features = _synopsis_vector(Oview, model1)

    ##### Create pandas df from sql database table
    films = _load_film_table()

    pathImageDL = "http://image.tmdb.org/t/p/original/"

    ##### Keep only rows whose stored fc2 string does not contain "None"
    ##### (missing values are treated as unusable as well)
    usable = ~films.fc2.str.contains("None", na=True)
    columns = ['fc2', 'word2vec', 'genres', 'production_countries_iso_3166_1',
               'release_date', 'poster_path']

    fc2_rows = []
    word2vec_rows = []
    posterpathTable2 = []

    ##### Walk the rows directly so every value comes from the same movie
    for row in films.loc[usable, columns].itertuples(index=False):

        ##### Conditions on genre
        if Genr != "All" and not _holds(row.genres, Genr):
            continue

        ##### Conditions on country
        if Country != "All" and not _holds(row.production_countries_iso_3166_1, Country):
            continue

        ##### Conditions on date: the first token of the release date is the year
        date_tokens = re.findall(r"[\w']+", "%s" % row.release_date)
        try:
            year = int(date_tokens[0])
        except (IndexError, ValueError):
            ##### No usable year: the movie cannot satisfy the date range
            continue
        if not (DateRange1 <= year <= DateRange2):
            continue

        ##### Movies without stored word2vec features are skipped
        if not isinstance(row.word2vec, str):
            continue
        word2vec_tokens = row.word2vec.split(' ')
        if word2vec_tokens[0] == 'None':
            continue

        fc2_rows.append(row.fc2.split(' '))
        word2vec_rows.append(word2vec_tokens)
        posterpathTable2.append(pathImageDL + row.poster_path)

    if not posterpathTable2:
        raise ValueError("No movie in the database matches the requested filters.")

    ##### Convert the stored feature strings to numpy float arrays
    ##### (np.float was removed from NumPy; the builtin float is equivalent)
    X = np.array(fc2_rows).astype(float)
    Y = np.array(word2vec_rows).astype(float)

    ##### Reduce number of image features to 300 using T-SVD
    RS = 9202017
    svd = TruncatedSVD(n_components=300, random_state=RS)
    svd.fit(X)  ##### Fit on the fc2 features of the filtered movies

    ##### Project the user's image and the database posters with the same
    ##### fitted transform so both live in the same reduced space
    fc2_featuresTSVD = svd.transform(fc2_features)
    XTSVD = svd.transform(X)

    ##### Normalization before concatenating, using the standard deviation and mean of features
    fc2_featuresTSVDravel = fc2_featuresTSVD.ravel()
    fc2_featuresTSVDravelNorm = (fc2_featuresTSVDravel - fc2_featuresTSVDravel.mean()) / fc2_featuresTSVDravel.std()
    word2vec_featuresNorm = (word2vec_features - word2vec_features.mean()) / word2vec_features.std()

    XTSVD = _standardize_rows(XTSVD)
    Y = _standardize_rows(Y)

    ##### Concatenation of word2vec and fc2 features
    fc2PlusWord2Vec = np.concatenate((fc2_featuresTSVDravelNorm, word2vec_featuresNorm), axis=0)
    XTSVDPlusY = np.concatenate((XTSVD, Y), axis=1)

    ##### Find the closest neighbors using a k-d tree; never ask for more
    ##### neighbours than there are candidates, otherwise the tree pads the
    ##### result with out-of-range indices
    neighbours = min(12, len(posterpathTable2))
    distance, index = spatial.cKDTree(XTSVDPlusY).query(fc2PlusWord2Vec, k=neighbours)

    ##### With a single neighbour the query returns a scalar: keep an array
    return posterpathTable2, np.atleast_1d(index)

