# The function `show_knn` returns the closest neighbors using a k-NN algorithm on 600 concatenated features (300 from the poster image and 300 from the synopsis).

### The poster image features are first reduced from 4096 to 300 features using truncated SVD (t-SVD). The image and synopsis features are each normalized and then concatenated before being passed to the k-NN search.

### 1. Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import string
import re
import gensim
from nltk.corpus import stopwords
from PIL import Image
import requests
from io import BytesIO
from keras.models import Model
from quiver_engine.imagenet_utils import preprocess_input
from keras.preprocessing import image
from VGG16_PretrainedV1 import VGG16
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix
from sklearn.neighbors import NearestNeighbors
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy import Column, Integer, MetaData, Table
import psycopg2
from scipy import spatial
import matplotlib.image as mpimg

In [None]:
##### Open and resize Image function 
def resize(im_ori):
    """Load an image and return it as a preprocessed 224x224 RGB batch of one.

    Args:
        im_ori: Anything accepted by ``PIL.Image.open`` (a path or a
            file-like object).

    Returns:
        The output of ``preprocess_input`` applied to the image array after
        a leading batch axis has been added with ``np.expand_dims``.
    """
    # The context manager closes the underlying file once the pixel data has
    # been read; ``convert``/``resize`` return new, fully loaded images.
    with Image.open(im_ori) as img:
        # Convert every non-RGB mode (grayscale, palette, CMYK, alpha, ...)
        # so the array always has exactly three channels.
        if img.mode != 'RGB':
            img = img.convert('RGB')
        # LANCZOS is the filter formerly exposed as ANTIALIAS, which was
        # removed in Pillow 10.
        img = img.resize((224, 224), Image.LANCZOS)

    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    return preprocess_input(x)

### 2. Function `show_knn`, which returns the poster URLs of the candidate movies and an array of indices of the nearest-neighbor movies

In [None]:
def _zscore(values, axis=None):
    """Return ``values`` shifted to zero mean and scaled to unit std along ``axis``."""
    return (values - values.mean(axis=axis, keepdims=True)) / values.std(axis=axis, keepdims=True)


def show_knn(oview, im_ori, genr, country, date_range1, date_range2, n_neighbors=12):
    """Find the stored movies closest to the user's poster image and synopsis.

    The poster is described by the VGG16 ``fc2`` activations, reduced to 300
    components with a truncated SVD fitted on the candidate movies. The
    synopsis is described by the mean Word2Vec vector of its non-stop-words.
    Both parts are standardized, concatenated and searched with a k-d tree.

    Args:
        oview: User's synopsis input.
        im_ori: User's image input (passed to ``resize``).
        genr: User's genre filter, or ``"All"`` to disable the filter.
        country: User's country filter, or ``"All"`` to disable the filter.
        date_range1: User's minimum release year (inclusive).
        date_range2: User's maximum release year (inclusive).
        n_neighbors: Maximum number of neighbors to return (default 12).

    Returns:
        A tuple ``(posterpath_table2, index)``: the poster URLs of every
        candidate movie that passed the filters, and a 1-D array of positions
        into that list, nearest first. The array holds at most
        ``n_neighbors`` entries and is empty when no movie passes the filters.

    Raises:
        ValueError: If no word of the synopsis is known to the Word2Vec model.
    """
    oview = str(oview)
    im_ori = str(im_ori)
    genr = str(genr)
    country = str(country)
    date_range1 = int(date_range1)
    date_range2 = int(date_range2)

    ##### Load VGG16 base model with second to last layer
    base_model = VGG16(weights='imagenet')
    # NOTE(review): `input=`/`output=` are the Keras 1 keyword names; newer
    # Keras releases expect `inputs=`/`outputs=` - confirm against the
    # installed version before changing.
    model2 = Model(input=base_model.input, output=base_model.get_layer('fc2').output)

    ##### Load and resize input image, then get the fc2 features for it
    x = resize(im_ori)
    fc2_features = model2.predict(x)

    ##### Load Google's pre-trained Word2Vec model.
    model1 = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

    ##### Split the synopsis into words and drop stop words
    words = re.sub(r"[^\w]", " ", oview).split()
    stop_words = set(stopwords.words('english'))  # built once, O(1) lookups

    ##### Collect the Word2Vec vector of every word the model knows
    vectors = []
    for w in words:
        if w in stop_words:
            continue
        try:
            vectors.append(model1[w])
        except KeyError:
            # Word is not in the Word2Vec vocabulary.
            continue
    if not vectors:
        raise ValueError("The synopsis contains no word known to the Word2Vec model.")

    ##### Average the features over the whole synopsis
    word2vec_features = np.mean(vectors, axis=0)

    ##### Connection to DB and reflection of its tables
    dbname = 'DBName'
    username = 'postgres'
    # 'postgresql://' is the dialect name accepted by every SQLAlchemy
    # release; the 'postgres://' alias was removed in SQLAlchemy 1.4.
    engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
    metadata = MetaData()
    metadata.reflect(bind=engine)

    ##### Create pandas df from sql database table
    with engine.connect() as conn:
        select_statement = metadata.tables['FilmTable'].select()
        pdfilm_table_from_sqlnew = pd.read_sql_query(select_statement, conn)
    path_image_dl = "http://image.tmdb.org/t/p/original/"

    ##### Arrays holding the movies that pass every filter
    word2vecarray2 = []
    fc2array2 = []
    posterpath_table2 = []

    ##### Keep only rows whose fc2 string is present and does not contain "None"
    has_fc2 = ~pdfilm_table_from_sqlnew['fc2'].str.contains("None", na=True)
    columns = ['fc2', 'word2vec', 'genres', 'production_countries_iso_3166_1',
               'release_date', 'poster_path']
    candidates = pdfilm_table_from_sqlnew.loc[has_fc2, columns]

    # Iterating over rows keeps all values of one movie together (no repeated
    # lookups by fc2 string, no ambiguity when two movies share an fc2 value).
    for fc2str, w2vstr, genres_array, country_array, release_date, poster_path in \
            candidates.itertuples(index=False, name=None):
        ##### Conditions on genre
        # assumes `genres` holds a sequence of genre names - TODO confirm schema
        if genr != "All" and not any(g == genr for g in genres_array):
            continue

        ##### Conditions on country
        if country != "All" and not any(c == country for c in country_array):
            continue

        ##### Conditions on date: the first token of the release date is the year
        date_tokens = re.findall(r"[\w']+", "%s" % release_date)
        try:
            year = int(date_tokens[0])
        except (IndexError, ValueError):
            # No parseable year, so the movie cannot satisfy the date range.
            continue
        if not date_range1 <= year <= date_range2:
            continue

        ##### Skip movies without stored Word2Vec features
        if not isinstance(w2vstr, str):
            continue
        w2v_tokens = w2vstr.split(' ')
        if w2v_tokens[0] == 'None':
            continue

        ##### Convert fc2str and word2vec to array, append poster link.
        fc2array2.append(fc2str.split(' '))
        word2vecarray2.append(w2v_tokens)
        posterpath_table2.append(path_image_dl + poster_path)

    ##### Nothing to search when every movie was filtered out
    if not fc2array2:
        return posterpath_table2, np.array([], dtype=np.intp)

    ##### Convert fc2array2 and word2vecarray2 to numpy float array
    X = np.array(fc2array2).astype(float)
    Y = np.array(word2vecarray2).astype(float)

    ##### Reduce number of features from 4096 to 300 for images using T-SVD
    RS = 9202017
    svd = TruncatedSVD(n_components=300, random_state=RS)  ##### algorithm = arpack or randomized by default
    svd.fit(X)  ##### Fit on database fc2 column which contain 4096 features per movie poster
    fc2_featuresTSVD = svd.transform(fc2_features)  #### reduce user's input image 4096 features to 300
    XTSVD = svd.transform(X)  #### reduce 4096 features to 300 for each movie poster in the database

    ##### Normalization before concatenating, using the standard deviation and mean of features
    fc2_featuresTSVD_ravel_norm = _zscore(fc2_featuresTSVD.ravel())
    word2vec_features_norm = _zscore(word2vec_features)
    XTSVD = _zscore(XTSVD, axis=1)  # each movie is standardized on its own
    Y = _zscore(Y, axis=1)

    ##### Concatenation of word2vec and fc2 features
    fc2_plus_word2vec = np.concatenate((fc2_featuresTSVD_ravel_norm, word2vec_features_norm), axis=0)
    XTSVDPlusY = np.concatenate((XTSVD, Y), axis=1)

    ##### Find the closest neighbors using k-NN algorithm
    # Asking for more neighbors than there are points makes cKDTree pad the
    # result with the out-of-range index len(data), so cap k.
    k = min(n_neighbors, len(XTSVDPlusY))
    distance, index = spatial.cKDTree(XTSVDPlusY).query(fc2_plus_word2vec, k=k)

    # With k == 1 the query returns a scalar; always hand back a 1-D array.
    return posterpath_table2, np.atleast_1d(index)