In [None]:
import glob
import json
import math
import os
import pickle
from collections import defaultdict,Counter
from datetime import date

import numpy as np
import pandas as pd
from tqdm import tqdm

# Helper Functions

In [None]:
# Weight assigned to a credited person according to their role on a title;
# a larger number means the role counts for more in the title's vector.
categoryWeights = dict(
    actor=10,
    actress=10,
    director=9,
    producer=8,
)

In [None]:
# Share of the final score contributed by cosine similarity, title age and
# average rating respectively (see getFinalScore).
cosineWeight, ageWeight, ratingWeight = 0.6, 0.3, 0.1

In [None]:
def getCurrentYear():
    """Return the year of today's date as an int."""
    today = date.today()
    return today.year

In [None]:
def cosim(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        v1: First vector (any sequence or array convertible to float64).
        v2: Second vector, same length as ``v1``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        When either vector has zero length the similarity is undefined, so
        0.0 is returned instead of dividing by zero and producing NaN.
    """
    u = np.asarray(v1, dtype=np.float64)
    v = np.asarray(v2, dtype=np.float64)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # An all-zero vector shares no direction with anything.
        return 0.0
    return np.dot(u, v) / denominator

In [None]:
def getAgeFactor(release_year, current_year=None):
    """Return a recency factor that decays exponentially with a title's age.

    Args:
        release_year: Release year as an int, float, or integer-like string.
        current_year: Year to measure the age against. Defaults to the
            value of ``getCurrentYear()`` at call time.

    Returns:
        ``1 / exp(age / 100)`` where ``age = current_year - release_year``;
        1.0 for a title released in ``current_year``, smaller for older ones.
    """
    if current_year is None:
        current_year = getCurrentYear()
    diff = current_year - int(release_year)
    return 1/math.exp(diff/100)

In [None]:
def getRatingFactor(rating):
    """Return ``rating`` divided by 10."""
    scale = 10
    return rating / scale

In [None]:
def getFinalScore(cosine, age, rating):
    """Combine the three component scores into one weighted sum.

    Each component is multiplied by its module-level weight
    (``cosineWeight``, ``ageWeight``, ``ratingWeight``) and the products are
    added together.
    """
    weighted_cosine = cosineWeight * cosine
    weighted_age = ageWeight * age
    weighted_rating = ratingWeight * rating
    return weighted_cosine + weighted_age + weighted_rating

# Load Data & Preprocess

In [None]:
# Convert every TSV under ./dataset into a pickled DataFrame under ./savFiles,
# keeping the base file name and swapping the extension for ".sav".
os.makedirs('./savFiles', exist_ok=True)

tsv_files = glob.glob('./dataset/*.tsv')
for tsv_path in tsv_files:
    print('Processing',tsv_path)
    # Derive the name from the path itself instead of fixed character offsets.
    stem = os.path.splitext(os.path.basename(tsv_path))[0]
    # Parse before opening the destination so a failed read leaves no empty file behind.
    table = pd.read_table(tsv_path,sep='\t',low_memory=False, na_values=['\\N','nan'])
    with open('./savFiles/'+stem+'.sav', 'wb') as destination:
        pickle.dump(table, destination)
    print('Finished',tsv_path)

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # pickle can run arbitrary code while loading; only use it on files this
    # notebook produced itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


basics = _load_pickle('./savFiles/title.basics.sav')
akas = _load_pickle('./savFiles/title.akas.sav')
ratings = _load_pickle('./savFiles/title.ratings.sav')
# NOTE(review): the conversion cell names each .sav after its source .tsv; if the
# source file is called "title.principals.tsv" this path needs the trailing "s" — confirm.
principal = _load_pickle('./savFiles/title.principal.sav')
people = _load_pickle('./savFiles/name.basics.sav')

In [None]:
# Keep only movies, drop the columns that are not used downstream, and give
# titles without a start year the current year (stored as a string, as before).
# Built as one chain ending in .assign so no column is written onto a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
df_basics = (
    basics
    .loc[basics['titleType']=='movie']
    .drop(['originalTitle','isAdult','runtimeMinutes','genres','endYear','titleType'],axis=1)
    .assign(startYear=lambda frame: frame['startYear'].fillna(str(getCurrentYear())))
)

In [None]:
# Alternate-title rows whose region is 'US', without the columns that are not needed.
us_region_rows = akas['region'].eq('US')
df_akas = akas.loc[us_region_rows].drop(
    columns=['ordering','language','types','attributes','isOriginalTitle','title']
)

In [None]:
# Ratings without the vote count; a missing average rating becomes 0.0.
df_ratings = (
    ratings
    .drop(columns=['numVotes'])
    .assign(averageRating=lambda frame: frame['averageRating'].fillna(0.0))
)

In [None]:
# People table without the biographical columns that are not used here.
unused_people_columns = ['birthYear','deathYear','primaryProfession','knownForTitles']
df_people = people.drop(columns=unused_people_columns)
df_people

In [None]:
# Distinct title ids: those with a US-region entry, and those that are movies.
us_all_titles = df_akas['titleId'].unique()
all_movie_titles = df_basics['tconst'].unique()

# A set makes the membership test below constant-time per title.
set_movies = set(all_movie_titles)
us_movie_titles = [title_id for title_id in us_all_titles if title_id in set_movies]

# Distinct person ids; a person's position in this array is their vector index.
peopleList = df_people['nconst'].unique()
peopleCount = len(peopleList)

for label, count in (
    ('Number of all regional movies:', len(all_movie_titles)),
    ('Number of US regional titles:', len(us_all_titles)),
    ('Number of US regional movies:', len(us_movie_titles)),
    ('Number of people:', peopleCount),
):
    print(label, count)

In [None]:
# Roles that contribute to a title's vector.
categories = ['actor','director','producer','actress']

# Keep credits that are both in a wanted role and attached to a US movie,
# then drop the columns that are not needed.
has_wanted_role = principal['category'].isin(categories)
is_us_movie = principal['tconst'].isin(us_movie_titles)
df_principal = principal.loc[has_wanted_role & is_us_movie].drop(
    columns=['ordering','job','characters']
)

In [None]:
# Build one weight vector per title. Position i of a vector holds the highest
# role weight that person i (by position in peopleList) has on that title.

# Map each person id to its position once, instead of scanning peopleList per row
# (peopleList is a numpy array, which also has no .index method).
person_index = {nconst: position for position, nconst in enumerate(peopleList)}

data = {}

credit_rows = zip(df_principal['tconst'], df_principal['nconst'], df_principal['category'])
for tconst, nconst, category in tqdm(credit_rows, total=len(df_principal)):

    if tconst not in data:
        data[tconst] = np.zeros(peopleCount)

    index_ = person_index.get(nconst)
    if index_ is None:
        # Credited person is not present in peopleList; nothing to record.
        continue

    weight = categoryWeights[category]
    if weight > data[tconst][index_]:
        data[tconst][index_] = weight

# One row per title: its id and its vector (stored as an object column).
titlePeople_df = pd.DataFrame(list(data.items()), columns=['tconst','vector'])

In [None]:
# Start from the US movie ids and attach basics, ratings and people vectors.
final_df = pd.DataFrame(us_movie_titles, columns=['tconst'])

final_df = final_df.merge(df_basics, on = 'tconst', how = 'left')
final_df = final_df.merge(df_ratings, on = 'tconst', how = 'left')
final_df = final_df.merge(titlePeople_df, on = 'tconst', how = 'left')

# The left merge yields NaN for titles that have no row in df_ratings; store
# 0.0 for them, the same value df_ratings uses for a missing rating, so the
# later "rating != 0" filter treats them as unrated.
final_df['averageRating'] = final_df['averageRating'].fillna(0.0)

In [None]:
# Index the frame by title id, then turn it into {tconst: {column: value}}.
final_df = final_df.set_index(keys='tconst')
finalTitleData = final_df.to_dict(orient='index')

In [None]:
# Display final_df as this cell's output for a visual check.
final_df

In [None]:
def _to_jsonable(value):
    """Convert numpy values that ``json`` cannot encode into plain Python ones."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError('Object of type ' + type(value).__name__ + ' is not JSON serializable')


# The 'vector' entries are numpy arrays, which json.dump rejects unless a
# default converter is supplied.
with open('CleanTitleData.json', 'w') as fp:
    json.dump(finalTitleData, fp, indent=4, default=_to_jsonable)

# Query

In [None]:
def getQueryVector(pplList):
    """Encode a list of people as a weight vector aligned with ``peopleList``.

    Args:
        pplList: Person identifiers (``nconst`` values) and/or names. Names
            are resolved through ``df_people['primaryName']``; every person
            carrying a given name is included.

    Returns:
        A float array of length ``peopleCount`` holding
        ``categoryWeights['actor']`` at the position of each matched person
        and 0 elsewhere. Entries that match nobody are ignored.
    """
    queryvector = np.zeros(peopleCount)
    wanted = set(pplList)
    if not wanted:
        return queryvector

    # peopleList holds nconst ids, so names have to be translated to ids first.
    ids_for_names = df_people.loc[df_people['primaryName'].isin(wanted), 'nconst']
    wanted_ids = wanted.union(ids_for_names)

    # Boolean mask over peopleList; works whether it is a list or a numpy array.
    matched = pd.Index(peopleList).isin(wanted_ids)
    queryvector[matched] = categoryWeights['actor']
    return queryvector

In [None]:
# People to search for.
query = [
    'Tom Holland',
    'Zendaya',
]

In [None]:
# Encode the query list as a weight vector via getQueryVector.
queryVector = getQueryVector(query)

# Calculate Scores

In [None]:
# Read the cleaned title data back from disk; the with-block closes the file
# even if parsing fails.
with open('CleanTitleData.json') as f:
    cleanTitleData = json.load(f)

In [None]:
# Earliest start year a title may have to be considered.
MIN_START_YEAR = 2019


def _is_rated(rating):
    """Return True when ``rating`` is present (not None/NaN) and non-zero."""
    return bool(pd.notna(rating)) and rating != 0


# Candidate titles: recent enough and actually rated. A NaN rating compares
# unequal to 0, so it has to be excluded explicitly.
abc = {
    title_id: info
    for title_id, info in cleanTitleData.items()
    if int(info['startYear']) >= MIN_START_YEAR and _is_rated(info['averageRating'])
}

In [None]:
# Score every candidate title against the query vector.
scores = {}

for titleId in tqdm(abc.keys()):
    entry = cleanTitleData[titleId]
    people_vector = entry['vector']
    # Titles without any kept credit have no vector after the left merge (a
    # non-list placeholder once loaded from JSON); they share nobody with the
    # query, so their similarity is 0.
    if isinstance(people_vector, list):
        sim = cosim(queryVector, people_vector)
    else:
        sim = 0.0
    age = getAgeFactor(entry['startYear'])
    rating  = getRatingFactor(entry['averageRating'])
    scores[titleId] = getFinalScore(sim,age,rating)

In [None]:
# Keep the 25 highest-scoring titles, ordered from best to worst.
score_counter = Counter(scores)
top_scored = score_counter.most_common(25)
result = dict(top_scored)

# Final result

In [None]:
# Print each recommended title with its score. Title records carry the
# 'primaryTitle' column from the basics table; 'primaryName' belongs to the
# people table and is not a key of these records.
for title_id, score in result.items():
    print(cleanTitleData[title_id]['primaryTitle'], " : ",score)