In [1]:
import pandas as pd
import numpy as np
import json 
from collections import Counter
from datetime import date
import math
from tqdm import tqdm

# Query

In [2]:
query = ['Tom Holland','Zendaya']

# Pre-Processing

In [104]:
principleData = pd.read_csv("title.principal", sep="\t")

In [3]:
actorData = pd.read_csv("name.basics.tsv",sep="\t").drop(['birthYear','deathYear',	'primaryProfession','knownForTitles'], axis=1)

In [4]:
actorNconstList = actorData.groupby('primaryName')['nconst'].apply(list).to_dict()

In [107]:
data = {}
for index, row in tqdm(principleData.iterrows()):
    if row["tconst"] not in data.keys():
        data[row["tconst"]] = {}
    if row["nconst"] not in data[row["tconst"]].keys():
        data[row["tconst"]][row["nconst"]] = []
    data[row["tconst"]][row["nconst"]].append(row['category'])

50118947it [32:38, 25588.00it/s]


In [109]:
with open('./data/principleData.json', 'w') as fp:
    json.dump(data, fp,  indent=4)

In [5]:
f = open('./data/principleData.json')
movieData = json.load(f)
f.close()

In [6]:
catWeights = {
    'self': 9,
    'director': 9,
    'cinematographer': 6,
    'composer': 5,
    'producer': 8,
    'editor': 5,
    'actor': 10,
    'actress': 10,
    'writer': 7,
    'production_designer': 1,
    'archive_footage': 1,
    'archive_sound': 1
}

In [7]:
nnConstVectorSize = actorData["nconst"].unique().tolist()
nCV = len(nnConstVectorSize)

In [140]:
nCV

11594277

In [154]:
movieTitlesData = pd.read_csv("title.basics.tsv",sep="\t").drop(['originalTitle','isAdult',	'runtimeMinutes','genres'], axis=1)

In [None]:
movieTitlesRating = pd.read_csv("title.ratings.tsv",sep="\t").drop(['numVotes'], axis=1)

In [None]:
movieTitlesRating.head()

Unnamed: 0,tconst,averageRating
0,tt0000001,5.7
1,tt0000002,5.9
2,tt0000003,6.5
3,tt0000004,5.8
4,tt0000005,6.2


In [None]:

newDf = movieTitlesData.merge(movieTitlesRating, on='tconst', how='left')


In [None]:
newDf[newDf["tconst"]=='tt0000180']

Unnamed: 0,tconst,titleType,primaryTitle,startYear,endYear,averageRating
177,tt0000180,short,Le chemin de croix,1898,\N,


In [None]:
data2 = {}

for index, row in newDf.iterrows():
    if row['titleType'] == "movie":
        if row['tconst'] not in data2.keys():
            data2[row["tconst"]] = {}
        data2[row["tconst"]]['name'] = row["primaryTitle"]
        if row['titleType'] != "TV_Series":
            data2[row["tconst"]]['release_year'] = '1950' if row['startYear'] == "\\N" else row['startYear']
        else:
            if row['endYear'] == "\\N":
                data2[row["tconst"]]['release_year'] = date.today().year
            else:
                data2[row["tconst"]]['release_year'] = row['endYear']
        if pd.isna(row['averageRating']):
            data2[row["tconst"]]['rating'] = 0
        else:
            data2[row["tconst"]]['rating'] = row['averageRating']

In [None]:
with open('./data/titles.json', 'w') as fp:
    json.dump(data2, fp,  indent=4)

In [None]:
f = open('./data/titles.json')
titleData = json.load(f)
f.close()

# Helper functions

In [8]:
def cosim(v1, v2):
    u = np.float64(v1)
    v = np.float64(v2)
    return np.dot(u,v)/(np.linalg.norm(u)*np.linalg.norm(v))

In [9]:
def getMaxCategoryWeight(categories):
    val = 0
    for cat in categories:
        catVal = catWeights[cat]
        if catVal > val:
            val = catVal
    return val

In [152]:
def getVector(tconst):
    v = np.zeros(nCV)
    for person in movieData[tconst].keys():
        if person in nnConstVectorSize:
            categories = movieData[tconst][person]
            v[nnConstVectorSize.index(person)] = getMaxCategoryWeight(categories)
    return v

In [None]:

def getAgeFactor(release_year):
    currentYear = date.today().year
    diff = currentYear - int(release_year)
    return 1/math.exp(diff/100)

In [None]:
def getRatingFactor(rating):
    return rating/10

In [None]:
def getFinalScore(cosine, age, rating):
    cosineWeight = 0.7
    ageWeight = 0.2 
    ratingWeight = 0.1 
    return (cosineWeight * cosine) + (ageWeight * age) + (ratingWeight * rating)

# Query Vector

In [None]:
queryVect = np.zeros(nCV)
for p in query:
    nConstList = actorNconstList[p]
    for entry in nConstList:
        if entry in nnConstVectorSize:
            queryVect[nnConstVectorSize.index(entry)] = getMaxCategoryWeight(catWeights)

# Filtering Dataset 
Filtering the dataset due to constraint on resources and computing power. Here, we will be matching our query with titles that are of type "Movies", are released in or after 2020 and have a rating of more than 0. 

In [None]:
filtered = {n: movieData[n] for n in list(movieData.keys()) 
if (n in titleData.keys() and int(titleData[n]['release_year'])>=2020 and titleData[n]['rating']!=0 )}

In [None]:
len(list(filtered.keys()))

16550

## Score Calculation

In [None]:
scores = {}
for movieId in tqdm(filtered.keys()):
        sim = cosim(queryVect, getVector(movieId))
        age = getAgeFactor(titleData[movieId]['release_year'])
        
        rating  = getRatingFactor(titleData[movieId]['rating'])
        scores[movieId] = (sim,age,rating, getFinalScore(sim,age,rating))


100%|██████████| 16550/16550 [4:53:08<00:00,  1.06s/it]   


In [None]:
scoreDict = dict(Counter(scores).most_common(25))

# Results

In [None]:
titles = []
names = []
years = []
ratings = [] 
scores = []
sim = []
ageFactor = []
ratingFactor = []

for title in scoreDict:
    titles.append(title)
    names.append(titleData[title]["name"])
    years.append(titleData[title]["release_year"])
    ratings.append(titleData[title]["rating"])
    scores.append(0.7*scoreDict[title][0] + 0.2*scoreDict[title][1] + 0.1*scoreDict[title][2])
    sim.append(scoreDict[title][0])
    ageFactor.append(scoreDict[title][1])
    ratingFactor.append(scoreDict[title][2])

results = pd.DataFrame({
          "titleId" : titles,
          "Title Name" : names,
          "Release year" : years,
          "Rating" : ratings,
          "Score" : scores,
          "Similarity score" : sim,
          "Age factor" : ageFactor,
          "Rating factor" : ratingFactor
          })   

In [None]:
final_results = results.sort_values(by=['Score'], ascending=False)

In [None]:
final_results

Unnamed: 0,titleId,Title Name,Release year,Rating,Score,Similarity score,Age factor,Rating factor
0,tt10872600,Spider-Man: No Way Home,2021,8.4,0.384871,0.146944,0.99005,0.84
4,tt1160419,Dune,2021,8.1,0.329927,0.072739,0.99005,0.81
1,tt12676326,Malcolm & Marie,2021,6.6,0.326872,0.089803,0.99005,0.66
2,tt7146812,Onward,2020,7.4,0.323221,0.075974,0.980199,0.74
6,tt7395114,The Devil All the Time,2020,7.1,0.31746,0.072028,0.980199,0.71
3,tt1464335,Uncharted,2022,6.5,0.316959,0.074227,1.0,0.65
7,tt9130508,Cherry,2021,6.6,0.314364,0.071935,0.99005,0.66
5,tt2076822,Chaos Walking,2021,5.7,0.30543,0.072028,0.99005,0.57
15,tt17160594,The Freedoms,2022,10.0,0.3,0.0,1.0,1.0
20,tt19769530,Philios,2022,10.0,0.3,0.0,1.0,1.0
