In [1]:
import pandas as pd
import numpy as np
import json 
from collections import Counter


In [2]:
query = ['Tom Holland','Zendaya']

In [104]:
principleData = pd.read_csv("title.principal", sep="\t")

In [3]:
actorData = pd.read_csv("name.basics.tsv",sep="\t").drop(['birthYear','deathYear',	'primaryProfession','knownForTitles'], axis=1)

In [4]:
actorNconstList = actorData.groupby('primaryName')['nconst'].apply(list).to_dict()

In [107]:
data = {}
for index, row in tqdm(principleData.iterrows()):
    if row["tconst"] not in data.keys():
        data[row["tconst"]] = {}
    if row["nconst"] not in data[row["tconst"]].keys():
        data[row["tconst"]][row["nconst"]] = []
    data[row["tconst"]][row["nconst"]].append(row['category'])

50118947it [32:38, 25588.00it/s]


In [109]:
with open('./data/principleData.json', 'w') as fp:
    json.dump(data, fp,  indent=4)

In [5]:
f = open('./data/principleData.json')
movieData = json.load(f)
f.close()

In [6]:
catWeights = {
    'self': 9,
    'director': 9,
    'cinematographer': 6,
    'composer': 5,
    'producer': 8,
    'editor': 5,
    'actor': 10,
    'actress': 10,
    'writer': 7,
    'production_designer': 1,
    'archive_footage': 1,
    'archive_sound': 1
}

In [7]:
nnConstVectorSize = actorData["nconst"].unique().tolist()
nCV = len(nnConstVectorSize)

In [8]:
def cosim(v1, v2):
    u = np.float64(v1)
    v = np.float64(v2)
    return np.dot(u,v)/(np.linalg.norm(u)*np.linalg.norm(v))

In [9]:
def getMaxCategoryWeight(categories):
    val = 0
    for cat in categories:
        catVal = catWeights[cat]
        if catVal > val:
            val = catVal
    return val

In [51]:
def getVector(tconst):
    v = np.zeros(nCV)
    for person in movieData[tconst].keys():
        if person in nnConstVectorSize:
            categories = movieData[tconst][person]
            v[nnConstVectorSize.index(person)] = getMaxCategoryWeight(categories)
    return v

In [15]:
movieTitlesData = pd.read_csv("title.basics.tsv",sep="\t").drop(['originalTitle','isAdult',	'runtimeMinutes','genres'], axis=1)

In [20]:
movieTitlesRating = pd.read_csv("title.ratings.tsv",sep="\t").drop(['numVotes'], axis=1)

In [21]:
movieTitlesRating.head()

Unnamed: 0,tconst,averageRating
0,tt0000001,5.7
1,tt0000002,5.9
2,tt0000003,6.5
3,tt0000004,5.8
4,tt0000005,6.2


In [39]:
from datetime import date

newDf = movieTitlesData.merge(movieTitlesRating, on='tconst', how='left')


In [40]:
newDf[newDf["tconst"]=='tt0000180']

Unnamed: 0,tconst,titleType,primaryTitle,startYear,endYear,averageRating
177,tt0000180,short,Le chemin de croix,1898,\N,


In [119]:
data2 = {}

for index, row in newDf.iterrows():
    if row['titleType'] == "movie":
        if row['tconst'] not in data2.keys():
            data2[row["tconst"]] = {}
        data2[row["tconst"]]['name'] = row["primaryTitle"]
        if row['titleType'] != "TV_Series":
            data2[row["tconst"]]['release_year'] = '1950' if row['startYear'] == "\\N" else row['startYear']
        else:
            if row['endYear'] == "\\N":
                data2[row["tconst"]]['release_year'] = date.today().year
            else:
                data2[row["tconst"]]['release_year'] = row['endYear']
        if pd.isna(row['averageRating']):
            data2[row["tconst"]]['rating'] = 0
        else:
            data2[row["tconst"]]['rating'] = row['averageRating']

In [120]:
with open('./data/titles.json', 'w') as fp:
    json.dump(data2, fp,  indent=4)

In [121]:
f = open('./data/titles.json')
titleData = json.load(f)
f.close()

In [89]:
from datetime import date
import math

def getAgeFactor(release_year):
    currentYear = date.today().year
    diff = currentYear - int(release_year)
    return 1/math.exp(diff/100)

In [78]:
def getRatingFactor(rating):
    return rating/10

In [79]:
def getFinalScore(cosine, age, rating):
    cosineWeight = 0.6
    ageWeight = 0.3
    ratingWeight = 0.1
    return (cosineWeight * cosine) + (ageWeight * age) + (ratingWeight * rating)

In [80]:
queryVect = np.zeros(nCV)
for p in query:
    nConstList = actorNconstList[p]
    for entry in nConstList:
        if entry in nnConstVectorSize:
            queryVect[nnConstVectorSize.index(entry)] = getMaxCategoryWeight(catWeights)

In [128]:
filtered = {n: movieData[n] for n in list(movieData.keys()) 
if (n in titleData.keys() and int(titleData[n]['release_year'])>2020 and titleData[n]['rating']!=0 )}

In [129]:
len(list(filtered.keys()))

8950

In [102]:
filtered

{'tt0116700': {'nm0869018': ['producer'],
  'nm0053903': ['actor'],
  'nm0025445': ['actor'],
  'nm0243517': ['actor'],
  'nm0060655': ['actor'],
  'nm0181117': ['director'],
  'nm0494378': ['writer'],
  'nm0902463': ['writer'],
  'nm0556550': ['producer'],
  'nm0753651': ['producer']},
 'tt0120589': {'nm2212828': ['composer'],
  'nm0269077': ['actor'],
  'nm0655945': ['actress'],
  'nm3580842': ['actor'],
  'nm3253771': ['actor'],
  'nm0004000': ['director'],
  'nm1860699': ['writer'],
  'nm1291775': ['producer'],
  'nm1164842': ['producer'],
  'nm5220214': ['producer']},
 'tt0124961': {'nm3028192': ['actor'],
  'nm0040498': ['actor'],
  'nm0123681': ['actor'],
  'nm0129807': ['actress'],
  'nm0204496': ['actor'],
  'nm0404014': ['director'],
  'nm0343720': ['writer'],
  'nm0339034': ['actress'],
  'nm0369241': ['actor'],
  'nm0494186': ['actor']},
 'tt0265705': {'nm0689789': ['actor'],
  'nm0139473': ['actor'],
  'nm0204525': ['actor'],
  'nm0205087': ['actress'],
  'nm0788223': ['ci

In [126]:
scores = {}
from tqdm import tqdm
for movieId in tqdm(filtered.keys()):
        sim = cosim(queryVect, getVector(movieId))
        age = getAgeFactor(titleData[movieId]['release_year'])
        rating  = getRatingFactor(titleData[movieId]['rating'])
        scores[movieId] = getFinalScore(sim,age,rating)

100%|██████████| 1724/1724 [30:22<00:00,  1.06s/it]


In [127]:
dict(Counter(scores).most_common(25))

{'tt1464335': 0.4095361771415123,
 'tt14690746': 0.4,
 'tt15333126': 0.4,
 'tt15341150': 0.4,
 'tt15394074': 0.4,
 'tt16112972': 0.4,
 'tt16755990': 0.4,
 'tt16790672': 0.4,
 'tt17160594': 0.4,
 'tt18163404': 0.4,
 'tt18311204': 0.4,
 'tt18938056': 0.4,
 'tt18951862': 0.4,
 'tt19769530': 0.4,
 'tt11915042': 0.399,
 'tt12762788': 0.399,
 'tt15295828': 0.399,
 'tt17007480': 0.399,
 'tt18561246': 0.399,
 'tt18970130': 0.399,
 'tt18989934': 0.399,
 'tt10366418': 0.398,
 'tt13130926': 0.398,
 'tt13528560': 0.398,
 'tt17043084': 0.398}

In [6]:
prin[prin["tconst"] == "tt0000001"][["nconst","category"]]

Unnamed: 0,nconst,category
0,nm1588970,self
1,nm0005690,director
2,nm0374658,cinematographer


In [13]:
def getNNconstAndCat(tt):
    return prin[prin["tconst"] == tt][["nconst","category"]]

In [14]:
getNNconstAndCat("tt0000001")

Unnamed: 0,nconst,category
0,nm1588970,self
1,nm0005690,director
2,nm0374658,cinematographer


In [16]:
nnConstVectorSize = prin["nconst"].unique().tolist()

In [19]:
nCV = len(nnConstVectorSize)

In [23]:
catListUnique = prin["category"].unique().tolist()

In [24]:
catListUnique

['self',
 'director',
 'cinematographer',
 'composer',
 'producer',
 'editor',
 'actor',
 'actress',
 'writer',
 'production_designer',
 'archive_footage',
 'archive_sound']

In [60]:
def getVector(tconst):
    v = np.zeros(nCV)
    nconstOut = getNNconstAndCat(tconst)
    nnList = nconstOut["nconst"].to_list()
    catList = nconstOut["category"].to_list()
    for i,nn in enumerate(nnList):
        index_ = nnConstVectorSize.index(nn)
        cat = catList[i]
        v[index_] = catListUnique.index(cat) + 1
    return v


In [58]:
titles = prin["tconst"].unique()

In [64]:
len(titles)

8011609

In [62]:
vectorList = []

In [85]:
from tqdm import tqdm

for i in tqdm(titles):
    vectorList.append(getVector(i))

  0%|          | 30649/8011609 [00:54<3:56:19, 562.87it/s] 


KeyboardInterrupt: 

In [37]:
actorNames = pd.read_csv("name.basics.tsv",sep="\t").drop(['birthYear','deathYear',	'primaryProfession','knownForTitles'])

In [44]:
actorNames[actorNames["primaryName"] == "Tom Holland"]

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
261824,nm0276169,Tom Holland,1943,\N,"actor,writer,director","tt0089175,tt0086154,tt0117894,tt0094862"
369904,nm0390847,Tom Holland,\N,\N,"actor,visual_effects","tt0051301,tt0041532,tt0047739,tt0153094"
369905,nm0390848,Tom Holland,\N,\N,costume_department,tt0056499
369906,nm0390849,Tom Holland,\N,\N,,\N
1392814,nm10503517,Tom Holland,\N,\N,actor,\N
2113877,nm11252455,Tom Holland,\N,\N,,\N
2893480,nm12059929,Tom Holland,\N,\N,,"tt6570890,tt0363373,tt11833936"
3389060,nm12574442,Tom Holland,\N,\N,,tt1944942
4009787,nm13231994,Tom Holland,\N,\N,actor,tt9814506
5198083,nm2286597,Tom Holland,1968,\N,"writer,miscellaneous","tt2050498,tt0273026,tt11560302"


In [32]:
v

array([1., 2., 3., ..., 0., 0., 0.])

In [47]:
np.sum(v)

72.0

In [48]:
index_ = nnConstVectorSize.index("nm4043618")

In [49]:
index_

1141671

In [50]:
v[1141671]

7.0

In [51]:
qv = np.zeros(nCV)
qv[1141671] = 1

In [53]:
cosim(qv,v)

0.29475317237328164

In [54]:
import time

In [55]:
time.time()

1651504773.408334