In [None]:
import math
from itertools import combinations
from pprint import pprint

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import normalize

In [None]:
# Teleport (random-jump) probability: pagerank() weights the link-following
# term by (1 - alpha) and adds alpha / n to every node on each step.
alpha = 0.15
# Number of power-iteration steps performed by pagerank().
iterations = 10

In [None]:
def tokenize(names):
    """Split a comma-separated string of names into normalised tokens.

    Every field is stripped of surrounding whitespace and lower-cased.
    Fields that are empty after stripping (from an empty string, a trailing
    comma, or a doubled comma such as ``"a,,b"``) are dropped so that they
    cannot show up downstream as a bogus ``''`` name.

    Parameters
    ----------
    names : str
        Comma-separated names, e.g. ``"Jane Doe, John Roe"``.

    Returns
    -------
    list of str
        The cleaned names in their original order (duplicates are kept).
    """
    stripped = (field.strip() for field in names.split(','))
    return [field.lower() for field in stripped if field]

In [None]:
def pagerank(B, teleport=None, n_iter=None):
    """Approximate PageRank scores by fixed-count power iteration.

    Starting from the uniform vector, repeatedly applies

        r <- (1 - teleport) * (B.T @ r) + teleport / n

    Parameters
    ----------
    B : (n, n) array or scipy sparse matrix
        Transition weights; row ``i`` holds the out-going weights of node
        ``i``. The function does not normalise ``B`` itself. Rows that sum to
        less than 1 (e.g. all-zero rows) make the returned scores sum to less
        than 1.
    teleport : float, optional
        Random-jump probability. Defaults to the module-level ``alpha``.
    n_iter : int, optional
        Number of update steps. Defaults to the module-level ``iterations``.

    Returns
    -------
    numpy.ndarray of shape (n,)
        Score of every node after ``n_iter`` steps.

    Raises
    ------
    ValueError
        If ``B`` is not square.
    """
    # Fall back to the notebook-wide configuration so pagerank(B) keeps
    # behaving exactly as before.
    if teleport is None:
        teleport = alpha
    if n_iter is None:
        n_iter = iterations

    n = B.shape[0]
    if B.shape != (n, n):
        raise ValueError(f"B must be a square matrix, got shape {B.shape}")

    A = B.T
    r = np.ones((n,)) / n
    jump = teleport / n

    for _ in range(n_iter):
        # Parenthesised so only the length-n vector is scaled; without the
        # parentheses `*` binds first and the whole matrix is rescaled on
        # every step.
        r = (1 - teleport) * (A @ r) + jump

    return r

In [None]:
# Read the catalogue and keep it as a plain nested dict
# ({column: {row_index: value}}), then list the available columns.
data = pd.read_csv('NetFlix.csv').to_dict()
pprint(data.keys())

In [None]:

# --- Build the actor co-appearance graph ------------------------------------
# One list of actor names per title. Only string cells are used (non-string
# cells are presumably missing casts — verify against the CSV).
# dict.fromkeys removes names repeated within a single cast while keeping
# first-seen order, so a duplicated credit cannot create a self-loop edge.
actors = [
    list(dict.fromkeys(tokenize(cast)))
    for cast in data['cast'].values()
    if isinstance(cast, str)
]

# Sorting gives each actor the same index on every run; the iteration order
# of a set of strings is not stable across interpreter sessions.
actors_all = sorted({name for movie in actors for name in movie})
actors_map = {name: idx for idx, name in enumerate(actors_all)}
actors_map_i = dict(enumerate(actors_all))
print(len(actors_all))

# pairs[(i, j)] with i < j counts the titles in which actors i and j both
# appear.
pairs = {}

for movie in actors:
    for first, second in combinations(movie, 2):
        i, j = sorted((actors_map[first], actors_map[second]))
        pairs[(i, j)] = pairs.get((i, j), 0) + 1

# Emit every undirected edge in both directions so the matrix is symmetric.
row = []
col = []
val = []

for (i, j), weight in pairs.items():
    row += [i, j]
    col += [j, i]
    val += [weight, weight]

# An explicit shape keeps actors without any co-star in the matrix (as
# all-zero rows); otherwise the size would be inferred from the largest index
# that happens to have an edge and could be smaller than len(actors_all).
B = csr_matrix((val, (row, col)), shape=(len(actors_all), len(actors_all)))

# Row-normalise so each non-empty row sums to 1.
B = normalize(B, norm='l1', axis=1)

Solving PageRank as a system of linear equations, $(I - (1-\alpha)B^\top)\,r = \frac{\alpha}{n}\mathbf{1}$:

In [None]:
# Solve (I - (1 - alpha) * B^T) r = (alpha / n) * 1 without densifying B:
# subtracting a sparse matrix from np.eye(n) would allocate a dense n x n
# matrix, which does not scale with the number of actors.
n_nodes = B.shape[0]
system = sparse.eye(n_nodes, format='csc') - (1 - alpha) * B.T.tocsc()
# spsolve returns a 1-D vector for a 1-D right-hand side; reshape to a column
# so r keeps the (n, 1) shape this cell has always produced.
r = spsolve(system, np.full(n_nodes, alpha / n_nodes)).reshape(-1, 1)
print(r)

Solving PageRank with matrix inversion, $r = \frac{\alpha}{n}\,(I - (1-\alpha)B^\top)^{-1}\mathbf{1}$:

In [None]:
# `n` must be defined here: the only other `n` is a local inside pagerank(),
# so on a fresh kernel this cell would otherwise raise NameError.
n = B.shape[0]
# Closed form r = (I - (1 - alpha) * B^T)^{-1} 1 * (alpha / n).
# B is sparse, so convert its transpose to a dense ndarray explicitly; this
# builds a dense n x n matrix and is only practical for small graphs.
r = np.linalg.inv(np.eye(n) - (1 - alpha) * B.T.toarray()) @ np.ones((n, 1)) * (alpha / n)

In [None]:
# Rank every actor with the iterative solver.
r = pagerank(B)

# Indices of the ten highest scores, best first: sort ascending, reverse,
# then keep the leading ten.
most_i = np.argsort(r)[::-1][:10]
pprint([(actors_map_i[idx], r[idx]) for idx in most_i])