In [64]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import random_split
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import regex as re
from scipy.stats import entropy
import threading
import multiprocessing as mp

In [65]:
# importing and cleaning data
df = pd.read_csv("data.csv", on_bad_lines='skip', header=None)
print(df.shape)

# remove everything except numbers from star columns
df[list(range(2, 12))] = df[list(range(2, 12))].apply(lambda col: col.apply(lambda x: int(re.findall(r'\d+', str(x))[0])))

# rename columns
df = df.rename(columns={0: 'movie',
                        1: 'year',
                        2: '.5 stars', 
                        3: '1 stars', 
                        4: '1.5 stars', 
                        5: '2 stars', 
                        6: '2.5 stars',
                        7: '3 stars',
                        8: '3.5 stars',
                        9: '4 stars',
                        10: '4.5 stars',
                        11: '5 stars',
                       })



df.head()

(4557, 12)


Unnamed: 0,movie,year,.5 stars,1 stars,1.5 stars,2 stars,2.5 stars,3 stars,3.5 stars,4 stars,4.5 stars,5 stars
0,Barbie,2023,19316,44885,28668,152621,147414,615296,547411,1128634,406002,936348
1,Parasite,2019,3261,7681,3274,22162,18457,130686,151528,784645,629450,1976278
2,Interstellar,2014,5436,13833,7069,45801,36376,201868,200064,715673,519801,1878988
3,Fight Club,1999,4929,13036,6527,43328,36710,239648,258091,982018,562208,1368304
4,La La Land,2016,14219,43518,15005,104909,60828,313040,242867,747143,397459,1226958


In [66]:
df["totalReviews"] = df.iloc[:, range(2,12)].sum(axis=1)

In [67]:
df.head()

Unnamed: 0,movie,year,.5 stars,1 stars,1.5 stars,2 stars,2.5 stars,3 stars,3.5 stars,4 stars,4.5 stars,5 stars,totalReviews
0,Barbie,2023,19316,44885,28668,152621,147414,615296,547411,1128634,406002,936348,4026595
1,Parasite,2019,3261,7681,3274,22162,18457,130686,151528,784645,629450,1976278,3727422
2,Interstellar,2014,5436,13833,7069,45801,36376,201868,200064,715673,519801,1878988,3624909
3,Fight Club,1999,4929,13036,6527,43328,36710,239648,258091,982018,562208,1368304,3514799
4,La La Land,2016,14219,43518,15005,104909,60828,313040,242867,747143,397459,1226958,3165946


In [68]:
# cast number of stars to floats
df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].astype("float64")

1        3261.0
2        5436.0
3        4929.0
4       14219.0
         ...   
4552      474.0
4553      237.0
4554      119.0
4555     9092.0
4556     3409.0
Name: .5 stars, Length: 4557, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].astype("float64")
1        7681.0
2       13833.0
3       13036.0
4       43518.0
         ...   
4552     1152.0
4553      538.0
4554      311.0
4555     8901.0
4556     6041.0
Name: 1 stars, Length: 4557, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].astype("float64")
1        3274.0
2        7069.0
3        6527.0
4       15005.0
         ...   
4552     1302.0
4553      679.0
4554      369.0
4555     6021.0
4556     4506.0
Name: 1.5 stars, Length: 4557, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compa

In [69]:
df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].div(df.totalReviews, axis=0)

In [70]:
df.set_index("movie", inplace=True)

In [71]:
df_clean = df.iloc[:, range(1,11)]

In [72]:
df_clean.head()

Unnamed: 0_level_0,.5 stars,1 stars,1.5 stars,2 stars,2.5 stars,3 stars,3.5 stars,4 stars,4.5 stars,5 stars
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Barbie,0.004797,0.011147,0.00712,0.037903,0.03661,0.152808,0.135949,0.280295,0.10083,0.232541
Parasite,0.000875,0.002061,0.000878,0.005946,0.004952,0.035061,0.040652,0.210506,0.16887,0.5302
Interstellar,0.0015,0.003816,0.00195,0.012635,0.010035,0.055689,0.055191,0.197432,0.143397,0.518355
Fight Club,0.001402,0.003709,0.001857,0.012327,0.010444,0.068183,0.07343,0.279395,0.159955,0.389298
La La Land,0.004491,0.013746,0.004739,0.033137,0.019213,0.098877,0.076712,0.235994,0.125542,0.387549


In [73]:
total_movies = df.shape[0]

In [74]:
# multithreaded

total_movies = len(df_clean)

# Initialize results matrix
res = np.zeros((total_movies, total_movies))

# Generate all pairs
all_pairs = [(i, j) for i in range(total_movies - 1) for j in range(i + 1, total_movies)]

# Function to compute KL divergence in parallel
def compute_kl(pairs):
    local_res = []
    eps = 1e-10  # Small value to avoid division by zero and log(0)

    for i, j in pairs:
        # Convert rows to numpy arrays
        P = df_clean.iloc[i].to_numpy(dtype=np.float64)
        Q = df_clean.iloc[j].to_numpy(dtype=np.float64)

        # Normalize P and Q to make sure they are valid probability distributions
        P /= P.sum()
        Q /= Q.sum()

        # Clip Q to avoid division by zero
        P = np.clip(P, eps, None)
        Q = np.clip(Q, eps, None)

        # Compute KL divergence using the formula
        kl_divergence = np.sum(P * np.log(P / Q))

        local_res.append((i, j, kl_divergence))
    
    return local_res

# Use multiprocessing Pool
num_workers = 6  # Use all available CPU cores
chunk_size = len(all_pairs) // num_workers
chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]

with mp.Pool(processes=num_workers) as pool:
    # Use tqdm to show progress
    results = []
    with tqdm(total=len(chunks), desc="Computing KL divergence") as pbar:
        for chunk_result in pool.imap_unordered(compute_kl, chunks):
            results.append(chunk_result)
            pbar.update(1)  # Update progress bar

# Update res matrix
for chunk in results:
    for i, j, kl in chunk:
        res[i][j] = kl
        res[j][i] = kl  # Since KL divergence is symmetric in this case

print("KL divergence computation completed.")

Computing KL divergence: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:23<00:00, 13.84s/it]


KL divergence computation completed.


In [75]:
def recommend_movies(res, movie):
    result = []
    rankings = list(enumerate(res[movie]))
    rankings.sort(key = lambda x: x[1])
    rankings = [df_clean.index[x[0]].replace('\xa0', ' ') for x in rankings[1:11]]
    title = df_clean.index[movie].replace('\xa0', ' ')
    return title, rankings

In [83]:
recommend_movies(res, 11)

('The Substance',
 ['Manhattan',
  'Babylon',
  'Uncle Boonmee Who Can Recall His Past Lives',
  'Theorem',
  'Tetsuo: The Iron Man',
  'Eraserhead',
  'Buffalo ’66',
  'Challengers',
  'The Witch',
  'American Beauty'])

In [10]:
res = np.zeros((total_movies, total_movies))

for i in tqdm(range(total_movies - 1)):
    P = df_clean.iloc[i]
    for j in range(i + 1, total_movies):
        Q = df_clean.iloc[j]
        kl_divergence = entropy(P.to_numpy(), Q.to_numpy())
        res[i][j] = kl_divergence
        res[j][i] = kl_divergence

  2%|███▍                                                                                                                                                    | 55/2437 [00:26<18:59,  2.09it/s]


KeyboardInterrupt: 

In [111]:
letterboxd = pd.read_csv("letterboxd.csv", on_bad_lines='skip')
print(letterboxd.shape)
print(letterboxd.duplicated().any())
letterboxd.head(10)

(324, 24)
False


Unnamed: 0,movie,year,Watched by x members,Appears in x lists,Liked by x members,No in Top 250,half stars,one stars,onehalf stars,two stars,...,fourhalf stars,five stars,tagline,description,cast,crew,details,genres,themes,url
0,Barbie,2023,5112770,519663,2113121,,19336,44937,28705,152802,...,406192,936742,She’s everything. He’s just Ken.,Barbie and Ken are having the time of their li...,"['Margot Robbie', 'Ryan Gosling', 'America Fer...","{'Director': ['Greta Gerwig'], 'Producers': ['...","{'Studios': ['LuckyChap Entertainment', 'Heyda...","['Comedy', 'Adventure']","['Humanity and the world around us', 'Crude hu...",https://letterboxd.com/film/barbie/
1,Parasite,2019,4932332,642601,2693669,9.0,3263,7688,3279,22196,...,630230,1978201,Act like you own the place.,"All unemployed, Ki-taek’s family takes peculia...","['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...","{'Director': ['Bong Joon Ho'], 'Producers': ['...","{'Studio': ['Barunson E&A'], 'Country': ['Sout...","['Comedy', 'Thriller', 'Drama']","['Humanity and the world around us', 'Intense ...",https://letterboxd.com/film/parasite-2019/
2,Interstellar,2014,4956733,613257,2372421,53.0,5444,13846,7071,45823,...,520320,1881004,Mankind was born on Earth. It was never meant ...,The adventures of a group of explorers who mak...,"['Matthew McConaughey', 'Anne Hathaway', 'Mich...","{'Director': ['Christopher Nolan'], 'Producers...","{'Studios': ['Legendary Pictures', 'Syncopy', ...","['Science Fiction', 'Drama', 'Adventure']","['Monsters, aliens, sci-fi and the apocalypse'...",https://letterboxd.com/film/interstellar/
3,Fight Club,1999,4979960,532959,2289970,179.0,4937,13048,6534,43359,...,562688,1369448,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,"['Edward Norton', 'Brad Pitt', 'Helena Bonham ...","{'Director': ['David Fincher'], 'Producers': [...","{'Studios': ['Fox 2000 Pictures', 'Regency Ent...",['Drama'],"['Intense violence and sexual transgression', ...",https://letterboxd.com/film/fight-club/
4,La La Land,2016,4328732,611410,1950863,,14230,43529,15017,104979,...,397774,1227934,Here’s to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...","['Ryan Gosling', 'Emma Stone', 'John Legend', ...","{'Director': ['Damien Chazelle'], 'Producers':...","{'Studios': ['Summit Entertainment', 'Gilbert ...","['Drama', 'Comedy', 'Music', 'Romance']","['Song and dance', 'Humanity and the world aro...",https://letterboxd.com/film/la-la-land/
5,Everything Everywhere All at Once,2022,3745515,585662,1842110,170.0,12933,28824,13727,67081,...,437356,1379382,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,"['Michelle Yeoh', 'Stephanie Hsu', 'Ke Huy Qua...","{'Directors': ['Daniel Scheinert', 'Daniel Kwa...","{'Studios': ['IAC Films', 'AGBO', 'Ley Line En...","['Science Fiction', 'Adventure', 'Comedy', 'Ac...","['Humanity and the world around us', 'Moving r...",https://letterboxd.com/film/everything-everywh...
6,Oppenheimer,2023,3626668,561769,1404062,,6728,16286,8882,55724,...,570989,900948,The world forever changes.,The story of J. Robert Oppenheimer’s role in t...,"['Cillian Murphy', 'Emily Blunt', 'Matt Damon'...","{'Director': ['Christopher Nolan'], 'Producers...","{'Studios': ['Syncopy', 'Universal Pictures', ...","['History', 'Drama']","['Humanity and the world around us', 'Politics...",https://letterboxd.com/film/oppenheimer-2023/
7,Whiplash,2014,3859871,474417,1817759,42.0,2957,7263,3617,23994,...,542515,1249477,The road to greatness can take you to the edge.,"Under the direction of a ruthless instructor, ...","['Miles Teller', 'J.K. Simmons', 'Paul Reiser'...","{'Director': ['Damien Chazelle'], 'Producers':...","{'Studios': ['Bold Films', 'Blumhouse Producti...","['Drama', 'Music']","['Moving relationship stories', 'Student comin...",https://letterboxd.com/film/whiplash-2014/
8,Pulp Fiction,1994,4322926,520513,1881426,202.0,5685,15038,6822,47025,...,453478,1100012,You won’t know the facts until you’ve seen the...,"A burger-loving hit man, his philosophical par...","['John Travolta', 'Samuel L. Jackson', 'Uma Th...","{'Director': ['Quentin Tarantino'], 'Producer'...","{'Studios': ['Miramax', 'A Band Apart', 'Jerse...","['Crime', 'Thriller']","['Crime, drugs and gangsters', 'Intense violen...",https://letterboxd.com/film/pulp-fiction/
9,Joker,2019,4893686,347111,1912699,,12568,34145,21479,117049,...,385490,768481,Put on a happy face.,"During the 1980s, a failed stand-up comedian i...","['Joaquin Phoenix', 'Robert De Niro', 'Zazie B...","{'Director': ['Todd Phillips'], 'Producers': [...","{'Studios': ['Warner Bros. Pictures', 'Joint E...","['Crime', 'Drama', 'Thriller']","['Intense violence and sexual transgression', ...",https://letterboxd.com/film/joker-2019/
