In [64]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import random_split
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import regex as re
from scipy.stats import entropy
import threading
import multiprocessing as mp

In [65]:
# importing and cleaning data
df = pd.read_csv("data.csv", on_bad_lines='skip', header=None)
print(df.shape)

# remove everything except numbers from star columns
df[list(range(2, 12))] = df[list(range(2, 12))].apply(lambda col: col.apply(lambda x: int(re.findall(r'\d+', str(x))[0])))

# rename columns
df = df.rename(columns={0: 'movie',
                        1: 'year',
                        2: '.5 stars', 
                        3: '1 stars', 
                        4: '1.5 stars', 
                        5: '2 stars', 
                        6: '2.5 stars',
                        7: '3 stars',
                        8: '3.5 stars',
                        9: '4 stars',
                        10: '4.5 stars',
                        11: '5 stars',
                       })



df.head()

(4557, 12)


Unnamed: 0,movie,year,.5 stars,1 stars,1.5 stars,2 stars,2.5 stars,3 stars,3.5 stars,4 stars,4.5 stars,5 stars
0,Barbie,2023,19316,44885,28668,152621,147414,615296,547411,1128634,406002,936348
1,Parasite,2019,3261,7681,3274,22162,18457,130686,151528,784645,629450,1976278
2,Interstellar,2014,5436,13833,7069,45801,36376,201868,200064,715673,519801,1878988
3,Fight Club,1999,4929,13036,6527,43328,36710,239648,258091,982018,562208,1368304
4,La La Land,2016,14219,43518,15005,104909,60828,313040,242867,747143,397459,1226958


In [66]:
df["totalReviews"] = df.iloc[:, range(2,12)].sum(axis=1)

In [67]:
df.head()

Unnamed: 0,movie,year,.5 stars,1 stars,1.5 stars,2 stars,2.5 stars,3 stars,3.5 stars,4 stars,4.5 stars,5 stars,totalReviews
0,Barbie,2023,19316,44885,28668,152621,147414,615296,547411,1128634,406002,936348,4026595
1,Parasite,2019,3261,7681,3274,22162,18457,130686,151528,784645,629450,1976278,3727422
2,Interstellar,2014,5436,13833,7069,45801,36376,201868,200064,715673,519801,1878988,3624909
3,Fight Club,1999,4929,13036,6527,43328,36710,239648,258091,982018,562208,1368304,3514799
4,La La Land,2016,14219,43518,15005,104909,60828,313040,242867,747143,397459,1226958,3165946


In [68]:
# cast number of stars to floats
df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].astype("float64")

1        3261.0
2        5436.0
3        4929.0
4       14219.0
         ...   
4552      474.0
4553      237.0
4554      119.0
4555     9092.0
4556     3409.0
Name: .5 stars, Length: 4557, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].astype("float64")
1        7681.0
2       13833.0
3       13036.0
4       43518.0
         ...   
4552     1152.0
4553      538.0
4554      311.0
4555     8901.0
4556     6041.0
Name: 1 stars, Length: 4557, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].astype("float64")
1        3274.0
2        7069.0
3        6527.0
4       15005.0
         ...   
4552     1302.0
4553      679.0
4554      369.0
4555     6021.0
4556     4506.0
Name: 1.5 stars, Length: 4557, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compa

In [69]:
df.iloc[:, range(2,12)] = df.iloc[:, range(2,12)].div(df.totalReviews, axis=0)

In [70]:
df.set_index("movie", inplace=True)

In [71]:
df_clean = df.iloc[:, range(1,11)]

In [72]:
df_clean.head()

Unnamed: 0_level_0,.5 stars,1 stars,1.5 stars,2 stars,2.5 stars,3 stars,3.5 stars,4 stars,4.5 stars,5 stars
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Barbie,0.004797,0.011147,0.00712,0.037903,0.03661,0.152808,0.135949,0.280295,0.10083,0.232541
Parasite,0.000875,0.002061,0.000878,0.005946,0.004952,0.035061,0.040652,0.210506,0.16887,0.5302
Interstellar,0.0015,0.003816,0.00195,0.012635,0.010035,0.055689,0.055191,0.197432,0.143397,0.518355
Fight Club,0.001402,0.003709,0.001857,0.012327,0.010444,0.068183,0.07343,0.279395,0.159955,0.389298
La La Land,0.004491,0.013746,0.004739,0.033137,0.019213,0.098877,0.076712,0.235994,0.125542,0.387549


In [73]:
total_movies = df.shape[0]

In [74]:
# multithreaded

total_movies = len(df_clean)

# Initialize results matrix
res = np.zeros((total_movies, total_movies))

# Generate all pairs
all_pairs = [(i, j) for i in range(total_movies - 1) for j in range(i + 1, total_movies)]

# Function to compute KL divergence in parallel
def compute_kl(pairs):
    local_res = []
    eps = 1e-10  # Small value to avoid division by zero and log(0)

    for i, j in pairs:
        # Convert rows to numpy arrays
        P = df_clean.iloc[i].to_numpy(dtype=np.float64)
        Q = df_clean.iloc[j].to_numpy(dtype=np.float64)

        # Normalize P and Q to make sure they are valid probability distributions
        P /= P.sum()
        Q /= Q.sum()

        # Clip Q to avoid division by zero
        P = np.clip(P, eps, None)
        Q = np.clip(Q, eps, None)

        # Compute KL divergence using the formula
        kl_divergence = np.sum(P * np.log(P / Q))

        local_res.append((i, j, kl_divergence))
    
    return local_res

# Use multiprocessing Pool
num_workers = 6  # Use all available CPU cores
chunk_size = len(all_pairs) // num_workers
chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]

with mp.Pool(processes=num_workers) as pool:
    # Use tqdm to show progress
    results = []
    with tqdm(total=len(chunks), desc="Computing KL divergence") as pbar:
        for chunk_result in pool.imap_unordered(compute_kl, chunks):
            results.append(chunk_result)
            pbar.update(1)  # Update progress bar

# Update res matrix
for chunk in results:
    for i, j, kl in chunk:
        res[i][j] = kl
        res[j][i] = kl  # Since KL divergence is symmetric in this case

print("KL divergence computation completed.")

Computing KL divergence: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [01:23<00:00, 13.84s/it]


KL divergence computation completed.


In [75]:
def recommend_movies(res, movie):
    result = []
    rankings = list(enumerate(res[movie]))
    rankings.sort(key = lambda x: x[1])
    rankings = [df_clean.index[x[0]].replace('\xa0', ' ') for x in rankings[1:11]]
    title = df_clean.index[movie].replace('\xa0', ' ')
    return title, rankings

In [81]:
for i in range(0, 100):
    print(recommend_movies(res, i))

('Barbie', ['Spider-Man: No Way Home', 'Fantasia', 'Lady Bird', 'Titanic', 'Halloween', 'The Texas Chain Saw Massacre', 'The Hunger Games: Catching Fire', 'Scott Pilgrim vs. the World', 'Yellow Submarine', 'Midsommar'])
('Parasite', ['The Shawshank Redemption', 'The Godfather Part II', 'The Beatles: Get Back', 'City of God', '12 Angry Men', 'The Dark Knight', 'The Godfather', 'Seven Samurai', 'Neon Genesis Evangelion', 'Berserk'])
('Interstellar', ['The Lord of the Rings: The Return of the King', 'Banana Fish', 'Normal People', 'Neon Genesis Evangelion: The End of Evangelion', 'The Godfather', 'Spirited Away', 'HOMECOMING: A film by Beyoncé', 'Serial Experiments Lain', 'Portrait of a Lady on Fire', 'The Lord of the Rings: The Fellowship of the Ring'])
('Fight Club', ['Pulp Fiction', 'Fantastic Mr. Fox', 'Eternal Sunshine of the Spotless Mind', 'Singin’ in the Rain', 'Before Sunset', 'Before Sunrise', 'Good Will Hunting', 'The Wrong Trousers', 'Dead Poets Society', 'Life Is Beautiful'])

In [10]:
res = np.zeros((total_movies, total_movies))

for i in tqdm(range(total_movies - 1)):
    P = df_clean.iloc[i]
    for j in range(i + 1, total_movies):
        Q = df_clean.iloc[j]
        kl_divergence = entropy(P.to_numpy(), Q.to_numpy())
        res[i][j] = kl_divergence
        res[j][i] = kl_divergence

  2%|███▍                                                                                                                                                    | 55/2437 [00:26<18:59,  2.09it/s]


KeyboardInterrupt: 