# **Using k-Nearest Neighbors with Pearson Correlation and Jaccard Similarity to get movie-movie similarity**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, f1_score
import warnings
warnings.filterwarnings('ignore')
# datadir = '/kaggle/input/content/dataset'
datadir = '../../ml1m/content/dataset/'
support_dir = '.'

seed = 42
np.random.seed(seed)
# random.seed(seed)
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)

## Load data

In [2]:
movies_train = pd.read_csv(os.path.join(datadir, 'movies_train.dat'), engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')
movies_train['genre'] = movies_train['genre'].apply(lambda x: x.split('|'))

movies_train['year'] = movies_train['title'].apply(lambda x: x[-5:-1]).astype('int')

movies_train['title'] = movies_train['title'].apply(lambda x: x[:-7])

movies_train.head()

Unnamed: 0_level_0,title,genre,year
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1650,Washington Square,[Drama],1997
185,"Net, The","[Sci-Fi, Thriller]",1995
1377,Batman Returns,"[Action, Adventure, Comedy, Crime]",1992
3204,"Boys from Brazil, The",[Thriller],1978
1901,Dear Jesse,[Documentary],1997


In [3]:
ratings = pd.read_csv(os.path.join(datadir, 'ratings.dat'), engine='python',
                          sep='::', names=['userid', 'movieid', 'rating', 'timestamp'])
ratings['movieid'] = ratings['movieid'].astype('category')
ratings['userid'] = ratings['userid'].astype('category')
ratings.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
movies_users_pivot = ratings.pivot(index='userid', columns='movieid', values='rating')

movies_users_pivot

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


Some movies may not have ratings. We will exclude these movies from our network.

In [5]:
print("Total movies in training set: ", len(movies_train.index))
print("Total movies in ratings set: ", len(movies_users_pivot.columns))
print("Total movies in ratings and not in train: ", len(movies_users_pivot.columns) - len(movies_users_pivot.columns.intersection(movies_train.index)))
print("Total movies in train and not in ratings: ", len(movies_train.index) - len(movies_users_pivot.columns.intersection(movies_train.index)))

Total movies in training set:  3106
Total movies in ratings set:  3706
Total movies in ratings and not in train:  743
Total movies in train and not in ratings:  143


In [13]:
# keep only training movies
movies_users_pivot = movies_users_pivot[movies_users_pivot.columns.intersection(movies_train.index)]
movies_users_pivot

movieid,1,2,3,4,5,6,7,9,10,11,...,3937,3938,3939,3943,3944,3945,3947,3948,3949,3950
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,3.0,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


## Construct a movie-movie similarity network

For each pair of movies, we calculate the two ratings-based similarity measures: Pearson correlation coefficient, and Jaccard  index of common users. We then construct a network with the movies as nodes, and weighted edges representing the similarity between the movies.

### 1. The Pearson correlation coefficient

The Pearson correlation coefficient of the ratings given to two movies $i$ and $j$ by the same users is defined as
$$\rho_{ij} = \frac{\sum_{u \in U_{ij}} (r_{ui} - \bar{r}_i)(r_{uj} - \bar{r}_j)}{\sqrt{\sum_{u \in U_{ij}} (r_{ui} - \bar{r}_i)^2} \sqrt{\sum_{u \in U_{ij}} (r_{uj} - \bar{r}_j)^2}},$$
where $U_{ij}$ is the set of users who have rated both movies $i$ and $j$, and $\bar{r}_i = \frac{1}{|U_{m}|} \sum_{u \in U{i}}{r_{ui}}$ is the average rating of movie $i$.

Function that takes inputs of two movie IDs and returns the Pearson correlation coefficient between the two movies using the ratings in the users-movies pivot table

In [15]:
def pearson_id(i, j) -> float:
    """
    Calculate the Pearson correlation coefficient between movies i and j
    
    Parameters
    ----------
    i : int
        Id of movie i
    j : int
        Id of movie j

    Returns
    -------
    Pearson correlation coefficient between movies i and j
    """

    # Get the average rating for movie i
    r_avg_i = movies_users_pivot[i].mean()
    # Get the average rating for movie j
    r_avg_j = movies_users_pivot[j].mean()
    
    # Get the set of users who have rated both movies i and j
    U_i = movies_users_pivot[i].dropna().index.to_numpy()
    U_j = movies_users_pivot[j].dropna().index.to_numpy()
    U_ij = np.intersect1d(U_i, U_j)

    if U_ij.shape[0] == 0:
        return -1.0
    # print(U_ij)

    # ratings of each user in U_ij for movie i
    r_ui = movies_users_pivot.loc[U_ij, i].to_numpy()
    # ratings of each user in U_ij for movie j
    r_uj = movies_users_pivot.loc[U_ij, j].to_numpy()

    # Calculate the numerator of the Pearson correlation coefficient
    numerator = np.sum(np.multiply(r_ui - r_avg_i, r_uj - r_avg_j))    
    # Calculate the denominator of the Pearson correlation coefficient
    denominator = np.sqrt(np.multiply(np.sum((r_ui - r_avg_i)**2), np.sum((r_uj - r_avg_j)**2)))
    
    # Return the Pearson correlation coefficient
    return numerator / denominator

Function that takes inputs of two movie's ratings arrays and returns the Pearson correlation coefficient between the two movies.

In [22]:
def pearson(ratings_i, ratings_j) -> float:
    """Calculate the Pearson correlation coefficient between movies i and j

    Parameter
    ---------
    ratings_i : ArrayLike
        Array of ratings of movie i
    ratings_j : ArrayLike
        Array of ratings of movie j

    Returns
    -------
    Pearson correlation coefficient between movies i and j
    
    """
    # Get the ratings of movie i not equal to NaN
    if ratings_i.shape[0] != movies_users_pivot.index.shape[0]:
        U_i = np.where(ratings_i != 0.0)
    else:
        U_i = np.where(~np.isnan(ratings_i))
    # Get the average rating for movie i
    r_avg_i = np.mean(ratings_i[U_i])

    # Get the ratings of movie j not equal to NaN
    if ratings_j.shape[0] != movies_users_pivot.index.shape[0]:
        U_j = np.where(ratings_j != 0.0)
    else:
        U_j = np.where(~np.isnan(ratings_j))
    # Get the average rating for movie j
    r_avg_j = np.mean(ratings_j[U_j])

    # Get the set of users who have rated both movies i and j
    U_ij = np.intersect1d(U_i, U_j)

    if U_ij.shape[0] == 0:
        return -1.0
    
    # Get the ratings of movie i for users in U_ij
    ratings_i = ratings_i[U_ij]
    ratings_j = ratings_j[U_ij]

    # Calculate the numerator of the Pearson correlation coefficient
    numerator = np.sum(np.multiply(ratings_i - r_avg_i, ratings_j - r_avg_j))
    # Calculate the denominator of the Pearson correlation coefficient
    denominator = np.sqrt(np.multiply(np.sum((ratings_i - r_avg_i)**2), np.sum((ratings_j - r_avg_j)**2)))

    # Return the Pearson correlation coefficient
    return numerator / denominator


Test 2 functions and compare.

In [24]:
samples = movies_train.sample(2).index.tolist()
print(samples)
pearson_id(*samples), pearson(movies_users_pivot[samples[0]].to_numpy(), movies_users_pivot[samples[1]].to_numpy())

[1376, 2178]


(-0.10287261715094519, -0.10287261715094519)

Construct the correlation matrix for all pairs of movies in the dataset.

In [None]:
# keep only training movies that have ratings
pearson_network = movies_users_pivot[movies_users_pivot.columns.intersection(movies_train.index)].fillna(0).corr(pearson)
pearson_network = pearson_network.fillna(-1.0)
pearson_network

movieid,1,2,3,4,5,6,7,9,10,11,...,3937,3938,3939,3943,3944,3945,3947,3948,3949,3950
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.262649,0.146536,0.109375,0.170156,0.168087,0.189333,0.045705,0.215653,0.249066,...,0.057543,0.027730,0.014994,0.039913,0.002483,0.054038,-0.007335,0.132009,0.076941,0.050559
2,0.262649,1.000000,0.169586,0.111616,0.196561,0.137334,0.193658,0.126871,0.302042,0.177533,...,0.102869,0.023772,0.016038,0.025345,0.004734,0.055814,0.047608,0.108027,0.075658,0.060941
3,0.146536,0.169586,1.000000,0.158659,0.268062,0.095834,0.238816,0.100622,0.164252,0.208919,...,0.071665,0.061954,0.028327,0.007877,0.058906,0.032291,0.028135,0.104775,0.050016,0.039715
4,0.109375,0.111616,0.158659,1.000000,0.247529,0.071081,0.187311,0.042133,0.082035,0.199820,...,0.005687,-0.009365,-0.008756,0.037847,0.048747,-0.008854,0.011165,0.067022,0.064363,0.008411
5,0.170156,0.196561,0.268062,0.247529,1.000000,0.075012,0.264749,0.116465,0.176002,0.253253,...,0.044932,0.020287,0.014867,0.002233,0.078781,0.032615,-0.002865,0.107924,0.048976,0.033442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,0.054038,0.055814,0.032291,-0.008854,0.032615,-0.010224,0.020188,0.012223,0.017353,0.007014,...,0.016351,0.003342,0.010356,0.055192,0.100569,1.000000,0.012443,0.090117,0.057280,0.051354
3947,-0.007335,0.047608,0.028135,0.011165,-0.002865,0.057124,0.029058,0.039280,0.062022,0.017452,...,0.182014,-0.005336,-0.004989,0.089518,0.006972,0.012443,1.000000,0.048989,0.089883,0.088407
3948,0.132009,0.108027,0.104775,0.067022,0.107924,0.112623,0.106178,0.052270,0.117866,0.144558,...,0.090009,0.054492,0.045063,0.152830,0.044625,0.090117,0.048989,1.000000,0.275060,0.145673
3949,0.076941,0.075658,0.050016,0.064363,0.048976,0.130746,0.063735,0.075553,0.061091,0.066213,...,0.060597,0.034401,0.024902,0.225868,0.093686,0.057280,0.089883,0.275060,1.000000,0.290993


In [11]:
pearson_network.to_csv('pearson_correlation_network.csv')

In [None]:
# # load precomputed pearson correlation network
# pearson_network = pd.read_csv(os.path.join(support_dir, 'pearson_correlation_network.csv'), index_col=0)

# pearson_network.columns.name = 'movieid'
# pearson_network.index = pearson_network.index.astype('int')
# pearson_network.columns = pearson_network.columns.astype('int')

# pearson_network

movieid,1,2,3,4,5,6,7,9,10,11,...,3937,3938,3939,3943,3944,3945,3947,3948,3949,3950
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.262649,0.146536,0.109375,0.170156,0.168087,0.189333,0.045705,0.215653,0.249066,...,0.057543,0.027730,0.014994,0.039913,0.002483,0.054038,-0.007335,0.132009,0.076941,0.050559
2,0.262649,1.000000,0.169586,0.111616,0.196561,0.137334,0.193658,0.126871,0.302042,0.177533,...,0.102869,0.023772,0.016038,0.025345,0.004734,0.055814,0.047608,0.108027,0.075658,0.060941
3,0.146536,0.169586,1.000000,0.158659,0.268062,0.095834,0.238816,0.100622,0.164252,0.208919,...,0.071665,0.061954,0.028327,0.007877,0.058906,0.032291,0.028135,0.104775,0.050016,0.039715
4,0.109375,0.111616,0.158659,1.000000,0.247529,0.071081,0.187311,0.042133,0.082035,0.199820,...,0.005687,-0.009365,-0.008756,0.037847,0.048747,-0.008854,0.011165,0.067022,0.064363,0.008411
5,0.170156,0.196561,0.268062,0.247529,1.000000,0.075012,0.264749,0.116465,0.176002,0.253253,...,0.044932,0.020287,0.014867,0.002233,0.078781,0.032615,-0.002865,0.107924,0.048976,0.033442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,0.054038,0.055814,0.032291,-0.008854,0.032615,-0.010224,0.020188,0.012223,0.017353,0.007014,...,0.016351,0.003342,0.010356,0.055192,0.100569,1.000000,0.012443,0.090117,0.057280,0.051354
3947,-0.007335,0.047608,0.028135,0.011165,-0.002865,0.057124,0.029058,0.039280,0.062022,0.017452,...,0.182014,-0.005336,-0.004989,0.089518,0.006972,0.012443,1.000000,0.048989,0.089883,0.088407
3948,0.132009,0.108027,0.104775,0.067022,0.107924,0.112623,0.106178,0.052270,0.117866,0.144558,...,0.090009,0.054492,0.045063,0.152830,0.044625,0.090117,0.048989,1.000000,0.275060,0.145673
3949,0.076941,0.075658,0.050016,0.064363,0.048976,0.130746,0.063735,0.075553,0.061091,0.066213,...,0.060597,0.034401,0.024902,0.225868,0.093686,0.057280,0.089883,0.275060,1.000000,0.290993


### 2. The Jaccard index

The Jaccard index of common users, which is the fraction of users who rated both $i$ and $j$ to all users who rated $i$ and/or $j$, is defined as:

<!-- intersection over union -->
$$\phi_{ij} = \frac{|U_{ij}|}{|U_{i} \cup U_{j}|}.$$


The function that takes inputs of two movie IDs, and returns the Jaccard index between the two movies using the ratings in the users-movies pivot table.

In [25]:
def jaccard_id(i, j) -> float:
    """
    Calculate the Jaccard similarity coefficient between movies i and j

    Parameters
    ----------
    i : int
        Id of movie i
    j : int
        Id of movie j

    Returns
    -------
    Jaccard similarity coefficient between movies i and j
    """
    # Get the set of users who have rated both movies i and j
    U_i = movies_users_pivot[i].dropna().index.to_numpy()
    U_j = movies_users_pivot[j].dropna().index.to_numpy()
    U_ij = np.intersect1d(U_i, U_j)
    union_ij = np.union1d(U_i, U_j)

    # Calculate the Jaccard similarity coefficient
    return U_ij.shape[0] / union_ij.shape[0]

The function that takes inputs of two movie's ratings arrays and returns the Jaccard index between the two movies.

In [26]:
def jaccard(ratings_i, ratings_j) -> float:
    """Calculate the Jaccard similarity coefficient between movies i and j

    Parameter
    ---------
    ratings_i : ArrayLike
        Array of ratings of movie i
    ratings_j : ArrayLike
        Array of ratings of movie j

    Returns
    -------
    Jaccard similarity coefficient between movies i and j
    
    """
    # Get the ratings of movie i not equal to 0
    U_i = np.where(ratings_i != 0)
    # Get the ratings of movie j not equal to 0
    U_j = np.where(ratings_j != 0)

    # Get the set of users who have rated both movies i and j
    U_ij = np.intersect1d(U_i, U_j)

    # Get the set of users who have rated either movie i or j
    union_ij = np.union1d(U_i, U_j)
    
    return U_ij.shape[0] / union_ij.shape[0]


Test 2 functions and compare.

In [29]:
samples = movies_train.sample(2).index.tolist()
# samples = [1, 2]
print(samples)
jaccard_id(*samples), jaccard(movies_users_pivot[samples[0]].fillna(0).to_numpy(), movies_users_pivot[samples[1]].fillna(0).to_numpy())

[3668, 191]


(0.04611650485436893, 0.04611650485436893)

Construct the Jaccard index matrix for all pairs of movies in the dataset.

In [None]:
jaccard_network = movies_users_pivot[movies_users_pivot.columns.intersection(movies_train.index)].fillna(0).corr(jaccard) # fill NaN with 0 because pandas auto remove NaN when calculating correlation
jaccard_network

movieid,1,2,3,4,5,6,7,9,10,11,...,3937,3938,3939,3943,3944,3945,3947,3948,3949,3950
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.214692,0.126543,0.052459,0.098611,0.204872,0.140864,0.027346,0.221673,0.256566,...,0.032198,0.007663,0.006702,0.023070,0.001921,0.014840,0.008992,0.172785,0.070594,0.015729
2,0.214692,1.000000,0.149123,0.068712,0.138128,0.163005,0.163655,0.064987,0.257120,0.180395,...,0.063613,0.008310,0.008322,0.025740,0.002825,0.021978,0.021622,0.135901,0.079484,0.023035
3,0.126543,0.149123,1.000000,0.105802,0.187117,0.117415,0.195402,0.064220,0.148865,0.176791,...,0.062392,0.024341,0.016129,0.026834,0.008282,0.019569,0.026975,0.111111,0.068306,0.023077
4,0.052459,0.068712,0.105802,1.000000,0.167920,0.058151,0.123435,0.050193,0.059059,0.099634,...,0.016667,0.000000,0.000000,0.031008,0.005618,0.004717,0.013514,0.054137,0.060403,0.013575
5,0.098611,0.138128,0.187117,0.167920,1.000000,0.081365,0.191153,0.081522,0.122275,0.159686,...,0.043584,0.012539,0.012579,0.015544,0.009934,0.030395,0.008621,0.083255,0.056338,0.020408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,0.014840,0.021978,0.019569,0.004717,0.030395,0.008205,0.012121,0.021127,0.014161,0.011278,...,0.017143,0.014493,0.029851,0.069231,0.083333,1.000000,0.010309,0.026077,0.029674,0.043011
3947,0.008992,0.021622,0.026975,0.013514,0.008621,0.019467,0.019881,0.019481,0.021668,0.013979,...,0.091954,0.000000,0.000000,0.078571,0.015873,0.010309,1.000000,0.025727,0.040580,0.058252
3948,0.172785,0.135901,0.111111,0.054137,0.083255,0.143401,0.115807,0.032120,0.143791,0.166154,...,0.053911,0.014840,0.014857,0.070391,0.009270,0.026077,0.025727,1.000000,0.191011,0.048055
3949,0.070594,0.079484,0.068306,0.060403,0.056338,0.097970,0.074753,0.043702,0.072907,0.075623,...,0.045238,0.015337,0.012270,0.120448,0.022876,0.029674,0.040580,0.191011,1.000000,0.122257


In [17]:
jaccard_network.loc[samples[0], samples[1]], jaccard_id(*samples)

(0.03636363636363636, 0.03636363636363636)

Save the similarity matrices to files.

In [18]:
jaccard_network.to_csv('jaccard_correlation_network.csv')

In [None]:
# # Load precomputed jaccard correlation network

# jaccard_network = pd.read_csv(os.path.join(support_dir, 'jaccard_correlation_network.csv'), index_col=0)

# jaccard_network.columns.name = 'movieid'
# jaccard_network.index = jaccard_network.index.astype('int')
# jaccard_network.columns = jaccard_network.columns.astype('int')

# jaccard_network

movieid,1,2,3,4,5,6,7,9,10,11,...,3937,3938,3939,3943,3944,3945,3947,3948,3949,3950
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.214692,0.126543,0.052459,0.098611,0.204872,0.140864,0.027346,0.221673,0.256566,...,0.032198,0.007663,0.006702,0.023070,0.001921,0.014840,0.008992,0.172785,0.070594,0.015729
2,0.214692,1.000000,0.149123,0.068712,0.138128,0.163005,0.163655,0.064987,0.257120,0.180395,...,0.063613,0.008310,0.008322,0.025740,0.002825,0.021978,0.021622,0.135901,0.079484,0.023035
3,0.126543,0.149123,1.000000,0.105802,0.187117,0.117415,0.195402,0.064220,0.148865,0.176791,...,0.062392,0.024341,0.016129,0.026834,0.008282,0.019569,0.026975,0.111111,0.068306,0.023077
4,0.052459,0.068712,0.105802,1.000000,0.167920,0.058151,0.123435,0.050193,0.059059,0.099634,...,0.016667,0.000000,0.000000,0.031008,0.005618,0.004717,0.013514,0.054137,0.060403,0.013575
5,0.098611,0.138128,0.187117,0.167920,1.000000,0.081365,0.191153,0.081522,0.122275,0.159686,...,0.043584,0.012539,0.012579,0.015544,0.009934,0.030395,0.008621,0.083255,0.056338,0.020408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,0.014840,0.021978,0.019569,0.004717,0.030395,0.008205,0.012121,0.021127,0.014161,0.011278,...,0.017143,0.014493,0.029851,0.069231,0.083333,1.000000,0.010309,0.026077,0.029674,0.043011
3947,0.008992,0.021622,0.026975,0.013514,0.008621,0.019467,0.019881,0.019481,0.021668,0.013979,...,0.091954,0.000000,0.000000,0.078571,0.015873,0.010309,1.000000,0.025727,0.040580,0.058252
3948,0.172785,0.135901,0.111111,0.054137,0.083255,0.143401,0.115807,0.032120,0.143791,0.166154,...,0.053911,0.014840,0.014857,0.070391,0.009270,0.026077,0.025727,1.000000,0.191011,0.048055
3949,0.070594,0.079484,0.068306,0.060403,0.056338,0.097970,0.074753,0.043702,0.072907,0.075623,...,0.045238,0.015337,0.012270,0.120448,0.022876,0.029674,0.040580,0.191011,1.000000,0.122257


## 3. The product measure

The product measure is defined as the product of the Pearson correlation coefficient and the Jaccard index of common users:
$$\rho_{ij} . \phi_{ij}$$


The function that takes inputs of two movie IDs, and returns the product measure between the two movies using the Pearson correlation matrix and Jaccard index matrix.

In [None]:
def product_measure(i, j):
    """
    Calculate the product measure between movies i and j: product of Pearson correlation coefficient and Jaccard similarity coefficient

    Parameters
    ----------
    i : int
        Id of movie i
    j : int
        Id of movie j

    Returns
    -------
    Product measure between movies i and j
    """
    # Get the Pearson correlation coefficient between movies i and j
    pearson_ij = pearson_id(i, j)
    # Get the Jaccard similarity coefficient between movies i and j
    jaccard_ij = jaccard_id(i, j)

    # Return the product measure between movies i and j
    return pearson_ij * jaccard_ij

samples = movies_train.sample(2).index.tolist()
print(samples)
product_measure(*samples)

In [20]:
# network: pandas DataFrame = pearson_network * jaccard_network

network = pearson_network * jaccard_network
network

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.056389,0.018543,0.005738,0.016779,0.034436,0.026670,0.002021,0.001250,0.047804,...,0.000921,0.000005,0.000802,0.000382,-0.000066,0.022809,0.005432,0.000795,-0.000016,0.003885
2,0.056389,1.000000,0.025289,0.007669,0.027151,0.022386,0.031693,0.010299,0.008245,0.077661,...,0.000652,0.000013,0.001227,0.002144,0.001029,0.014681,0.006014,0.001404,0.000003,0.003200
3,0.018543,0.025289,1.000000,0.016786,0.050159,0.011252,0.046665,0.002454,0.006462,0.024451,...,0.000211,0.000488,0.000632,0.002448,0.000759,0.011642,0.003416,0.000916,-0.000065,0.002134
4,0.005738,0.007669,0.016786,1.000000,0.041565,0.004133,0.023121,0.001037,0.002115,0.004845,...,0.001174,0.000274,-0.000042,0.000162,0.000151,0.003628,0.003888,0.000114,0.000183,0.002998
5,0.016779,0.027151,0.050159,0.041565,1.000000,0.006103,0.050608,0.003055,0.009494,0.021521,...,0.000035,0.000783,0.000991,0.000555,-0.000025,0.008985,0.002759,0.000682,-0.000035,0.004122
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.022809,0.014681,0.011642,0.003628,0.008985,0.016150,0.012296,0.001400,0.001679,0.016948,...,0.010758,0.000414,0.002350,0.014469,0.001260,1.000000,0.052540,0.007000,0.003113,0.069242
3949,0.005432,0.006014,0.003416,0.003888,0.002759,0.012809,0.004764,0.002180,0.003302,0.004454,...,0.027205,0.002143,0.001700,0.013192,0.003647,0.052540,1.000000,0.035576,0.015312,0.072621
3950,0.000795,0.001404,0.000916,0.000114,0.000682,0.002349,0.000804,0.003268,-0.000039,0.001486,...,0.011076,0.018093,0.002209,0.010590,0.005150,0.007000,0.035576,1.000000,0.020867,0.019498
3951,-0.000016,0.000003,-0.000065,0.000183,-0.000035,-0.000007,0.000194,0.002489,-0.000000,-0.000051,...,0.014381,0.011547,0.003039,0.005860,0.000189,0.003113,0.015312,0.020867,1.000000,0.011147


In [21]:
network.to_csv('movies_network.csv')

In [None]:
# # Load precomputed movies network

# network = pd.read_csv(os.path.join(support_dir, 'movies_network.csv'), index_col=0)

# network.columns.name = 'movieid'
# network.index = network.index.astype('int')
# network.columns = network.columns.astype('int')

# network

movieid,1,2,3,4,5,6,7,9,10,11,...,3937,3938,3939,3943,3944,3945,3947,3948,3949,3950
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.056389,0.018543,0.005738,0.016779,0.034436,0.026670,0.001250,0.047804,0.063902,...,0.001853,0.000212,0.000100,0.000921,0.000005,0.000802,-0.000066,0.022809,0.005432,0.000795
2,0.056389,1.000000,0.025289,0.007669,0.027151,0.022386,0.031693,0.008245,0.077661,0.032026,...,0.006544,0.000198,0.000133,0.000652,0.000013,0.001227,0.001029,0.014681,0.006014,0.001404
3,0.018543,0.025289,1.000000,0.016786,0.050159,0.011252,0.046665,0.006462,0.024451,0.036935,...,0.004471,0.001508,0.000457,0.000211,0.000488,0.000632,0.000759,0.011642,0.003416,0.000916
4,0.005738,0.007669,0.016786,1.000000,0.041565,0.004133,0.023121,0.002115,0.004845,0.019909,...,0.000095,-0.000000,-0.000000,0.001174,0.000274,-0.000042,0.000151,0.003628,0.003888,0.000114
5,0.016779,0.027151,0.050159,0.041565,1.000000,0.006103,0.050608,0.009494,0.021521,0.040441,...,0.001958,0.000254,0.000187,0.000035,0.000783,0.000991,-0.000025,0.008985,0.002759,0.000682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,0.000802,0.001227,0.000632,-0.000042,0.000991,-0.000084,0.000245,0.000258,0.000246,0.000079,...,0.000280,0.000048,0.000309,0.003821,0.008381,1.000000,0.000128,0.002350,0.001700,0.002209
3947,-0.000066,0.001029,0.000759,0.000151,-0.000025,0.001112,0.000578,0.000765,0.001344,0.000244,...,0.016737,-0.000000,-0.000000,0.007034,0.000111,0.000128,1.000000,0.001260,0.003647,0.005150
3948,0.022809,0.014681,0.011642,0.003628,0.008985,0.016150,0.012296,0.001679,0.016948,0.024019,...,0.004852,0.000809,0.000670,0.010758,0.000414,0.002350,0.001260,1.000000,0.052540,0.007000
3949,0.005432,0.006014,0.003416,0.003888,0.002759,0.012809,0.004764,0.003302,0.004454,0.005007,...,0.002741,0.000528,0.000306,0.027205,0.002143,0.001700,0.003647,0.052540,1.000000,0.035576


## Get the k-nearest neighbors

Distance matrix is the matrix equal to 1 minus the similarity matrix: $D_{ij} = 1 - S_{ij}$.

In [10]:
distance = 1.0 - network

distance.columns = distance.columns.astype('int')
distance.index = distance.index.astype('int')

distance

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.943611,0.981457,0.994262,0.983221,0.965564,0.973330,0.997979,0.998750,0.952196,...,0.999079,0.999995,0.999198,0.999618,1.000066,0.977191,0.994568,0.999205,1.000016,0.996115
2,0.943611,0.000000,0.974711,0.992331,0.972849,0.977614,0.968307,0.989701,0.991755,0.922339,...,0.999348,0.999987,0.998773,0.997856,0.998971,0.985319,0.993986,0.998596,0.999997,0.996800
3,0.981457,0.974711,0.000000,0.983214,0.949841,0.988748,0.953335,0.997546,0.993538,0.975549,...,0.999789,0.999512,0.999368,0.997552,0.999241,0.988358,0.996584,0.999084,1.000065,0.997866
4,0.994262,0.992331,0.983214,0.000000,0.958435,0.995867,0.976879,0.998963,0.997885,0.995155,...,0.998826,0.999726,1.000042,0.999838,0.999849,0.996372,0.996112,0.999886,0.999817,0.997002
5,0.983221,0.972849,0.949841,0.958435,0.000000,0.993897,0.949392,0.996945,0.990506,0.978479,...,0.999965,0.999217,0.999009,0.999445,1.000025,0.991015,0.997241,0.999318,1.000035,0.995878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.977191,0.985319,0.988358,0.996372,0.991015,0.983850,0.987704,0.998600,0.998321,0.983052,...,0.989242,0.999586,0.997650,0.985531,0.998740,0.000000,0.947460,0.993000,0.996887,0.930758
3949,0.994568,0.993986,0.996584,0.996112,0.997241,0.987191,0.995236,0.997820,0.996698,0.995546,...,0.972795,0.997857,0.998300,0.986808,0.996353,0.947460,0.000000,0.964424,0.984688,0.927379
3950,0.999205,0.998596,0.999084,0.999886,0.999318,0.997651,0.999196,0.996732,1.000039,0.998514,...,0.988924,0.981907,0.997791,0.989410,0.994850,0.993000,0.964424,0.000000,0.979133,0.980502
3951,1.000016,0.999997,1.000065,0.999817,1.000035,1.000007,0.999806,0.997511,1.000000,1.000051,...,0.985619,0.988453,0.996961,0.994140,0.999811,0.996887,0.984688,0.979133,0.000000,0.988853


In [14]:
# due to floating point calculation when 1 - network (or something), some distances are close to 0 but are negative
distance = distance.applymap(lambda x: 0 if x <=0 else x) 

distance

movieid,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.943611,0.981457,0.994262,0.983221,0.965564,0.973330,0.997979,0.998750,0.952196,...,0.999079,0.999995,0.999198,0.999618,1.000066,0.977191,0.994568,0.999205,1.000016,0.996115
2,0.943611,0.000000,0.974711,0.992331,0.972849,0.977614,0.968307,0.989701,0.991755,0.922339,...,0.999348,0.999987,0.998773,0.997856,0.998971,0.985319,0.993986,0.998596,0.999997,0.996800
3,0.981457,0.974711,0.000000,0.983214,0.949841,0.988748,0.953335,0.997546,0.993538,0.975549,...,0.999789,0.999512,0.999368,0.997552,0.999241,0.988358,0.996584,0.999084,1.000065,0.997866
4,0.994262,0.992331,0.983214,0.000000,0.958435,0.995867,0.976879,0.998963,0.997885,0.995155,...,0.998826,0.999726,1.000042,0.999838,0.999849,0.996372,0.996112,0.999886,0.999817,0.997002
5,0.983221,0.972849,0.949841,0.958435,0.000000,0.993897,0.949392,0.996945,0.990506,0.978479,...,0.999965,0.999217,0.999009,0.999445,1.000025,0.991015,0.997241,0.999318,1.000035,0.995878
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.977191,0.985319,0.988358,0.996372,0.991015,0.983850,0.987704,0.998600,0.998321,0.983052,...,0.989242,0.999586,0.997650,0.985531,0.998740,0.000000,0.947460,0.993000,0.996887,0.930758
3949,0.994568,0.993986,0.996584,0.996112,0.997241,0.987191,0.995236,0.997820,0.996698,0.995546,...,0.972795,0.997857,0.998300,0.986808,0.996353,0.947460,0.000000,0.964424,0.984688,0.927379
3950,0.999205,0.998596,0.999084,0.999886,0.999318,0.997651,0.999196,0.996732,1.000039,0.998514,...,0.988924,0.981907,0.997791,0.989410,0.994850,0.993000,0.964424,0.000000,0.979133,0.980502
3951,1.000016,0.999997,1.000065,0.999817,1.000035,1.000007,0.999806,0.997511,1.000000,1.000051,...,0.985619,0.988453,0.996961,0.994140,0.999811,0.996887,0.984688,0.979133,0.000000,0.988853


In [16]:
knn = NearestNeighbors(n_neighbors=16, metric='precomputed')
knn.fit(distance)

Function to calculate the distance from a movie to all other movies in the network.

In [None]:
def get_distance(movieid: int) -> np.array:
    """
    Get the distance between movieid and all other movies

    Parameters
    ----------
    movieid : int
        Id of movie

    Returns
    -------
    Array of distances between movieid and all other movies
    """
    # Get the ratings of movieid
    try:
        movie_ratings = movies_users_pivot.loc[:, movieid].fillna(0)
    except KeyError:
        return None
    # print(movie_ratings.shape)

    all_movies = movies_users_pivot.columns.intersection(movies_train.index)
    # print(all_movies.shape)

    # Calculate the pearson correlation coefficient between the movie and all other movies using the ratings
    pearson_corr = movies_users_pivot[all_movies].fillna(0).corrwith(movie_ratings, method=pearson)
    pearson_corr = pearson_corr.fillna(-1.0)
    # print(pearson_corr)

    # Calculate the jaccard similarity coefficient between the movie and all other movies using the ratings
    jaccard_corr = movies_users_pivot[all_movies].fillna(0).corrwith(movie_ratings, method=jaccard)
    jaccard_corr = jaccard_corr.fillna(0)
    # print(jaccard_corr)

    # Calculate the product measure between the movie and all other movies using the ratings
    product_corr = np.multiply(pearson_corr, jaccard_corr)
    # print(product_corr.shape)

    # Calculate the distance between the movie and all other movies using the ratings
    distances = 1.0 - product_corr
    # print(distances.shape)

    distances = distances.apply(lambda x: 0 if x <=0 else x) 
    # Return the distances
    return distances.to_numpy()

get_distance(1650)

array([0.99981133, 0.99926976, 0.9997633 , ..., 0.99841251, 0.99590697,
       0.99567374])

Function to get neighbor movies for each movie

In [None]:
def get_neighbors(movieid: int) -> list:
    """Get the list of neighbors of movieid

    Parameter
    ---------
    movieid : int
        Id of movie

    Returns
    -------
    List of neighbors of movieid
    """
    # Check if movieid is in the list of training movies with ratings
    if movieid in movies_users_pivot.columns.intersection(movies_train.index):
        # print(movieid)
        # Get the distances between movieid and all other movies from distance matrix
        distances = distance.loc[movieid, :].to_numpy()
        
        # Get the indices of the 15 nearest neighbors of movieid
        dis, ids = knn.kneighbors([distances], return_distance=True)

        # get the movieids of the neighbors
        nei_ids = distance.columns[ids[0]].tolist()
        nei_dis = dis[0].tolist()

        # remove the movie itself from the list of neighbors
        if movieid in nei_ids:
            nei_ids.remove(movieid)
            nei_dis.remove(0.0)
        else:
            nei_ids = nei_ids[:-1]
            nei_dis = nei_dis[:-1]

        return nei_ids, nei_dis
    
    else:
        # print(0)
        # Calculate the distances between movieid and all other movies
        distances = get_distance(movieid)
        if distances is None:
            # return empty list
            return [], []

        # Get the indices of the 15 nearest neighbors of movieid
        dis, ids = knn.kneighbors([distances], return_distance=True)
        # print(ids[0], dis[0])
        # get the movieids of the neighbors
        nei_ids = distance.columns[ids[0]].tolist()[:15]
        nei_dis = dis[0].tolist()[:15]
        # print(nei_ids, nei_dis)

        return nei_ids, nei_dis

get_neighbors(10)

([1722,
  1370,
  165,
  349,
  733,
  3082,
  380,
  1377,
  2353,
  316,
  377,
  2617,
  2115,
  1610,
  1917],
 [0.6908701301534905,
  0.7946292014642302,
  0.8079065882831861,
  0.8087818907205007,
  0.8088279086739529,
  0.8169737089985607,
  0.8269603234245736,
  0.8321622664029392,
  0.843835130189504,
  0.8444447851629848,
  0.8498574839202994,
  0.851710364423848,
  0.8530018963999565,
  0.8536959195417965,
  0.8556617635214202])

Get the k-nearest neighbors for each movie

In [21]:
movies_train['neighbors'] = movies_train.apply(lambda x: get_neighbors(x.name), axis=1)

movies_train.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1650,Washington Square,[Drama],1997,"[613, 767, 1728, 1177, 369, 2437, 1056, 2801, ...","[[Drama, Romance], [Drama], [Drama], [Drama], ..."
185,"Net, The","[Sci-Fi, Thriller]",1995,"[1779, 1917, 196, 316, 1876, 1909, 1527, 780, ...","[[Adventure, Sci-Fi, Thriller], [Action, Adven..."
1377,Batman Returns,"[Action, Adventure, Comedy, Crime]",1992,"[153, 1562, 592, 380, 2616, 2002, 10, 1370, 19...","[[Action, Adventure, Comedy, Crime], [Action, ..."
3204,"Boys from Brazil, The",[Thriller],1978,"[3015, 3551, 3230, 2871, 2917, 3445, 3198, 316...","[[Thriller], [Thriller], [Thriller], [Adventur..."
1901,Dear Jesse,[Documentary],1997,"[1553, 1315, 1040, 3905, 1444, 3410, 1899, 358...","[[Comedy], [Documentary], [Drama], [Comedy], [..."


In [None]:
movies_train['neighbors_corr'] = movies_train['neighbors'].apply(lambda x: x[1] if len(x[0]) > 0 else [])
movies_train['neighbors'] = movies_train['neighbors'].apply(lambda x: x[0] if len(x[0]) > 0 else [])

movies_train.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story,"[Animation, Children's, Comedy]",1995,"[3114, 588, 1265, 2355, 34, 364, 595, 1923, 35...","[0.7863686364967801, 0.7994796294934516, 0.812..."
2,Jumanji,"[Adventure, Children's, Fantasy]",1995,"[3489, 653, 2161, 60, 317, 2054, 3438, 367, 67...","[0.7744544174745062, 0.8499392622015024, 0.865..."
3,Grumpier Old Men,"[Comedy, Romance]",1995,"[432, 370, 2953, 586, 2424, 355, 520, 276, 587...","[0.9340360072542967, 0.9355390084189408, 0.938..."
4,Waiting to Exhale,"[Comedy, Drama]",1995,"[1621, 1353, 1614, 450, 830, 372, 5, 195, 203,...","[0.9316115238576078, 0.9507919187828562, 0.953..."
5,Father of the Bride Part II,[Comedy],1995,"[2953, 586, 2082, 719, 355, 830, 186, 520, 432...","[0.9002727667189788, 0.9072205856647435, 0.911..."


In [None]:
def get_neighbors_genres(row):
    # print(row.name)
    neighbors = row['neighbors']
    if len(neighbors) == 0:
        return []
    neighbors_genre = []
    for neighbor in neighbors:
        neighbors_genre.append(movies_train.loc[neighbor, 'genre'])

    return neighbors_genre

print(get_neighbors_genres(movies_train.iloc[0]))

movies_train['neighbors_genres'] = movies_train.apply(lambda x: get_neighbors_genres(x), axis=1)

movies_train.head()

[['Animation', "Children's", 'Comedy'], ['Animation', "Children's", 'Comedy', 'Musical'], ['Comedy', 'Romance'], ['Animation', "Children's", 'Comedy'], ["Children's", 'Comedy', 'Drama'], ['Animation', "Children's", 'Musical'], ['Animation', "Children's", 'Musical'], ['Comedy'], ['Comedy', 'Romance', 'War'], ['Comedy'], ['Comedy'], ['Action', 'Adventure', 'Comedy', 'Sci-Fi'], ['Comedy'], ['Comedy'], ['Action', 'Adventure', 'Comedy', 'Romance']]


Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genre,neighbors_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Toy Story,"[Animation, Children's, Comedy]",1995,"[3114, 588, 1265, 2355, 34, 364, 595, 1923, 35...","[0.7863686364967801, 0.7994796294934516, 0.812...","[[Animation, Children's, Comedy], [Animation, ...","[[Animation, Children's, Comedy], [Animation, ..."
2,Jumanji,"[Adventure, Children's, Fantasy]",1995,"[3489, 653, 2161, 60, 317, 2054, 3438, 367, 67...","[0.7744544174745062, 0.8499392622015024, 0.865...","[[Adventure, Fantasy], [Action, Adventure, Fan...","[[Adventure, Fantasy], [Action, Adventure, Fan..."
3,Grumpier Old Men,"[Comedy, Romance]",1995,"[432, 370, 2953, 586, 2424, 355, 520, 276, 587...","[0.9340360072542967, 0.9355390084189408, 0.938...","[[Comedy, Western], [Comedy], [Children's, Com...","[[Comedy, Western], [Comedy], [Children's, Com..."
4,Waiting to Exhale,"[Comedy, Drama]",1995,"[1621, 1353, 1614, 450, 830, 372, 5, 195, 203,...","[0.9316115238576078, 0.9507919187828562, 0.953...","[[Drama], [Comedy, Romance], [Comedy], [Comedy...","[[Drama], [Comedy, Romance], [Comedy], [Comedy..."
5,Father of the Bride Part II,[Comedy],1995,"[2953, 586, 2082, 719, 355, 830, 186, 520, 432...","[0.9002727667189788, 0.9072205856647435, 0.911...","[[Children's, Comedy], [Children's, Comedy], [...","[[Children's, Comedy], [Children's, Comedy], [..."


In [None]:
movies_train['neighbors_all'] = movies_train.apply(lambda x: list(zip(x['neighbors'], x['neighbors_corr'], x['neighbors_genres'])), axis=1)
movies_train['neighbors_all'] = movies_train['neighbors_all'].apply(lambda x: sorted(x, key=lambda y: y[1], reverse=True))

movies_train['neighbors'] = movies_train['neighbors_all'].apply(lambda x: [i[0] for i in x])
movies_train['neighbors_corr'] = movies_train['neighbors_all'].apply(lambda x: [i[1] for i in x])
movies_train['neighbors_genres'] = movies_train['neighbors_all'].apply(lambda x: [i[2] for i in x])

movies_train.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genre,neighbors_genres,neighbors_all
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Toy Story,"[Animation, Children's, Comedy]",1995,"[1197, 2918, 2321, 1580, 1517, 3253, 356, 1923...","[0.88416330050897, 0.8831363077783166, 0.88195...","[[Animation, Children's, Comedy], [Animation, ...","[[Action, Adventure, Comedy, Romance], [Comedy...","[(1197, 0.88416330050897, [Action, Adventure, ..."
2,Jumanji,"[Adventure, Children's, Fantasy]",1995,"[316, 208, 1073, 1967, 2005, 2193, 673, 367, 3...","[0.901372630009624, 0.8925389421906157, 0.8892...","[[Adventure, Fantasy], [Action, Adventure, Fan...","[[Action, Adventure, Sci-Fi], [Action, Adventu...","[(316, 0.901372630009624, [Action, Adventure, ..."
3,Grumpier Old Men,"[Comedy, Romance]",1995,"[1409, 1569, 500, 2792, 597, 3247, 587, 276, 5...","[0.947648898528542, 0.9463958756589729, 0.9462...","[[Comedy, Western], [Comedy], [Children's, Com...","[[Comedy, Romance], [Comedy, Romance], [Comedy...","[(1409, 0.947648898528542, [Comedy, Romance]),..."
4,Waiting to Exhale,"[Comedy, Drama]",1995,"[1593, 1043, 2154, 186, 222, 1888, 203, 195, 5...","[0.9640035923506381, 0.963261934659495, 0.9624...","[[Drama], [Comedy, Romance], [Comedy], [Comedy...","[[Comedy, Romance], [Drama, Romance], [Drama, ...","[(1593, 0.9640035923506381, [Comedy, Romance])..."
5,Father of the Bride Part II,[Comedy],1995,"[3243, 784, 3247, 585, 1485, 500, 432, 520, 18...","[0.9327682940237345, 0.9327356912343412, 0.930...","[[Children's, Comedy], [Children's, Comedy], [...","[[Comedy], [Comedy], [Comedy, Crime], [Comedy]...","[(3243, 0.9327682940237345, [Comedy]), (784, 0..."


In [None]:
movies_train.drop(columns=['neighbors_all'], inplace=True)
movies_train

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Toy Story,"[Animation, Children's, Comedy]",1995,"[1197, 2918, 2321, 1580, 1517, 3253, 356, 1923...","[0.88416330050897, 0.8831363077783166, 0.88195...","[[Action, Adventure, Comedy, Romance], [Comedy..."
2,Jumanji,"[Adventure, Children's, Fantasy]",1995,"[316, 208, 1073, 1967, 2005, 2193, 673, 367, 3...","[0.901372630009624, 0.8925389421906157, 0.8892...","[[Action, Adventure, Sci-Fi], [Action, Adventu..."
3,Grumpier Old Men,"[Comedy, Romance]",1995,"[1409, 1569, 500, 2792, 597, 3247, 587, 276, 5...","[0.947648898528542, 0.9463958756589729, 0.9462...","[[Comedy, Romance], [Comedy, Romance], [Comedy..."
4,Waiting to Exhale,"[Comedy, Drama]",1995,"[1593, 1043, 2154, 186, 222, 1888, 203, 195, 5...","[0.9640035923506381, 0.963261934659495, 0.9624...","[[Comedy, Romance], [Drama, Romance], [Drama, ..."
5,Father of the Bride Part II,[Comedy],1995,"[3243, 784, 3247, 585, 1485, 500, 432, 520, 18...","[0.9327682940237345, 0.9327356912343412, 0.930...","[[Comedy], [Comedy], [Comedy, Crime], [Comedy]..."
...,...,...,...,...,...,...
3945,Digimon: The Movie,"[Adventure, Animation, Children's]",2000,"[1115, 2484, 579, 2559, 3564, 3899, 1679, 3894...","[0.9941096312511368, 0.9941096312511368, 0.994...","[[Comedy, Drama], [Comedy], [Thriller], [Anima..."
3947,Get Carter,[Thriller],1971,"[966, 3936, 2178, 2211, 3047, 2179, 3923, 1152...","[0.9871005637424133, 0.9870285233133369, 0.986...","[[Drama], [Drama, Thriller], [Thriller], [Thri..."
3948,Meet the Parents,[Comedy],2000,"[3578, 3555, 3623, 3785, 3949, 3753, 3536, 375...","[0.949914033006739, 0.9493300147267905, 0.9492...","[[Action, Drama], [Action, Thriller], [Action,..."
3949,Requiem for a Dream,[Drama],2000,"[3160, 2769, 3915, 3535, 3852, 3747, 3948, 379...","[0.9567886149826191, 0.9552106875591808, 0.954...","[[Drama], [Crime, Mystery], [Drama], [Comedy, ..."


In [None]:
movies_train_to_save = movies_train.copy(deep=True)
movies_train_to_save['genre'] = movies_train_to_save['genre'].apply(lambda x: '|'.join([i for i in x]))

movies_train_to_save['neighbors'] = movies_train_to_save['neighbors'].apply(lambda x: '|'.join([str(i) for i in x]) if len(x) > 0 else [])
movies_train_to_save['neighbors_genres'] = movies_train_to_save['neighbors_genres'].apply(lambda x: '|'.join(['_'.join(i) for i in x]) if len(x) > 0 else [])
movies_train_to_save['neighbors_corr'] = movies_train_to_save['neighbors_corr'].apply(lambda x: '|'.join([str(i) for i in x]) if len(x) > 0 else [])

movies_train_to_save.to_csv('movies_train_neighbors_pea_jacc.csv')

movies_train_to_save.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Toy Story,Animation|Children's|Comedy,1995,1197|2918|2321|1580|1517|3253|356|1923|595|364...,0.88416330050897|0.8831363077783166|0.88195788...,Action_Adventure_Comedy_Romance|Comedy|Comedy|...
2,Jumanji,Adventure|Children's|Fantasy,1995,316|208|1073|1967|2005|2193|673|367|3438|2054|...,0.901372630009624|0.8925389421906157|0.8892929...,Action_Adventure_Sci-Fi|Action_Adventure|Adven...
3,Grumpier Old Men,Comedy|Romance,1995,1409|1569|500|2792|597|3247|587|276|520|355|24...,0.947648898528542|0.9463958756589729|0.9462690...,Comedy_Romance|Comedy_Romance|Comedy|Comedy|Co...
4,Waiting to Exhale,Comedy|Drama,1995,1593|1043|2154|186|222|1888|203|195|5|372|830|...,0.9640035923506381|0.963261934659495|0.9624774...,Comedy_Romance|Drama_Romance|Drama_Romance|Com...
5,Father of the Bride Part II,Comedy,1995,3243|784|3247|585|1485|500|432|520|186|830|355...,0.9327682940237345|0.9327356912343412|0.930380...,Comedy|Comedy|Comedy_Crime|Comedy|Comedy|Comed...


## Predict on test

In [None]:
movies_test = pd.read_csv(os.path.join(datadir, 'movies_test.dat'), engine='python',
                         sep='::', names=['movieid', 'title', 'genre'], encoding='latin-1', index_col=False).set_index('movieid')

movies_test['genre'] = movies_test['genre'].apply(lambda x: x.split('|'))

movies_test['year'] = movies_test['title'].apply(lambda x: x[-5:-1]).astype('int')

movies_test['title'] = movies_test['title'].apply(lambda x: x[:-7])

movies_test.head()

Get neighborhood, neighborhood's genres, and neighborhood's correlation for each movie in the test set

In [None]:
movies_test['neighbors'] = movies_test.apply(lambda x: get_neighbors(x.name), axis=1)
movies_test['neighbors_corr'] = movies_test['neighbors'].apply(lambda x: x[1] if len(x[0]) > 0 else [])
movies_test['neighbors'] = movies_test['neighbors'].apply(lambda x: x[0] if len(x[0]) > 0 else [])
movies_test['neighbors_genres'] = movies_test.apply(lambda x: get_neighbors_genres(x), axis=1)

movies_test.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3397,"Great Muppet Caper, The","[Children's, Comedy]",1981,"[3398, 3396, 2083, 3087, 2141, 107, 2081, 1030...","[0.6888511991512698, 0.8538650439499706, 0.901...","[[Children's, Comedy], [Children's, Comedy], [..."
2067,Doctor Zhivago,"[Drama, Romance, War]",1965,"[920, 1944, 1247, 1250, 914, 1204, 1084, 969, ...","[0.892907238858123, 0.9001695228060859, 0.9029...","[[Drama, Romance, War], [Drama, Romance, War],..."
2651,Frankenstein Meets the Wolf Man,[Horror],1943,"[2654, 2647, 2646, 2649, 2650, 2638, 2634, 265...","[0.7862881544303645, 0.8218553755334759, 0.828...","[[Horror], [Horror], [Horror], [Horror], [Horr..."
2989,For Your Eyes Only,[Action],1981,"[2376, 2990, 2991, 3635, 3639, 2993, 3638, 240...","[0.7125289643142872, 0.7438729195422868, 0.779...","[[Action], [Action], [Action], [Action], [Acti..."
3415,"Mirror, The (Zerkalo)",[Drama],1975,"[1232, 3503, 2933, 751, 1163, 668, 3470, 3223,...","[0.9742251112665378, 0.9745191722514355, 0.980...","[[Mystery, Sci-Fi], [Drama, Sci-Fi], [Drama], ..."


In [None]:
# sort neighbors, neighbors_corr, neighbors_genres by neighbors_corr in descending order
movies_test['neighbors_all'] = movies_test.apply(lambda x: list(zip(x['neighbors'], x['neighbors_corr'], x['neighbors_genres'])), axis=1)
movies_test['neighbors_all'] = movies_test['neighbors_all'].apply(lambda x: sorted(x, key=lambda y: y[1], reverse=True))

movies_test['neighbors'] = movies_test['neighbors_all'].apply(lambda x: [i[0] for i in x])
movies_test['neighbors_corr'] = movies_test['neighbors_all'].apply(lambda x: [i[1] for i in x])
movies_test['neighbors_genres'] = movies_test['neighbors_all'].apply(lambda x: [i[2] for i in x])

movies_test.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres,predicted_genres,neighbors_all
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3397,"Great Muppet Caper, The","[Children's, Comedy]",1981,"[2096, 2003, 2137, 2423, 3033, 2134, 2413, 103...","[0.9367671624936397, 0.93560628186843, 0.93524...","[[Animation, Children's, Musical], [Comedy, Ho...","[Comedy, Children's, Animation]","[(2096, 0.9367671624936397, [Animation, Childr..."
2067,Doctor Zhivago,"[Drama, Romance, War]",1965,"[1952, 1960, 1674, 912, 1263, 3168, 902, 969, ...","[0.9313791510320225, 0.9308844907814587, 0.930...","[[Drama], [Drama, War], [Drama, Romance, Thril...","[Drama, War, Romance]","[(1952, 0.9313791510320225, [Drama]), (1960, 0..."
2651,Frankenstein Meets the Wolf Man,[Horror],1943,"[2781, 1337, 2633, 2644, 2637, 1340, 2652, 265...","[0.938158999615055, 0.9339437529140573, 0.9193...","[[Horror], [Horror], [Horror, Romance], [Horro...","[Horror, Romance, Action]","[(2781, 0.938158999615055, [Horror]), (1337, 0..."
2989,For Your Eyes Only,[Action],1981,"[2947, 3441, 1101, 1587, 3197, 2949, 2115, 240...","[0.8891704499622717, 0.8866369019705072, 0.869...","[[Action], [Action, War], [Action, Romance], [...","[Action, Adventure, Romance]","[(2947, 0.8891704499622717, [Action]), (3441, ..."
3415,"Mirror, The (Zerkalo)",[Drama],1975,"[2830, 1859, 2632, 793, 2512, 2544, 3636, 3223...","[0.9870842067201842, 0.9864910266601372, 0.986...","[[Drama], [Drama], [Drama], [Drama], [Drama], ...","[Drama, Sci-Fi, Adventure]","[(2830, 0.9870842067201842, [Drama]), (1859, 0..."


In [None]:
movies_test.drop(columns=['neighbors_all'], inplace=True)

movies_test

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres,predicted_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3397,"Great Muppet Caper, The","[Children's, Comedy]",1981,"[2096, 2003, 2137, 2423, 3033, 2134, 2413, 103...","[0.9367671624936397, 0.93560628186843, 0.93524...","[[Animation, Children's, Musical], [Comedy, Ho...","[Comedy, Children's, Animation]"
2067,Doctor Zhivago,"[Drama, Romance, War]",1965,"[1952, 1960, 1674, 912, 1263, 3168, 902, 969, ...","[0.9313791510320225, 0.9308844907814587, 0.930...","[[Drama], [Drama, War], [Drama, Romance, Thril...","[Drama, War, Romance]"
2651,Frankenstein Meets the Wolf Man,[Horror],1943,"[2781, 1337, 2633, 2644, 2637, 1340, 2652, 265...","[0.938158999615055, 0.9339437529140573, 0.9193...","[[Horror], [Horror], [Horror, Romance], [Horro...","[Horror, Romance, Action]"
2989,For Your Eyes Only,[Action],1981,"[2947, 3441, 1101, 1587, 3197, 2949, 2115, 240...","[0.8891704499622717, 0.8866369019705072, 0.869...","[[Action], [Action, War], [Action, Romance], [...","[Action, Adventure, Romance]"
3415,"Mirror, The (Zerkalo)",[Drama],1975,"[2830, 1859, 2632, 793, 2512, 2544, 3636, 3223...","[0.9870842067201842, 0.9864910266601372, 0.986...","[[Drama], [Drama], [Drama], [Drama], [Drama], ...","[Drama, Sci-Fi, Adventure]"
...,...,...,...,...,...,...,...
2309,"Inheritors, The (Die Siebtelbauern)",[Drama],1998,"[850, 2621, 2609, 573, 1533, 3490, 2175, 1829,...","[0.995157491034146, 0.9948516654889874, 0.9947...","[[Crime, Drama], [Drama, Romance], [Drama], [D...","[Drama, Romance, Crime]"
2421,"Karate Kid, Part II, The","[Action, Adventure, Drama]",1986,"[2133, 1378, 1562, 153, 1101, 2115, 2402, 2405...","[0.8960432708055139, 0.8947890384097515, 0.894...","[[Adventure, Comedy], [Action, Comedy, Western...","[Action, Adventure, Comedy]"
3255,"League of Their Own, A","[Comedy, Drama]",1992,"[1682, 2268, 1517, 1485, 39, 2321, 11, 587, 44...","[0.8762190921254163, 0.8738660256558552, 0.870...","[[Drama], [Crime, Drama], [Comedy], [Comedy], ...","[Comedy, Romance, Drama]"
974,Algiers,"[Drama, Romance]",1938,"[937, 2904, 3414, 566, 1068, 956, 960, 925, 11...","[0.9826299217239781, 0.9825082142804039, 0.981...","[[Comedy, Romance], [Drama], [Romance], [Comed...","[Drama, Romance, Comedy]"


### **Parametric Adaptive Rank Cut**: use an adaptive parameter $t_m$ for each movie $m$ to vote out the predicted genre of $m$ using its neighbors' genres.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb.fit(movies_test['genre'])

mlb.classes_

array(['Action', 'Adventure', 'Animation', "Children's", 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'], dtype=object)

For a movie $m$, we take the top $t_m$ labels with the highest correlation with $m$'s neighbors' genres, where $t_m$ is given by:
$$t_m = round(\alpha . \overline{G_N(m)})$$
where $N(m)$ is the set of neighbors of $m$, and $\overline{G_N(m)}$ is the average number of genres of the movies in $N(m)$:
$$\overline{G_N(m)} = \frac{1}{|N(m)|} \sum_{n \in N(m)} |G_n|$$
where $G_n$ is the set of genres of movie $n$.

In [None]:
# RCut: get top t genres
def get_t(neighborhood, alpha):
    avg_neighbor_num_genres = np.mean([len(i) for i in neighborhood])
    t = int(alpha * avg_neighbor_num_genres) / len(neighborhood)
    if t == 0:
        t = 1
    return t

### Vote genres out for each movie in the test set using neighborhood genres. 

The confidence score of the movie $m$ with respect to a candidate genre $g$ is given by:

$$s_{m,g} = \frac{1}{|N(m)|} \sum_{n \in N(m)} \gamma(n, g) . S_{m,n}$$
where $N(m)$ is the set of neighbors of $m$, $s_{m,n}$ is the similarity of $m$ and $n$, and $\gamma(n, g)$ indicates whether $g$ is a genre of $n$; that is, $\gamma(n, g) = 1$ if $g \in G(n)$, $0$ otherwise.


In [None]:
# Get genres from neighbors
def vote_genre(neighborhood, similarities, alpha):
    genres = mlb.classes_
    scores = {}
    for i in range(len(genres)):
        score = 0
        for neighbor, similarity in zip(neighborhood, similarities):
            score += similarity * neighbor[i]

        scores[genres[i]] = score

    # print(scores)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    # print(sorted_scores)
    top_t = dict(sorted_scores[:int(get_t(neighborhood, alpha))])
    return list(top_t.keys())

Test on a sample

In [None]:
sample = movies_test.sample(1).iloc[0]
print('Movie ID:', sample.name)

sample_neighbors = mlb.transform(sample['neighbors_genres'])
# print(sample_neighbors)

sample_similarities = sample['neighbors_corr']
# print(sample_similarities)
sample_alpha = 2

print('golden genres: ', sample['genre'])
print('neighbor genres: ', sample['neighbors_genres'])

predict_genre = vote_genre(sample_neighbors, sample_similarities, sample_alpha)
print('predicted genres: ', predict_genre)

golden_genre = mlb.transform([sample['genre']])[0]
predict_genre = mlb.transform([predict_genre])[0]
print('one hot golden genres: ', golden_genre)
print('one hot predicted genres: ', predict_genre)

print('f1 score: ', f1_score([golden_genre], [predict_genre], average='samples'))

Movie ID: 1037
golden genres:  ['Action', 'Sci-Fi', 'Thriller']
neighbor genres:  [['Action', 'Crime', 'Sci-Fi'], ['Action', 'Adventure', 'Sci-Fi'], ['Action', 'Mystery', 'Sci-Fi', 'Thriller'], ['Action', 'Comedy', 'Sci-Fi', 'War'], ['Horror', 'Sci-Fi'], ['Action', 'Horror', 'Sci-Fi', 'Thriller'], ['Action', 'Adventure', 'Sci-Fi', 'Thriller'], ['Action', 'Sci-Fi'], ['Action', 'Sci-Fi'], ['Action', 'Sci-Fi'], ['Action', 'Adventure', 'Sci-Fi'], ['Action', 'Adventure', 'Sci-Fi'], ['Action', 'Adventure', 'Sci-Fi', 'Thriller'], ['Action', 'Adventure', 'Sci-Fi', 'War'], ['Action', 'Sci-Fi']]
predicted genres:  ['Sci-Fi', 'Action']
one hot golden genres:  [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
one hot predicted genres:  [1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
f1 score:  0.8


If the movie does not have any ratings, thus no neighbors, we will use the most popular genre from the year that the movie was produced.

In [None]:
def get_genre_by_year(year):
    return movies_train[movies_train['year'] == year]['genre'].explode().value_counts().index[0]

get_genre_by_year(1990)

Predict the genre of each movie in the test set

In [None]:
movies_test['predicted_genres'] = movies_test.apply(lambda x: vote_genre(mlb.transform(x['neighbors_genres']),
                                                                        x['neighbors_corr'],
                                                                        alpha=3) if len(x['neighbors']) > 0 else [get_genre_by_year(x['year'])], 
                                                                        
                                                    axis=1)
movies_test

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres,predicted_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3397,"Great Muppet Caper, The","[Children's, Comedy]",1981,"[3398, 3396, 2083, 3087, 2141, 107, 2081, 1030...","[0.6888511991512698, 0.8538650439499706, 0.901...","[[Children's, Comedy], [Children's, Comedy], [...","[Comedy, Children's, Animation]"
2067,Doctor Zhivago,"[Drama, Romance, War]",1965,"[920, 1944, 1247, 1250, 914, 1204, 1084, 969, ...","[0.892907238858123, 0.9001695228060859, 0.9029...","[[Drama, Romance, War], [Drama, Romance, War],...","[Drama, War, Romance]"
2651,Frankenstein Meets the Wolf Man,[Horror],1943,"[2654, 2647, 2646, 2649, 2650, 2638, 2634, 265...","[0.7862881544303645, 0.8218553755334759, 0.828...","[[Horror], [Horror], [Horror], [Horror], [Horr...","[Horror, Romance, Action]"
2989,For Your Eyes Only,[Action],1981,"[2376, 2990, 2991, 3635, 3639, 2993, 3638, 240...","[0.7125289643142872, 0.7438729195422868, 0.779...","[[Action], [Action], [Action], [Action], [Acti...","[Action, Adventure, Romance]"
3415,"Mirror, The (Zerkalo)",[Drama],1975,"[1232, 3503, 2933, 751, 1163, 668, 3470, 3223,...","[0.9742251112665378, 0.9745191722514355, 0.980...","[[Mystery, Sci-Fi], [Drama, Sci-Fi], [Drama], ...","[Drama, Sci-Fi, Adventure]"
...,...,...,...,...,...,...,...
2309,"Inheritors, The (Die Siebtelbauern)",[Drama],1998,"[887, 3352, 124, 2833, 1384, 560, 2101, 1829, ...","[0.7475040281811596, 0.9334796131777078, 0.952...","[[Drama], [Drama], [Drama], [Romance, War], [D...","[Drama, Romance, Crime]"
2421,"Karate Kid, Part II, The","[Action, Adventure, Drama]",1986,"[2422, 2410, 2411, 2420, 2471, 2735, 2642, 240...","[0.6879969670752948, 0.8151498811266273, 0.826...","[[Action, Adventure, Drama], [Action, Drama], ...","[Action, Adventure, Comedy]"
3255,"League of Their Own, A","[Comedy, Drama]",1992,"[1784, 500, 3253, 2302, 597, 1923, 440, 587, 1...","[0.8193485409592312, 0.8249582007478116, 0.830...","[[Comedy, Drama], [Comedy], [Comedy], [Comedy]...","[Comedy, Romance, Drama]"
974,Algiers,"[Drama, Romance]",1938,"[1070, 3924, 630, 755, 970, 3640, 1160, 925, 9...","[0.9596097565292113, 0.968724766382867, 0.9688...","[[Adventure], [Comedy], [Drama, Romance], [Chi...","[Drama, Romance, Comedy]"


In [None]:
movies_test_to_save = movies_test.copy(deep=True)

movies_test_to_save = movies_test_to_save.drop(columns=['predicted_genres'])

movies_test_to_save['genre'] = movies_test_to_save['genre'].apply(lambda x: '|'.join([i for i in x]))
movies_test_to_save['neighbors'] = movies_test_to_save['neighbors'].apply(lambda x: '|'.join([str(i) for i in x]) if len(x) > 0 else [])
movies_test_to_save['neighbors_genres'] = movies_test_to_save['neighbors_genres'].apply(lambda x: '|'.join(['_'.join(i) for i in x]) if len(x) > 0 else [])
movies_test_to_save['neighbors_corr'] = movies_test_to_save['neighbors_corr'].apply(lambda x: '|'.join([str(i) for i in x]) if len(x) > 0 else [])

movies_test_to_save.to_csv('movies_test_neighbors_pea_jacc.csv')

movies_test_to_save.head()

Unnamed: 0_level_0,title,genre,year,neighbors,neighbors_corr,neighbors_genres
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3397,"Great Muppet Caper, The",Children's|Comedy,1981,2096|2003|2137|2423|3033|2134|2413|1030|2081|1...,0.9367671624936397|0.93560628186843|0.93524212...,Animation_Children's_Musical|Comedy_Horror|Ani...
2067,Doctor Zhivago,Drama|Romance|War,1965,1952|1960|1674|912|1263|3168|902|969|1084|1204...,0.9313791510320225|0.9308844907814587|0.930860...,Drama|Drama_War|Drama_Romance_Thriller|Drama_R...
2651,Frankenstein Meets the Wolf Man,Horror,1943,2781|1337|2633|2644|2637|1340|2652|2653|2634|2...,0.938158999615055|0.9339437529140573|0.9193735...,Horror|Horror|Horror_Romance|Horror|Horror|Hor...
2989,For Your Eyes Only,Action,1981,2947|3441|1101|1587|3197|2949|2115|2403|3638|2...,0.8891704499622717|0.8866369019705072|0.869616...,Action|Action_War|Action_Romance|Action_Advent...
3415,"Mirror, The (Zerkalo)",Drama,1975,2830|1859|2632|793|2512|2544|3636|3223|3470|66...,0.9870842067201842|0.9864910266601372|0.986456...,Drama|Drama|Drama|Drama|Drama|Drama|Drama|Dram...


Classifications report

In [None]:
from sklearn.metrics import classification_report

print(classification_report(mlb.transform(movies_test['genre']).tolist(), 
                            mlb.transform(movies_test['predicted_genres']).tolist(), 
                            target_names=mlb.classes_))

              precision    recall  f1-score   support

      Action       0.49      0.91      0.63        90
   Adventure       0.32      0.75      0.45        48
   Animation       0.66      0.90      0.76        21
  Children's       0.50      0.96      0.66        48
      Comedy       0.54      0.91      0.68       247
       Crime       0.15      0.52      0.24        31
 Documentary       0.57      0.67      0.62        30
       Drama       0.56      0.94      0.70       309
     Fantasy       0.50      0.86      0.63         7
   Film-Noir       0.36      0.67      0.47         6
      Horror       0.72      0.91      0.80        75
     Musical       0.32      0.85      0.47        13
     Mystery       0.26      0.44      0.33        18
     Romance       0.33      0.74      0.46        94
      Sci-Fi       0.61      0.94      0.74        48
    Thriller       0.38      0.89      0.53       106
         War       0.41      0.60      0.48        25
     Western       0.35    