# **IMPORT**

## Import libraries

In [312]:
import pandas as pd
import math

## Load dataset

In [313]:
dataset_path = f'./Datasets/dataset.csv'

with open(dataset_path, 'rt') as f:
    df = pd.read_csv(f)

# **FILTER FUNCTION**

## Define the filter function

[1] "Movies included in the array are those whose even one attribute value matches with the input value of the user."

In [314]:
def Filter(movies_df, minRating=0, maxRating=10, minYear=0, maxYear=math.inf, genres=[], directors=[], authors=[]):

    # Get an empty dataframe
    filtered_df = pd.DataFrame(columns=movies_df.columns) # [1]

    # Take the movies which are in the range of the rating
    if minRating > 0 or maxRating < 10:
        rating_df = movies_df[movies_df['averageRating'] >= minRating]
        rating_df = rating_df[rating_df['averageRating'] <= maxRating]
        filtered_df = rating_df

    # Take the movies which are in the range of the year
    if minYear > 0 or maxYear < math.inf:
        year_df = movies_df[movies_df['startYear'] >= minYear]
        year_df = year_df[year_df['startYear'] <= maxYear]
        filtered_df = pd.concat([filtered_df, year_df]).drop_duplicates() # [1]

    # Take the movies from the movies_df such that genres list is contained in the list of comma-separated values in movies_df['genres'] if genres not empty
    if genres:
        genres_df = movies_df[movies_df['genres'].apply(lambda x: all(g in x for g in genres))]
        filtered_df = pd.concat([filtered_df, genres_df]).drop_duplicates() # [1]

    # Take the movies from the movies_df such that directors list is contained in the list of comma-separated values in movies_df['directors']
    if directors:
        directors_df = movies_df[movies_df['directors'].apply(lambda x: all(d in x for d in directors))]
        filtered_df = pd.concat([filtered_df, directors_df]).drop_duplicates() # [1]

    # Take the movies from the movies_df such that authors list is contained in the list of comma-separated values in movies_df['authors']
    if authors:
        authors_df = movies_df[movies_df['authors'].apply(lambda x: all(a in x for a in authors))]
        filtered_df = pd.concat([filtered_df, authors_df]).drop_duplicates() # [1]

    if filtered_df.empty: # [1]
        return movies_df

    return filtered_df

# **RECOMMENDER FUNCTION**

## Define the recommender function

[1] "In our research we have also found that generally a user prefer a list with five movies so we assume K equal to be 4 so that an average every K has five movies, where K is the number of cluster to be formed."

[2] "For each cluster k1, k2 , k3, k4 we assume initial centroid c1, c2, c3, c4 which corresponds to the first, sixth, eleventh, and sixteenth movie in the movie array."

[3] "The distance measure we have used to calculate the distance between data points and centroid is the Euclidean Distance."

[A] One-hot encoding for categorical variables

In [315]:
def MovieREC(movies_df, n_clusters=4): # [1]

    #  ---- Filtering top 20 movies with highest average rating ---- #
 
    # Return an empty list if the input list is empty
    if len(movies_df) == 0:
        return list()
    
    # Resize the number of clusters based on the number of movies in the input list if there are less than 20 movies
    # Otherwise keep the 20 movies with highest average rating, giving priority by number of ratings
    if len(movies_df) < 20:
        n_clusters = math.ceil(n_clusters * (len(movies_df)/20)) 
    else:
        movies_df = movies_df.sort_values(by=['averageRating', 'numVotes'], ascending=False).head(20)

    # Drop column num_ratings
    movies_df = movies_df.drop(columns=['numVotes'])

    # ---- K-Means Clustering Initialization ---- #

    # Create a dictionary which assigns to each column index in movie_df their name (averageRating, startYear, genres, directors, authors)
    num_of_columns = 5
    col_names = {i: movies_df.columns[i+1] for i in range(num_of_columns)}

    # Now let us create a matrix with num_clusers rows and one column for each column in (averageRating, startYear, genres, directors, authors)
    # Pick evenly spaced n_clusters indeces from the list of movies, starting from the index 0, up to len-1
    centroids_init_indexes = [math.floor(i * len(movies_df)/n_clusters)+1 for i in range(n_clusters)] # [2]
    # Store for each centroid (movie) the values of its corresponding columns in the movies_df for averageRating and startYear
    centroids = [[movies_df.iloc[centroids_init_indexes[j]][col_names[i]] for i in range(2)] for j in range(n_clusters)]
    # And append to each centroid (movie) three dictionaries which assign 1 to the values in the corresponding list, for genres, directors and authors
    for i in range(n_clusters):
        for j in range(2, num_of_columns):
            centroids[i].append( {x: 1 for x in movies_df.iloc[centroids_init_indexes[i]][col_names[j]].split(',')} ) # [A]

    # Create a list of clusters, where each cluster is the list of indexes of the movies in the movies_df
    clusters = [[] for i in range(n_clusters)]
        
    # Initialize to True a "changing" flag, which will be set to False when the centroids stop changing
    changing = True

    # ---- K-Means Algorithm ---- #

    # While the centroids keep changing...
    while changing:

        # Store a copy of the previous clusters
        prev_clusters = clusters.copy()

        # Empty the list of clusters
        clusters = [[] for i in range(n_clusters)]

        # Set the changing flag to False
        changing = False

        # Now, for each movie in the movies_df compute its cluster
        for i in range(len(movies_df)):

            # Save a tuple (math.inf, -1) to store the minimum distance and the index of the closest centroid
            min_distance = (math.inf, -1)

            # And compute the closest centroid for the movie i
            for c_idx in range(n_clusters):

                # Init the one-dimension distances list
                tempDistances = []

                # Compute the 1D distances for the averageRating and startYear columns
                for j in range(2):
                    tempDistances.append(movies_df.iloc[i][col_names[j]] - centroids[c_idx][j])

                # Compute the 1D one-hot tempDistance for genres, directors and authors as well
                for j in range(2, num_of_columns):

                    # By summing 1 for each value in movies_df[col_names[j]][i] not in centroids[c_idx][j].keys()
                    tempBoth = sum([1 for x in movies_df.iloc[i][col_names[j]] if x not in centroids[c_idx][j].keys()])
                    # Summing (1 - centroids[c_idx][j][x]) for each x both in movies_df[col_names[j]][i] and in centroids[c_idx][j].keys()
                    tempMovie = sum([1 - centroids[c_idx][j][x] for x in movies_df.iloc[i][col_names[j]] if x in centroids[c_idx][j].keys()])
                    # Summing centroids[c_idx][j][x] for each x in centroids[c_idx][j].keys() and not in movies_df[col_names[j]][i]
                    tempCentroid = sum([centroids[c_idx][j][x] for x in centroids[c_idx][j].keys() if x not in movies_df.iloc[i][col_names[j]]])

                    # And then appending the overall sum to the 1D distances list
                    tempDistances.append(tempBoth + tempMovie + tempCentroid)

                # Compute the Euclidean distance between the movie i and the centroid c_idx
                distance = math.sqrt(sum([x**2 for x in tempDistances])) # [3]
                                    
                # If the distance is less than the minimum distance, update the minimum distance and the index of the closest centroid
                if distance < min_distance[0]:
                    min_distance = (distance, c_idx)

            # To finally add the movie i to the cluster of the closest centroid
            clusters[min_distance[1]].append(i)

        # Now compute the new centroids as the average of the movies in each cluster
        for i in range(n_clusters):

            if len(clusters[i]) == 0:
                continue

            # Compute the average of the averageRating and startYear columns (easy)
            centroids[i] = [sum([movies_df.iloc[m][col_names[j]] for m in clusters[i]])/len(clusters[i]) for j in range(2)]

            # And append three dictionaries which assign the avg for each value in genres, directors and authors with respect to the movies in the cluster
            # Remember we started using one-hot encoding, so the average will be a number between 0 and 1 for each value of each column
            for j in range(2, num_of_columns):
                # Compute occurrencies of values in the current column for all the movies in the current cluster
                centroids[i].append({})
                for m in clusters[i]:
                    for v in movies_df.iloc[m][col_names[j]].split(','):
                        if v in centroids[i][j].keys():
                            centroids[i][j][v] += 1
                        else:
                            centroids[i][j][v] = 1
                # And make them an "average"
                centroids[i][j] = {key: value / len(clusters[i]) for key, value in centroids[i][j].items()}  

        # Now set the changing flag to True if CLUSTERS have changed
        for i in range(n_clusters):
            for j in range(num_of_columns):
                if set(clusters[i]) != set(prev_clusters[i]):
                    changing = True
                    break

        # If they did not change, K-Means has converged and we will stop

    # ---- Pick best cluster ---- #
            
    clusters_dict = {}

    # Compute the weighted average movie rating for each cluster
    # Each cluster contains at least one movie with non-zero weight, so we can safely compute the weighted average
    for idx, cl in enumerate(clusters):
        weights_sum = 0
        clusters_dict[idx] = 0
        for i in cl:
            weights_sum += movies_df.iloc[i]['weight']
            clusters_dict[idx] += movies_df.iloc[i]['averageRating'] * movies_df.iloc[i]['weight']
        if weights_sum != 0:
            clusters_dict[idx] /= weights_sum

    # Return the cluster with the highest weighted average movie rating
    return movies_df.iloc[clusters[max(clusters_dict, key=clusters_dict.get)]].sort_values(by='averageRating') # max() returns the first key with the highest value


# **EXECUTION**

## Check that the recommendations are deterministic

Execute once.

In [316]:
MovieREC(Filter(df))

Unnamed: 0,tconst,averageRating,startYear,genres,directors,writers,weight
11948,tt0446623,10.0,1989,Family,nm1829612,nm1855809,10.025431
17976,tt12426598,10.0,2005,"Adult,Animation,Comedy",nm5949831,nm5949831,10.021278


Execute twice.

In [317]:
MovieREC(Filter(df))

Unnamed: 0,tconst,averageRating,startYear,genres,directors,writers,weight
11948,tt0446623,10.0,1989,Family,nm1829612,nm1855809,10.025431
17976,tt12426598,10.0,2005,"Adult,Animation,Comedy",nm5949831,nm5949831,10.021278


## Filtering

Let's try to filter out the preceiding results to see some different ones.

In [318]:
MovieREC(Filter(df, minRating=2, maxRating=4))

Unnamed: 0,tconst,averageRating,startYear,genres,directors,writers,weight
38421,tt9319770,4.0,2021,"Drama,Sci-Fi","nm0177512,nm0086745,nm1171810,nm0502497,nm1325...","nm0248404,nm2325207,nm0676671,nm8872137,nm1307...",2.042632


Let's try to di the same by filtering by another attribute.

In [319]:
MovieREC(Filter(df, minYear=2006, maxYear=2020))

Unnamed: 0,tconst,averageRating,startYear,genres,directors,writers,weight
36833,tt8138362,10.0,2017,Comedy,nm4579990,"nm4303373,nm4304082,nm7529108",10.19878


Although, filtering by low ratings is usually not effective when filtering by more attributes as well...

Indeed, the system only considers the top 20 movies by average rating once the movies are filtered, and the filter conditions are not exclusive (see above).

In [320]:
MovieREC(Filter(df, minRating=4.5, maxRating=4.5, minYear=1940, maxYear=2006))

Unnamed: 0,tconst,averageRating,startYear,genres,directors,writers,weight
24806,tt2055479,9.9,1958,Family,nm0956985,nm0956985,9.917714
11042,tt0404590,9.8,1961,Music,nm0498653,nm0143922,9.805999
7964,tt0259785,9.7,1958,"Drama,Family,Romance",nm0111357,"nm0017301,nm1522821",9.703768
