# **IMPORT**

## Import libraries

In [1]:
import pandas as pd
import math

## Load dataset

In [2]:
# Location of the movies CSV, relative to the notebook's working directory
dataset_path = './Datasets/dataset.csv'

# Read the CSV through an explicitly opened text handle, which the `with`
# statement closes again as soon as pandas has parsed it
with open(dataset_path, mode='rt') as csv_file:
    df = pd.read_csv(csv_file)

# **FILTER FUNCTION**

## Define the filter function

[1] "Movies included in the array are those whose even one attribute value matches with the input value of the user."

In [None]:
def Filter(movies_df, minRating=0, maxRating=10, minYear=0, maxYear=math.inf, genres=None, directors=None, authors=None):
    """Return the movies that match at least one of the given criteria.

    The result is the union (duplicates removed) of:
      * the movies whose 'averageRating' lies in [minRating, maxRating],
      * the movies whose 'startYear' lies in [minYear, maxYear],
      * for each non-empty list among `genres`, `directors` and `authors`,
        the movies whose corresponding column contains *every* requested value.

    Parameters:
        movies_df: DataFrame with the columns 'averageRating', 'startYear',
            'genres', 'directors' and 'authors'.
        minRating, maxRating: inclusive bounds on 'averageRating'.
        minYear, maxYear: inclusive bounds on 'startYear'.
        genres, directors, authors: values to look for; None or an empty list
            disables that criterion.

    Returns:
        A DataFrame holding the matching rows of `movies_df`.

    NOTE(review): the rating and year criteria always take part in the union,
    so with the default bounds every movie with a rating in [0, 10] is
    returned, whatever the other criteria are -- confirm this is intended.
    """

    def contains_all(cell, wanted):
        # `in` is a substring test on a comma-separated string and a membership
        # test on a list; a missing cell (NaN) supports neither, so it never matches
        try:
            return all(item in cell for item in wanted)
        except TypeError:
            return False

    # Take the movies which are in the range of the rating
    ratings = movies_df['averageRating']
    rating_df = movies_df[(ratings >= minRating) & (ratings <= maxRating)]

    # Take the movies which are in the range of the year
    years = movies_df['startYear']
    year_df = movies_df[(years >= minYear) & (years <= maxYear)]

    # Only criteria that were actually requested contribute a dataframe, so an
    # omitted list no longer leaves a variable undefined at concatenation time
    matches = [rating_df, year_df]
    for column, wanted in (('genres', genres), ('directors', directors), ('authors', authors)):
        if wanted:
            # astype(bool) keeps the mask boolean even when movies_df is empty
            mask = movies_df[column].apply(lambda cell: contains_all(cell, wanted)).astype(bool)
            matches.append(movies_df[mask])

    # Take the union of all the dataframes avoiding duplicates
    return pd.concat(matches).drop_duplicates() # [1]

# **RECOMMENDER FUNCTION**

## Define the recommender function

[1] "In our research we have also found that generally a user prefer a list with five movies so we assume K equal to be 4 so that an average every K has five movies, where K is the number of cluster to be formed."

[2] "For each cluster k1, k2 , k3, k4 we assume initial centroid c1, c2, c3, c4 which corresponds to the first, sixth, eleventh, and sixteenth movie in the movie array."

[3] "The distance measure we have used to calculate the distance between data points and centroid is the Euclidean Distance."

[A] One-hot encoding for categorical variables

In [1]:
# Columns read by name from the input dataframe
_NUMERIC_FEATURES = ('averageRating', 'startYear')
_CATEGORICAL_FEATURES = ('genres', 'directors', 'authors')


def _as_value_set(cell):
    """Return the distinct values stored in one categorical cell."""
    if isinstance(cell, str):
        # Comma-separated text: split it, otherwise iterating would yield characters
        return frozenset(part.strip() for part in cell.split(',') if part.strip())
    try:
        return frozenset(cell)
    except TypeError:
        # NaN / None: the movie simply has no value for this attribute
        return frozenset()


def _categorical_distance(values, frequencies):
    """Return the L1 distance between one-hot `values` and a centroid's value frequencies."""
    # 1 for each value of the movie the centroid does not have at all
    only_movie = sum(1 for value in values if value not in frequencies)
    # (1 - frequency) for each value they share
    shared = sum(1 - frequencies[value] for value in values if value in frequencies)
    # frequency for each value only the centroid has
    only_centroid = sum(freq for value, freq in frequencies.items() if value not in values)
    return only_movie + shared + only_centroid


def _movie_distance(numeric, categorical, centroid):
    """Return the Euclidean distance between one movie and one centroid."""
    centroid_numeric, centroid_categorical = centroid
    components = [value - center for value, center in zip(numeric, centroid_numeric)]
    components.extend(
        _categorical_distance(values, frequencies)
        for values, frequencies in zip(categorical, centroid_categorical)
    )
    return math.sqrt(sum(component ** 2 for component in components)) # [3]


def _centroid_of(members, numeric, categorical):
    """Return the centroid (numeric means, per-column value frequencies) of a non-empty cluster."""
    size = len(members)
    means = [sum(numeric[m][k] for m in members) / size for k in range(len(_NUMERIC_FEATURES))]
    frequencies = []
    for k in range(len(_CATEGORICAL_FEATURES)):
        counts = {}
        for m in members:
            for value in categorical[m][k]:
                counts[value] = counts.get(value, 0) + 1
        # Mean of the one-hot vectors: share of the members having each value [A]
        frequencies.append({value: count / size for value, count in counts.items()})
    return means, frequencies


def MovieREC(movies_df, n_clusters=4, max_iter=100): # [1]
    """Cluster the best-rated movies with K-Means and return the best cluster.

    At most the 20 movies with the highest 'averageRating' are kept (ties are
    broken by 'num_ratings' when that column exists). They are clustered on
    'averageRating', 'startYear', 'genres', 'directors' and 'authors', and the
    cluster with the highest 'weight'-weighted mean rating is returned.

    Parameters:
        movies_df: DataFrame with the five feature columns above plus 'weight'.
            Categorical cells may be comma-separated strings or collections of
            values. The numeric features are expected to be non-null.
        n_clusters: number of clusters for 20 movies; scaled down
            proportionally when fewer than 20 movies are given.
        max_iter: upper bound on K-Means iterations, a safety net in case the
            assignments keep oscillating.

    Returns:
        A list with the index labels (taken from `movies_df.index`) of the
        movies in the selected cluster; an empty list for an empty input.
    """

    #  ---- Filtering top 20 movies with highest average rating ---- #

    # Return an empty list if the input list is empty
    if len(movies_df) == 0:
        return []

    # Resize the number of clusters based on the number of movies in the input list if there are less than 20 movies
    # Otherwise keep the 20 movies with highest average rating, giving priority by number of ratings
    if len(movies_df) < 20:
        n_clusters = math.ceil(n_clusters * len(movies_df) / 20)
    else:
        sort_keys = ['averageRating']
        if 'num_ratings' in movies_df.columns:
            sort_keys.append('num_ratings')
        movies_df = movies_df.sort_values(by=sort_keys, ascending=False).head(20)

    # Every cluster needs its own seed movie, so keep 1 <= n_clusters <= number of movies
    n_clusters = max(1, min(n_clusters, len(movies_df)))

    # ---- K-Means Clustering Initialization ---- #

    # Pull the features out as plain Python lists so that every later access is
    # positional and does not depend on the dataframe's index labels
    numeric = list(zip(*(movies_df[name].tolist() for name in _NUMERIC_FEATURES)))
    categorical = [
        tuple(_as_value_set(cell) for cell in row)
        for row in zip(*(movies_df[name].tolist() for name in _CATEGORICAL_FEATURES))
    ]
    n_movies = len(numeric)

    # Pick evenly spaced n_clusters positions from the list of movies, starting from position 0
    seeds = [math.floor(i * n_movies / n_clusters) for i in range(n_clusters)] # [2]
    cluster_of_seed = {position: k for k, position in enumerate(seeds)}

    # Each initial centroid is exactly its seed movie (one-hot categorical values) [A]
    centroids = [_centroid_of([position], numeric, categorical) for position in seeds]

    # ---- K-Means Algorithm ---- #

    clusters = None
    for _ in range(max(1, max_iter)):

        # Each cluster is the list of positions of its movies
        assignment = [[] for _ in range(n_clusters)]

        for position in range(n_movies):

            # A seed movie always stays in its own cluster, so no cluster can become empty
            if position in cluster_of_seed:
                assignment[cluster_of_seed[position]].append(position)
                continue

            # Find the closest centroid; the first cluster is the fallback
            # when no distance compares smaller than infinity
            closest, closest_distance = 0, math.inf
            for k, centroid in enumerate(centroids):
                distance = _movie_distance(numeric[position], categorical[position], centroid)
                if distance < closest_distance:
                    closest, closest_distance = k, distance
            assignment[closest].append(position)

        # If the clusters did not change, K-Means has converged and we can stop
        # (positions are appended in ascending order, so list equality is enough)
        if assignment == clusters:
            break

        # Otherwise adopt the new clusters and move each centroid to the mean of its members
        clusters = assignment
        centroids = [_centroid_of(members, numeric, categorical) for members in clusters]

    # ---- Pick best cluster ---- #

    ratings = [features[0] for features in numeric]
    weights = movies_df['weight'].tolist()

    def weighted_rating(members):
        total_weight = sum(weights[m] for m in members)
        if total_weight == 0:
            # A cluster without any weight cannot be ranked; never prefer it
            return -math.inf
        return sum(ratings[m] * weights[m] for m in members) / total_weight

    # Compute the weighted average movie rating for each cluster
    scores = [weighted_rating(members) for members in clusters]

    # Return the cluster with the highest weighted average movie rating
    best = max(range(n_clusters), key=scores.__getitem__) # max() returns the first index with the highest value
    return movies_df.index[clusters[best]].tolist()
