In [1]:
from __future__ import annotations # To prevent Subscript for class "list" will generate runtime exception; enclose type annotation in quotes
import seaborn as sns
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from math import sqrt
import scipy
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline


In [2]:

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

# Class implementing the K-Means++ algorithm


class KMeansPlusPlus:
    # Initialization of the KMeans++ class with number of clusters (K), maximum iterations, and a random seed
    def __init__(self, K, max_iter, train_ids, random_state=42):
        self.K = K  # Number of clusters
        self.max_iter = max_iter  # Maximum number of iterations
        self.random_state = random_state  # Seed for random number generator
        self.centroids = []  # List to store the centroids
        self.train_ids = train_ids

    # Method to initialize centroids using the K-Means++ algorithm
    def initialize_centroids(self, X):
        np.random.seed(self.random_state)  # Set the random seed
        # Randomly select the first centroid
        centroids = [X[np.random.randint(X.shape[0])]]
        for _ in range(1, self.K):  # For each remaining centroid
            dist_sq = np.array(
                [min([euclidean_distance(x, centroid) ** 2 for centroid in centroids]) for x in X])
            # Compute the probability of selecting each point
            probabilities = dist_sq / dist_sq.sum()
            # Cumulative sum of probabilities
            cumulative_probabilities = probabilities.cumsum()
            r = np.random.rand()  # Draw a random number
            for j, p in enumerate(cumulative_probabilities):  # Select the next centroid
                if r < p:
                    centroids.append(X[j])
                    break
        self.centroids = np.array(centroids)  # Update the centroids list

    # Method to find the closest centroid for each sample
    def closest_centroid(self, sample):
        distances = [euclidean_distance(sample, centroid)
                     for centroid in self.centroids]
        return np.argmin(distances)  # Return the index of the closest centroid

    # Method to assign each sample to the closest centroid, creating clusters
    def create_clusters(self, X):
        # Initialize clusters as a list of empty lists
        user_to_cluster = np.zeros(X.shape[0], dtype=int)
        clusters = [[] for _ in range(self.K)]
        clusters2 = [[] for _ in range(self.K)]
        for idx, sample in enumerate(X):  # For each sample
            centroid_idx = self.closest_centroid(
                sample)  # Find the claosest centroid
            # Assign the sample to the cluster
            user_to_cluster[idx] = centroid_idx
            clusters[centroid_idx].append(self.train_ids[idx])
            clusters2[centroid_idx].append(idx)
        return clusters, clusters2, user_to_cluster # Return the clusters

    def cluster_prediction(self, X):
        # Initialize clusters as a list of empty lists
        test_to_cluster = np.zeros(X.shape[0], dtype=int)
        for idx, sample in enumerate(X):  # For each sample
            centroid_idx = self.closest_centroid(
                sample)  # Find the closest centroid
            # Assign the sample to the cluster
            test_to_cluster[idx] = centroid_idx
        return test_to_cluster # Return the clusters


    # Method to update the centroids based on the current clusters
    def update_centroids(self, X, clusters2):
        # Initialize new centroids as zeros
        centroids = np.zeros((self.K, X.shape[1]))
        for cluster_idx, cluster2 in enumerate(clusters2):  # For each cluster
            if cluster2:
                # Calculate the mean of the cluster
                cluster_mean = np.mean(X[cluster2], axis=0)
                centroids[cluster_idx] = cluster_mean  # Update the centroid
        return centroids  # Return the updated centroids

    # Method to run the K-Means++ algorithm and return the final clusters and centroids
    def predict(self, X):

      self.initialize_centroids(X)  # Initialize the centroids
      for _ in range(self.max_iter):  # For each iteration
          clusters,clusters2,user_to_cluster = self.create_clusters(X)  # Create clusters
          centroids_old = self.centroids.copy()  # Make a copy of the current centroids to compare after update
          new_centroids = self.update_centroids(X, clusters2)  # Update centroids

          # Check for convergence using the _isConverged method
          if self._isConverged(centroids_old, new_centroids, tol=1e-4):
              break  # Exit the loop if centroids have effectively stopped changing

          self.centroids = new_centroids  # Update centroids for the next iteration
      clusters_adj,clusters2,user_to_cluster =  self.create_clusters(X)
      # Return the final clusters and centroids
      # print("Debugging:", type(self.centroids), type(clusters_adj), type(user_to_cluster))
      return  self.centroids, clusters_adj, user_to_cluster

    def _isConverged(self, centroids_old, centroids, tol=1e-4):
      distances = [euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K)]
      return sum(distances) < tol

    # Method to fit the model to the data, alias for predict()
    def fit(self, X):
        return self.predict(X)

    def inertia(self, X):
      total_inertia = 0
      for idx, sample in enumerate(X):
          centroid_idx = self.closest_centroid(sample)
          centroid = self.centroids[centroid_idx]
          distance = euclidean_distance(sample, centroid)
          total_inertia += distance ** 2
      return total_inertia



In [3]:
import pandas as pd
import numpy as np
import math
from decimal import Decimal


def vector_norm(x):
    return Decimal(np.dot(x, x)).sqrt()


def qr_decomposition(A):
    # Get the shape of the input matrix
    m, n = A.shape

    # Initialize the matrices
    Q = np.zeros((m, n))
    R = np.zeros((n, n))

    # Perform the Gram-Schmidt orthogonalization
    for j in range(n):
        v = A[:, j]
        for i in range(j):
            R[i, j] = np.dot(Q[:, i], A[:, j])
            v = v - R[i, j] * Q[:, i]
        # R[j, j] = np.linalg.norm(v)
        R[j, j] = vector_norm(v)
        Q[:, j] = v / R[j, j]

    return Q, R

def rank_of_matrix(mat):
    """
    This function calculates the rank of a matrix 'mat' using Gaussian elimination method.
    It returns the rank of the matrix.
    """
    # Define the dimensions of the matrix
    m = len(mat)
    n = len(mat[0])

    rank = min(m, n)

    # Perform Gaussian elimination
    for row in range(rank):
        # Check if the diagonal element is not zero
        if mat[row][row] != 0:
            for col in range(row + 1, m):
                # Calculate the factor by which to multiply the current row
                # to eliminate the non-zero element in the current column
                factor = mat[col][row] / mat[row][row]
                for i in range(row, n):
                    # Update the current row by subtracting the product of the factor
                    # and the corresponding element in the row being eliminated from it
                    mat[col][i] -= factor * mat[row][i]
        else:
            # If the diagonal element is zero, look for a non-zero element below it
            # and swap the rows if necessary
            reduce_rank = True
            for i in range(row + 1, m):
                if mat[i][row] != 0:
                    mat[row], mat[i] = mat[i], mat[row]
                    reduce_rank = False
                    break
            if reduce_rank:
                rank -= 1
                for i in range(row, m):
                    mat[i][row] = mat[i][rank]

    return rank

def eig(A):
    """
    Parameters:
    A (numpy.ndarray): The matrix to compute eigenvalues and eigenvectors.
    """
    # set the number of iterations and tolerance level
    max_iter = 100
    tol = 1e-6

    # initialize the eigenvectors
    m, n = A.shape
    eigvecs = np.random.randn(n, n)

    # compute the largest eigenvalue and eigenvector
    for i in range(max_iter):
        # compute the new eigenvector
        eigvecs_new = A @ eigvecs
        # eigvecs_new, _ = np.linalg.qr(eigvecs_new)
        eigvecs_new, _ = qr_decomposition(eigvecs_new)
        if np.allclose(eigvecs_new, eigvecs, rtol=tol):
            break
        eigvecs = eigvecs_new

    # compute the eigenvalues
    eigvals = np.diag(eigvecs.T @ A @ eigvecs)

    return eigvals, eigvecs

def full_SVD(A):
    """
    Compute Full Singular Value Decomposition of matrix A.

    Args:
        A: numpy.array, matrix to be decomposed

    Returns:
        U: numpy.array, matrix containing left singular vectors
        s: numpy.array, array containing singular values
        Vt: numpy.array, matrix containing right singular vectors (transposed)
    """
    m, n = A.shape
    if m < n:
        S = A @ A.T
    else:
        S = A.T @ A

    # Compute eigenvalues and eigenvectors
    eigvals, eigvecs = eig(S)
    # Sort the eigenvalues and eigenvectors
    sorted_indices = np.argsort(eigvals)[::-1]
    eigvals = eigvals[sorted_indices]
    eigvecs = eigvecs[:, sorted_indices]

    # Compute the singular values
    s = np.sqrt(eigvals)
    # Filter out small negative or zero eigenvalues due to numerical errors
    s = np.maximum(s, 0)

    if m < n:
        U = eigvecs
        # For full SVD, compute Vt using the formula that accounts for all singular values
        sigma_inv = np.zeros_like(A.T)
        np.fill_diagonal(sigma_inv[:m, :m], 1/s)
        Vt = (sigma_inv @ U.T @ A).T
    else:
        # Compute U directly for A larger than or equal to n
        U = A @ eigvecs @ np.diag(1/s)
        Vt = eigvecs.T

    # Ensure s is returned as a 1D array of singular values
    s = s[:min(m, n)]

    return U, s, Vt


def reduced_SVD(A, k):
    """
    Compute Reduced Singular Value Decomposition of matrix A using NumPy.

    Args:
        A: numpy.array, matrix to be decomposed
        k: int, number of singular values and vectors to retain

    Returns:
        U: numpy.array, matrix containing left singular vectors
        s: numpy.array, array containing singular values
        Vt: numpy.array, matrix containing right singular vectors (transposed)
    """
    # Assume A is m x n
    m, n = A.shape
    if m < n:
        S = A @ A.T
        k = min(k, rank_of_matrix(S.copy()))
    else:
        S = A.T @ A
        k = min(k, rank_of_matrix(S.copy()))

    eigvals, eigvecs = eig(S)
    sorted_indices = np.argsort(eigvals)[::-1]
    eigvals = eigvals[sorted_indices][:k]  # Keep top k eigenvalues
    eigvecs = eigvecs[:, sorted_indices][:, :k]  # Keep top k eigenvectors

    s = np.sqrt(eigvals)

    if m < n:
        U = eigvecs
        Vt = np.dot(np.diag(1/s), np.dot(U.T, A)).T  # Compute V^T
    else:
        U = np.dot(A, eigvecs) @ np.diag(1/s)
        Vt = eigvecs.T

    # Make sure U, s, Vt conform to the expected dimensions
    s = s[:k]  # Ensure s is of length k

    return U[:, :k], s, Vt[:k, :]

In [4]:
cols = ['UserID', 'MovieID' , 'Rating', 'Timestamp']
df = pd.read_csv('ratings.dat', sep = '::' , names = cols )

  df = pd.read_csv('ratings.dat', sep = '::' , names = cols )


In [5]:
df = df.sort_values(by='UserID')
df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1177,5,978300760
29,1,736,3,978824268
30,1,2226,4,978824291
31,1,3118,4,978300019
32,1,1527,4,978824330
...,...,...,...,...
999977,6040,1554,3,964828599
999976,6040,1547,1,956716374
999975,6040,3114,5,984195682
999983,6040,298,2,956704716


In [6]:
df = df.drop('Timestamp', axis=1)

In [7]:
cols_temp = ['MovieID' , 'Title' , 'Genres']
df_temp = pd.read_csv('movies.dat', sep = '::' , names = cols_temp, encoding='ISO-8859-1' )

  df_temp = pd.read_csv('movies.dat', sep = '::' , names = cols_temp, encoding='ISO-8859-1' )


In [8]:
df_temp = df_temp.sort_values(by='MovieID')
df_temp

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3879,Meet the Parents (2000),Comedy
3879,3880,Requiem for a Dream (2000),Drama
3880,3881,Tigerland (2000),Drama
3881,3882,Two Family House (2000),Drama


In [9]:
(df_temp.head())

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [10]:

# Assuming you've already read the 'movies.dat' into movies_df and 'ratings.dat' into ratings_df

# First, merge the two dataframes on 'MovieID'
merged_df = pd.merge(df, df_temp, on='MovieID')



In [11]:
merged_df

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
0,1,1177,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1177,5,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1177,4,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1177,4,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1177,5,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...
1000204,5949,2130,5,Modulations (1998),Documentary
1000205,5675,2635,3,Broken Vessels (1998),Drama
1000206,5780,2777,1,White Boys (1999),Drama
1000207,5851,3539,5,One Little Indian (1973),Comedy|Drama|Western


In [12]:
# Assuming 'merged_df' is your merged DataFrame containing 'UserID', 'Title', and 'Rating'
user_movie_ratings = merged_df.pivot(index='UserID', columns='Title', values='Rating')

# Now 'user_movie_ratings' is a DataFrame where each row represents a unique UserID,
# each column represents a movie title, and the cell values are the ratings.
# 'NaN' values indicate missing ratings where a user has not rated a particular movie.

# Optionally, you can fill NaN values with zeros if that's appropriate for your use case
# user_movie_ratings = user_movie_ratings.fillna(0)

# Reset the index to make UserID a column instead of an index, if required
user_movie_ratings.reset_index(inplace=True)

# Display the first few rows to confirm the structure is as expected
user_movie_ratings


Title,UserID,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,,,,,,,,,,...,,,,,,,,,,
2,3,,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,,3.0,,,,,2.0,4.0,,...,,3.0,,,,,,,,2.0
6036,6037,,,,,,,,,,...,,,,,,,,,,
6037,6038,,,,,,,,,,...,,,,,,,,,,
6038,6039,,,,,,,,,,...,,3.0,,,,,,,,


In [13]:
test_ids = [0,2,3]
sub = user_movie_ratings.loc[test_ids]
sub = sub.drop('UserID', axis=1)
sub

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
0,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


In [14]:
value = user_movie_ratings.at[6035, '101 Dalmatians (1961)']
value

4.0

In [15]:
rat = user_movie_ratings.at[991,'12 Angry Men (1957)']
rat

5.0

In [16]:
# Assuming user_movie_ratings is your DataFrame and you want the mean starting from the 1st column (zero-based indexing)
val = user_movie_ratings.loc[991, user_movie_ratings.columns[1:]].mean()

print(f"Mean value from the 1st column onwards for user at index 991: {val}")


Mean value from the 1st column onwards for user at index 991: 3.4193548387096775


In [17]:
# Assuming user_movie_ratings is your DataFrame and index 991 exists
max_value_at_991 = user_movie_ratings.loc[991].max()
min_value_at_991 = user_movie_ratings.loc[991].min()

print(f"Maximum value for user at index 991: {max_value_at_991}")
print(f"Minimum value for user at index 991: {min_value_at_991}")


Maximum value for user at index 991: 992.0
Minimum value for user at index 991: 1.0


In [18]:
row = user_movie_ratings.loc[991]

# Find the column name of the maximum value in this row
max_col_name = row.idxmax()
max_col_name

'UserID'

In [19]:
val = user_movie_ratings.at[6035, '101 Dalmatians (1961)']
val

4.0

In [20]:
rating = user_movie_ratings.at[6039, 'Drive Me Crazy (1999)']
rating

nan

In [21]:
from __future__ import annotations # To prevent Subscript for class "list" will generate runtime exception; enclose type annotation in quotes
import seaborn as sns
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from math import sqrt
import scipy
from sklearn.preprocessing import MinMaxScaler
# from Kmeans import *
#%matplotlib inline



def pearson_correlation(user_movie_ratings, user1, user2):

    non_nan_counts = user_movie_ratings.notna().sum(axis=0)
    # Create a new DataFrame with a single row containing these counts
    counts_df = pd.DataFrame([non_nan_counts], index=['count_non_nan'])
    counts_df = counts_df.iloc[:, 1:]



    ratings1 = user_movie_ratings.loc[user_movie_ratings['UserID'] == user1, user_movie_ratings.columns[1:]]
    ratings2 = user_movie_ratings.loc[user_movie_ratings['UserID'] == user2, user_movie_ratings.columns[1:]]

    mask1 = ratings1.notna().iloc[0]
    mask2 = ratings2.notna().iloc[0]
    common_indices = mask1 & mask2

    mean_user1 = 0
    mean_user2 = 0
    count  = 0

    for column_name in common_indices[common_indices].index:
      rating_value1 = ratings1.iloc[0][column_name]
      rating_value2 = ratings2.iloc[0][column_name]
      mean_user1 +=   rating_value1
      mean_user2 +=   rating_value2
      count += 1

    if(count==0):
      return 0

    mean_user1  = mean_user1/count
    mean_user2  = mean_user2/count


    numerator = 0
    d1 = 0
    d2 = 0
    for column_name in common_indices[common_indices].index:
      n1 = (ratings1.iloc[0][column_name] - mean_user1)
      n2 = (ratings2.iloc[0][column_name]  - mean_user2)

      weight  = math.log(6040/(counts_df.iloc[0][column_name]))
      d1 += n1*n1*weight
      d2 += n2*n2*weight
      num = weight*n1*n2
      numerator += num



    denominator1 = sqrt(d1)
    denominator2 = sqrt(d2)


    if denominator1*denominator2==0:
      return 0
    else:
      return numerator/(denominator1*denominator2)


# def calculate_user_similarity_list(target_user, user_list_in_cluster, ratings_matrix):
#     """Calculate similarity between the target user and each user in the user list."""
#     similarity_list = []
#     for user_id in user_list_in_cluster:
#         similarity = pearson_correlation(ratings_matrix, target_user, user_id)
#         similarity_list.append((user_id, similarity))
#     return similarity_list


In [22]:
val = pearson_correlation(user_movie_ratings,3, 385)

val

-0.22294128969918323

In [23]:
def count_non_nan_ratings(df):
    # Count non-NaN values for each column in the DataFrame
    non_nan_counts = df.notna().sum(axis=0)
    # Create a new DataFrame with a single row containing these counts
    counts_df = pd.DataFrame([non_nan_counts], index=['count_non_nan'])
    return counts_df

# Usage:
# Assuming 'full_ratings_df' is your full DataFrame with all user ratings
counts_df = count_non_nan_ratings(user_movie_ratings)
counts_df = counts_df.iloc[:, 1:]
counts_df

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
count_non_nan,37,70,52,303,199,2,700,565,364,616,...,79,379,10,109,2,29,301,2,23,410


In [24]:
# Filter rows where 'UserID' equals 1, then select columns starting from the 3rd column
# ratings1 = user_movie_ratings.loc[user_movie_ratings['UserID'] == 1, user_movie_ratings.columns[1:]]
ratings2 = user_movie_ratings.loc[user_movie_ratings['UserID'] == 6040, user_movie_ratings.columns[1:]]
ratings2

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
6039,,,,,,,,,,5.0,...,,,,,,,,,,


In [25]:
# mask1 = ratings1.notna().iloc[0]
# mask2 = ratings2.notna().iloc[0]
# common_indices = mask1 & mask2
# common_indices

In [26]:
# for column_name in common_indices[common_indices].index:
#   print(counts_df.iloc[0][column_name])

In [27]:
# # Iterate over the common non-NaN indices and access the values
# for column_name in common_indices[common_indices].index:
#     # print(column_name)
#     rating_value1 = ratings1.iloc[0][column_name]
#     rating_value2 = ratings2.iloc[0][column_name]
#     print(f"Column: {column_name}, Ratings1: {rating_value1}, Ratings2: {rating_value2}")

In [28]:
# ratings1 = ratings1[common_indices]
# for column_name in ratings1.columns:
#     value = ratings1.iloc[0][column_name]
#     print(f"Column: {column_name}, Value: {value}")

In [29]:
import pandas as pd
import numpy as np

# Assuming `movies_df` and `ratings_df` are your DataFrames for movies and ratings
movies_df = df_temp.copy()
ratings_df = df.copy()

# Split the 'Genres' column into a list of genres
movies_df['Genres'] = movies_df['Genres'].str.split('|')


# # Explode the 'Genres' into separate rows
# movies_df = movies_df.explode('Genres')
# movies_df

# # # Merge the ratings with the exploded movies DataFrame on 'MovieID'
# # genre_ratings_df = pd.merge(ratings_df, movies_df, on='MovieID')

# # # Pivot to get users in rows and genres in columns with ratings as values
# # genre_matrix = genre_ratings_df.pivot_table(index='UserID', columns='Genres', values='Rating', aggfunc='mean')

# # # Calculate the average rating for each user across all genres they have rated
# # user_genre_avg = genre_matrix.mean(axis=1)

# # # Fill NaN values in the genre pivot table with each user's average rating across genres
# # # For users who haven't rated any movie in a genre, fill with 2.5 or the global average
# # for user_id in genre_matrix.index:
# #     user_avg = user_genre_avg.loc[user_id]
# #     genre_matrix.loc[user_id] = genre_matrix.loc[user_id].fillna(user_avg if not np.isnan(user_avg) else 2.5)

# # # Now genre_matrix represents the average rating for all movies in each genre calculated separately for each user
# # # Display the genre matrix
# # print(genre_matrix.head())
movies_df

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),"[Animation, Children's, Comedy]"
1,2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
3878,3879,Meet the Parents (2000),[Comedy]
3879,3880,Requiem for a Dream (2000),[Drama]
3880,3881,Tigerland (2000),[Drama]
3881,3882,Two Family House (2000),[Drama]


In [30]:
movies_df = movies_df.explode('Genres')
movies_df

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children's
0,1,Toy Story (1995),Comedy
1,2,Jumanji (1995),Adventure
1,2,Jumanji (1995),Children's
...,...,...,...
3879,3880,Requiem for a Dream (2000),Drama
3880,3881,Tigerland (2000),Drama
3881,3882,Two Family House (2000),Drama
3882,3883,"Contender, The (2000)",Drama


In [31]:
genre_ratings_df = pd.merge(ratings_df, movies_df, on='MovieID')
genre_ratings_df

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
0,1,1177,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1177,5,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1177,4,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1177,4,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1177,5,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...
2101810,5780,2777,1,White Boys (1999),Drama
2101811,5851,3539,5,One Little Indian (1973),Comedy
2101812,5851,3539,5,One Little Indian (1973),Drama
2101813,5851,3539,5,One Little Indian (1973),Western


In [32]:
genre_ratings_df_temp = genre_ratings_df.sort_values(by='MovieID')
genre_ratings_df_temp

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
22135,1223,1,5,Toy Story (1995),Children's
25054,4058,1,3,Toy Story (1995),Children's
25053,4058,1,3,Toy Story (1995),Animation
25052,4055,1,3,Toy Story (1995),Comedy
25051,4055,1,3,Toy Story (1995),Children's
...,...,...,...,...,...
1658591,2000,3883,4,"Contender, The (2000)",Thriller
1658592,2020,3883,2,"Contender, The (2000)",Drama
1658593,2020,3883,2,"Contender, The (2000)",Thriller
1658580,1962,3883,3,"Contender, The (2000)",Drama


In [33]:
genre_matrix = genre_ratings_df.pivot_table(index='UserID', columns='Genres', values='Rating', aggfunc='mean')
genre_matrix

Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,4.200000,4.000000,4.111111,4.250000,4.142857,4.000000,,4.428571,4.00,,,4.285714,,3.666667,4.333333,3.666667,5.000000,
2,3.500000,3.736842,,,3.560000,3.583333,,3.898734,3.00,4.000000,3.000000,,3.333333,3.708333,3.588235,3.483871,3.733333,4.333333
3,3.956522,4.000000,4.000000,4.000000,3.766667,,,4.000000,4.50,,2.666667,4.000000,3.000000,3.800000,3.833333,3.800000,4.000000,4.666667
4,4.157895,3.833333,,4.000000,,5.000000,,4.166667,4.50,,4.333333,,,4.000000,3.555556,3.500000,3.333333,4.500000
5,2.612903,3.000000,4.000000,3.833333,3.410714,3.285714,3.666667,3.096154,,4.000000,2.800000,3.333333,3.125000,3.100000,3.066667,2.846154,3.500000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3.000000,2.987952,3.911765,3.444444,3.203065,3.528302,3.909091,3.505376,3.00,4.058824,2.986486,3.709677,3.411765,3.352459,2.834320,3.142857,3.785714,3.642857
6037,3.642857,4.000000,4.000000,3.666667,3.576271,3.833333,4.000000,3.877551,4.25,3.444444,4.111111,4.000000,3.692308,3.681818,3.692308,3.705882,4.000000,3.750000
6038,3.000000,4.000000,3.666667,3.000000,3.833333,,,3.888889,,,2.500000,,,4.166667,4.000000,,4.000000,
6039,4.000000,4.100000,3.615385,3.529412,3.723077,4.000000,,4.000000,3.60,4.500000,4.000000,3.690476,4.176471,3.800000,4.250000,4.142857,4.111111,4.500000


In [34]:
# Calculate the average rating for each user across all genres they have rated
user_genre_avg = genre_matrix.mean(axis=1)

# Fill NaN values in the genre pivot table with each user's average rating across genres
# For users who haven't rated any movie in a genre, fill with 2.5 or the global average
for user_id in genre_matrix.index:
    user_avg = user_genre_avg.loc[user_id]
    genre_matrix.loc[user_id] = genre_matrix.loc[user_id].fillna(user_avg if not np.isnan(user_avg) else 2.5)
genre_matrix

Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,4.200000,4.000000,4.111111,4.250000,4.142857,4.000000,4.160379,4.428571,4.000000,4.160379,4.160379,4.285714,4.160379,3.666667,4.333333,3.666667,5.000000,4.160379
2,3.500000,3.736842,3.604239,3.604239,3.560000,3.583333,3.604239,3.898734,3.000000,4.000000,3.000000,3.604239,3.333333,3.708333,3.588235,3.483871,3.733333,4.333333
3,3.956522,4.000000,4.000000,4.000000,3.766667,3.865990,3.865990,4.000000,4.500000,3.865990,2.666667,4.000000,3.000000,3.800000,3.833333,3.800000,4.000000,4.666667
4,4.157895,3.833333,4.073343,4.000000,4.073343,5.000000,4.073343,4.166667,4.500000,4.073343,4.333333,4.073343,4.073343,4.000000,3.555556,3.500000,3.333333,4.500000
5,2.612903,3.000000,4.000000,3.833333,3.410714,3.285714,3.666667,3.096154,3.333920,4.000000,2.800000,3.333333,3.125000,3.100000,3.066667,2.846154,3.500000,4.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,3.000000,2.987952,3.911765,3.444444,3.203065,3.528302,3.909091,3.505376,3.000000,4.058824,2.986486,3.709677,3.411765,3.352459,2.834320,3.142857,3.785714,3.642857
6037,3.642857,4.000000,4.000000,3.666667,3.576271,3.833333,4.000000,3.877551,4.250000,3.444444,4.111111,4.000000,3.692308,3.681818,3.692308,3.705882,4.000000,3.750000
6038,3.000000,4.000000,3.666667,3.000000,3.833333,3.605556,3.605556,3.888889,3.605556,3.605556,2.500000,3.605556,3.605556,4.166667,4.000000,3.605556,4.000000,3.605556
6039,4.000000,4.100000,3.615385,3.529412,3.723077,4.000000,3.984635,4.000000,3.600000,4.500000,4.000000,3.690476,4.176471,3.800000,4.250000,4.142857,4.111111,4.500000


In [35]:
mtr = genre_matrix.loc[3]

In [36]:
mtr

Genres
Action         3.956522
Adventure      4.000000
Animation      4.000000
Children's     4.000000
Comedy         3.766667
Crime          3.865990
Documentary    3.865990
Drama          4.000000
Fantasy        4.500000
Film-Noir      3.865990
Horror         2.666667
Musical        4.000000
Mystery        3.000000
Romance        3.800000
Sci-Fi         3.833333
Thriller       3.800000
War            4.000000
Western        4.666667
Name: 3, dtype: float64

In [37]:
from scipy.linalg import svd
genre_matrix_np = genre_matrix.to_numpy()

# Perform SVD
U, s, Vt = full_SVD(genre_matrix_np)

# Count non-zero singular values
non_zero_singular_values = np.sum(s > 0.0001)  # Using a small threshold to account for computational precision

print(f"Number of non-zero singular values: {non_zero_singular_values}")

Number of non-zero singular values: 18


In [38]:
import numpy as np
import pandas as pd

def get_predictions(user_movie_ratings, test_ids, test_to_cluster, cluster_adj):

    predictions = {}
    # max_similarities = {}  # Dictionary to store the maximum similarity for each test_id
    # min_similarities = {}
    for test_idx, test_id in enumerate(test_ids):
        cluster_label = test_to_cluster[test_idx]
        similar_users = cluster_adj[cluster_label]

        weighted_sums = pd.Series(0, index=user_movie_ratings.columns[1:])
        sim_sums = pd.Series(0, index=user_movie_ratings.columns[1:])

        user_mean = user_movie_ratings.loc[test_id-1,user_movie_ratings.columns[1:]].mean()
        for other_user_id in similar_users:
            if other_user_id == test_id:
                continue
            other_user_mean = user_movie_ratings.loc[other_user_id-1,user_movie_ratings.columns[1:]].mean()
            similarity = pearson_correlation(user_movie_ratings, test_id, other_user_id)

            for movie in user_movie_ratings.columns[1:]:
                other_user_rating = user_movie_ratings.at[other_user_id-1, movie]
                if not np.isnan(other_user_rating):
                    weighted_sums[movie] += similarity * (other_user_rating - other_user_mean)
                    sim_sums[movie] += abs(similarity)
            # movie = 'Event Horizon (1997)'
            # other_user_rating = user_movie_ratings.at[other_user_id-1, movie]
            # if not np.isnan(other_user_rating):
            #     weighted_sums[movie] += similarity * (other_user_rating - other_user_mean)
            #     print(other_user_rating - other_user_mean)
            #     print(other_user_id)
            #     sim_sums[movie] += abs(similarity)


        sim_sums[sim_sums == 0] = np.nan  # to avoid division by zero
        user_predictions = user_mean+weighted_sums / sim_sums
        predictions[test_id] = user_predictions.fillna(user_mean)

    # Assuming 'predictions' is a dictionary
    predictions_df = pd.DataFrame(predictions).T
    for col in predictions_df.columns:
      for index in predictions_df.index:
          value = predictions_df.at[index, col]
          if value > 5:
             predictions_df.at[index, col] = 5
          elif value < 1:
             predictions_df.at[index, col] = 1


    return predictions_df

# Example usage:
# Assuming the pearson_correlation function is already defined
# Call get_predictions with appropriate arguments and store the maximum similarities.


In [73]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
# from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error
# from KMEANS import KMeansPlusPlus

# Assuming genre_matrix and user_movie_ratings are already defined

# Function to project test users into the SVD space
def project_test_users_to_svd_space(genre_matrix_test, V, s, num_singular_values):
    """
    Projects test set users into the SVD space defined by V and singular values,
    with an explicit loop for each user.

    Parameters:
    - genre_matrix_test: DataFrame with test users' genre ratings.
    - V: Matrix V from SVD (not V^T), rows are singular vectors.
    - s: Array of singular values from SVD.
    - num_singular_values: Number of singular values (and vectors) to use.

    Returns:
    - A numpy array with test users represented in the SVD-defined latent space.
    """


    # Ensure V is correctly oriented (it should have genres as rows if not transposed)
    if(V.shape[0] != genre_matrix_test.shape[1]):
        V = V.T  # Transpose if necessary

    # Initialize an empty matrix for the test users' SVD space representations
    test_users_svd_space = np.zeros((genre_matrix_test.shape[0], num_singular_values))

    # Loop over each user
    for user_index in range(genre_matrix_test.shape[0]):
        # Compute the representation for each singular value dimension
        for i in range(num_singular_values):
            user_representation = 0
            for genre_index in range(genre_matrix_test.shape[1]):
                user_representation += genre_matrix_test.iloc[user_index, genre_index] * V[genre_index, i]
            # Normalize by the ith singular value
            test_users_svd_space[user_index, i] = user_representation / s[i]

    return test_users_svd_space

# Modified function to perform SVD, KMeans, and evaluation
def svd_kmeans_evaluation(genre_matrix, n_singular_values, n_clusters, train_ids, test_ids):
    # Select train and test subsets
    genre_matrix_train = genre_matrix.loc[train_ids]
    genre_matrix_test = genre_matrix.loc[test_ids]

    # # Convert genre_matrix_test to DataFrame if test_ids has length 1
    # if isinstance(test_ids, int) or len(test_ids) == 1:
    #     if isinstance(genre_matrix_test, pd.Series):
    #         # Convert Series to DataFrame
    #         genre_matrix_test = genre_matrix_test.to_frame().T
    #     elif isinstance(test_ids, list) or isinstance(test_ids, np.ndarray):
    #         # Ensure test_ids is a scalar for direct indexing
    #         test_id = test_ids[0]
    #         genre_matrix_test = genre_matrix.loc[[test_id]]

    genre_matrix_train_array = genre_matrix.loc[train_ids].to_numpy()

    # Apply SVD on the training set
    U, s, Vt = reduced_SVD(genre_matrix_train_array , n_singular_values)

    # KMeans Clustering on the U matrix from SVD
    # kmeans = KMeans(n_clusters=n_clusters, init='k-means++',random_state=42)
    kmeans = KMeansPlusPlus(K=n_clusters, max_iter=300, train_ids = train_ids)

    # clusters = kmeans.fit_predict(U)
    # centroid_rep,clusters_adj,user_to_cluster = kmeans.predict(U)
    result = kmeans.predict(U)

    centroid_rep, clusters_adj, user_to_cluster = result

    # Calculate cluster averages in the user_movie_ratings matrix for training data
    # adjusted_train_ids = [index - 1 for index in train_ids]
    # train_cluster_avg = user_movie_ratings_filled.loc[adjusted_train_ids].groupby(clusters).mean()

    # Transform test set users into the SVD-defined space
    if isinstance(genre_matrix_test, pd.Series):
      genre_matrix_test = genre_matrix_test.to_frame().T

    test_users_svd_space = project_test_users_to_svd_space(genre_matrix_test, Vt.T, s, n_singular_values)
    # return  test_users_svd_space

    # Predict cluster membership for test set users
    test_to_cluster  = kmeans.cluster_prediction(test_users_svd_space)

    # Predict ratings for the test set based on cluster averages
    if not isinstance(test_ids, (list, tuple)):
      test_ids = [test_ids]
    predictions = get_predictions(user_movie_ratings, test_ids, test_to_cluster, clusters_adj)

    return  predictions



In [40]:
test_ids = [1,2,3]

# Create train_ids with all user IDs from 1 to 6040, excluding those in test_ids
train_ids = [i for i in range(1, 200)if i not in test_ids]

In [41]:
genre_matrix_train_array = genre_matrix.loc[train_ids].to_numpy()
U, s, Vt = full_SVD(genre_matrix_train_array)

# Count non-zero singular values
non_zero_singular_values = np.sum(s > 0.0001)  # Using a small threshold to account for computational precision

print(f"Number of non-zero singular values: {non_zero_singular_values}")

Number of non-zero singular values: 18


In [42]:
# predictions = svd_kmeans_evaluation(genre_matrix, user_movie_ratings, 5, 5, train_ids, test_ids)



In [43]:
# predictions


In [44]:
# for index in predictions.index:
#   print(index)

In [45]:
# max_value = predictions.max().max()
# min_value = predictions.min().min()

# # Find the column names for the maximum and minimum values
# max_value_column = predictions.max().idxmax()
# min_value_column = predictions.min().idxmin()

# print(f"Maximum value in the DataFrame: {max_value}, Column: {max_value_column}")
# print(f"Minimum value in the DataFrame: {min_value}, Column: {min_value_column}")

In [46]:
import numpy as np

def get_accuracy(predictions_df, user_movie_ratings, test_ids):
    adjusted_test_ids = [index - 1 for index in test_ids]
    user_movie_ratings_test = user_movie_ratings.loc[adjusted_test_ids]
    user_movie_ratings_test = user_movie_ratings_test.drop('UserID', axis=1)

    sum_abs_error = 0
    sum_squared_error = 0
    sum_percentage_error = 0
    sum_symmetric_percentage_error = 0
    sum_squared_log_error = 0
    count = 0

    for index in predictions_df.index:
        for col in predictions_df.columns:
            val_pred = predictions_df.at[index, col]
            val_ac = user_movie_ratings_test.at[index-1, col]
            if not np.isnan(val_ac):
                abs_error = abs(val_ac - val_pred)
                squared_error = (val_ac - val_pred) ** 2
                percentage_error = abs_error / val_ac if val_ac != 0 else 0
                symmetric_percentage_error = 200 * abs_error / (abs(val_ac) + abs(val_pred)) if (val_ac + val_pred) != 0 else 0
                squared_log_error = (np.log(val_pred + 1) - np.log(val_ac + 1)) ** 2

                sum_abs_error += abs_error
                sum_squared_error += squared_error
                sum_percentage_error += percentage_error
                sum_symmetric_percentage_error += symmetric_percentage_error
                sum_squared_log_error += squared_log_error
                count += 1

    mae = sum_abs_error / count if count else float('nan')
    rmse = np.sqrt(sum_squared_error / count) if count else float('nan')
    mape = (sum_percentage_error / count) * 100 if count else float('nan')
    smape = (sum_symmetric_percentage_error / count) if count else float('nan')
    msle = np.sqrt(sum_squared_log_error / count) if count else float('nan')

    return mae, rmse, mape, smape, msle


In [47]:
# mae , rmse = get_accuracy(predictions,user_movie_ratings,test_ids)
# mae

In [48]:
from sklearn.model_selection import train_test_split

# Assuming user_ids is your list of user IDs
user_ids = genre_matrix.index.values
train_ids, test_ids = train_test_split(user_ids, test_size=0.25, random_state=42)

# Sorting the train_ids and test_ids
train_ids = sorted(train_ids)
test_ids = sorted(test_ids)

# Now train_ids_sorted and test_ids_sorted are in sorted order
genre_matrix_train_array = genre_matrix.loc[train_ids].to_numpy()
U, s, Vt = full_SVD(genre_matrix_train_array)

# Count non-zero singular values
non_zero_singular_values = np.sum(s > 0.0001)  # Using a small threshold to account for computational precision

print(f"Number of non-zero singular values: {non_zero_singular_values}")

Number of non-zero singular values: 18


In [49]:
def find_optimal_k_svd(genre_matrix_train_array, n_singular_values, k_start=10, k_end=13):
    U, s, Vt = reduced_SVD(genre_matrix_train_array, k=n_singular_values)
    costs = []
    k_range = range(k_start, k_end + 1, 2)  # Skip every other k value
    for k in k_range:
        kmeans = KMeansPlusPlus(K=k, max_iter=300, train_ids=train_ids)
        kmeans.predict(U)
        costs.append(kmeans.inertia(U))

    plt.plot(k_range, costs, marker='o')
    plt.title(f'Elbow Method for {n_singular_values} Singular Values')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Cost')
    plt.show()




In [50]:
# # # Example loop to find optimal k for each singular value count, skipping values
# # optimal_ks = []
# # for n_singular_values in range(7, 17, 2):  # Skip every other singular value
#       #  find_optimal_k_svd(genre_matrix_train_array, n_singular_values, 8, 30)
# find_optimal_k_svd(genre_matrix_train_array, 7, 10, 13)

In [51]:
# # Example loop to find optimal k for each singular value count, skipping values
# optimal_ks = []
# # for n_singular_values in range(17, 18,2):  # Skip every other singular value
#     find_optimal_k_svd(genre_matrix_train_array, n_singular_values, 8, 30)

In [52]:
optimal_k_per_singular_value = {
    7: 18,  # Assuming for 1 singular value, the optimal k is 5
    9: 18,
    11: 14,
    13: 18,
    15: 22,
    17: 22,

}


In [53]:
test_ids = [i for i in range (1,31)]
train_ids = [i for i in range (30,431) ]

In [54]:
def calculate_composite_score(mae, rmse, mape, smape, msle):
    """
    Calculate a composite score from various evaluation metrics with implicit normalization through weights.

    :param mae: Mean Absolute Error
    :param rmse: Root Mean Square Error
    :param mape: Mean Absolute Percentage Error
    :param smape: Symmetric Mean Absolute Percentage Error
    :param msle: Mean Squared Logarithmic Error
    :return: A single composite score reflecting the overall prediction error, lower is better.
    """
    weights = {'mae': 0.25, 'rmse': 0.25, 'mape': 0.15, 'smape': 0.15, 'msle': 0.20}
    composite_score = (weights['mae'] * mae +
                       weights['rmse'] * rmse +
                       weights['mape'] * (mape / 100) +  # Scaling down MAPE and SMAPE by dividing by 100
                       weights['smape'] * (smape / 100) +  # to bring closer to other metrics' scale
                       weights['msle'] * msle)

    return composite_score


In [55]:

results = []

for n_singular_values in range(7,18,2):  # Assuming 18 non-zero singular values
    # Inner loop for different numbers of clusters
        n_clusters   = optimal_k_per_singular_value[n_singular_values]
        print(n_clusters)
        predictions = svd_kmeans_evaluation(genre_matrix, n_singular_values, n_clusters, train_ids, test_ids)
        mae, rmse, mape, smape, msle = get_accuracy(predictions,user_movie_ratings,test_ids)
        comp_score = calculate_composite_score(mae, rmse, mape, smape, msle)
        print(comp_score)
        results.append((n_singular_values, n_clusters,comp_score, mae, rmse , mape , smape , msle))

# Choose optimal number of singular values and clusters based on lowest RMSE
optimal_values = min(results, key=lambda x: x[2])
print(results)
print(f"Optimal number of singular values: {optimal_values[0]}, Optimal number of clusters: {optimal_values[1]}, Lowest MAE: {optimal_values[2]}, Lowest RMSE: {optimal_values[3]}")

18
0.5725577953838287
18


KeyboardInterrupt: 

In [None]:
predictions = svd_kmeans_evaluation(genre_matrix, 9, 18, train_ids, 3)
mae, rmse, mape, smape, msle  = get_accuracy(predictions,user_movie_ratings,test_ids)

In [None]:
print(mae, rmse, mape, smape, msle)


**RECCOMENDING TOP 5 MOVIES**

In [56]:
# Assuming df_temp is your initial DataFrame as shown in the image
movies_df = df_temp.copy()
movies_df.set_index('MovieID', inplace=True)
movies_df = movies_df[['Title']]  # Create a DataFrame with only the Title column

movies_df

Unnamed: 0_level_0,Title
MovieID,Unnamed: 1_level_1
1,Toy Story (1995)
2,Jumanji (1995)
3,Grumpier Old Men (1995)
4,Waiting to Exhale (1995)
5,Father of the Bride Part II (1995)
...,...
3879,Meet the Parents (2000)
3880,Requiem for a Dream (2000)
3881,Tigerland (2000)
3882,Two Family House (2000)


In [57]:
# Extract the column names (movie titles) from user_movie_ratings, excluding the 'UserID' column
movie_titles = user_movie_ratings.columns.tolist()[1:]  # Skip the first column which is UserID

# Use these titles to look up the MovieIDs from movies_df, which has titles as the index
movie_ids = movies_df.reset_index().set_index('Title').loc[movie_titles].values.flatten()

# Ensure movie_ids are in the correct integer format
movie_ids = movie_ids.astype(int)
movie_ids

array([1963, 3044,  770, ..., 1344, 1403, 2532])

In [103]:
df_temp

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3879,Meet the Parents (2000),Comedy
3879,3880,Requiem for a Dream (2000),Drama
3880,3881,Tigerland (2000),Drama
3881,3882,Two Family House (2000),Drama


In [121]:
# def recommend_top_5_movies(test_user_id, genre_matrix, n_singular_values, n_clusters):

#     predicted_ratings = svd_kmeans_evaluation(genre_matrix,  n_singular_values, n_clusters, train_ids, test_user_id)
#     # converting dataframe to series
#     predicted_ratings = predicted_ratings.iloc[0]

#     # Combine predicted ratings with movie titles
#     # movie_predictions = pd.Series(predicted_ratings, index=movie_ids)
#     movie_predictions = pd.Series()

#     for movie_id, rating in zip(movie_ids, predicted_ratings):
#         movie_predictions.at[movie_id] = rating

#     top_5_movies = movie_predictions.nlargest(5)

#     top_5_predicted_ratings = top_5_movies.values

#     # Return the titles of the top 5 recommended movies
#     top_5_movie_titles = movies_df.loc[top_5_movies.index, 'Title']

#     for title, rating in zip(top_5_movie_titles, top_5_predicted_ratings):
#         print(f"{title}: {rating}")

#     return top_5_movie_titles , top_5_predicted_ratings  #, movie_predictions

import pandas as pd

def recommend_top_5_movies(test_user_id, genre_matrix, n_singular_values, n_clusters, movies_df, genre_diversity_count=3):
    # Assuming `movies_df` has columns 'MovieID', 'Title', and 'Genre'
    # Assuming there's a function `svd_kmeans_evaluation` defined elsewhere
    predicted_ratings = svd_kmeans_evaluation(genre_matrix, n_singular_values, n_clusters, train_ids,test_user_id)
    predicted_ratings = predicted_ratings.iloc[0]  # Convert DataFrame to Series if needed

    # Initialize an empty Series for movie predictions
    movie_predictions = pd.Series(dtype='float')

    # Assuming `movie_ids` is defined elsewhere and corresponds to movies in `predicted_ratings`
    for movie_id, rating in zip(movie_ids, predicted_ratings):
        movie_predictions.at[movie_id] = rating

    return implement_genre_diversity(movie_predictions, movies_df, genre_diversity_count)

def create_genre_movie_mapping(movies_df):
    genre_movie_map = {}
    for index, row in movies_df.iterrows():
        genres = row['Genres'].split('|')  # Assuming genres are pipe-separated
        for genre in genres:
            if genre not in genre_movie_map:
                genre_movie_map[genre] = []
            genre_movie_map[genre].append(row['MovieID'])
    return genre_movie_map

def implement_genre_diversity(movie_predictions, movies_df, genre_diversity_count):
    genre_movie_map = create_genre_movie_mapping(movies_df)
    selected_genres = set()
    top_movies = pd.Series(dtype='float64')

    # First, select movies to fulfill the genre diversity requirement
    for genre, movies in genre_movie_map.items():
        highest_rated_movie = None
        highest_rating = 0
        for movie in movies:
            if movie in movie_predictions and movie_predictions[movie] > highest_rating:
                highest_rated_movie = movie
                highest_rating = movie_predictions[movie]
        if highest_rated_movie and genre not in selected_genres and len(selected_genres) < genre_diversity_count:
            top_movies.at[highest_rated_movie] = highest_rating
            selected_genres.add(genre)

    # Then, fill the remaining slots with the highest-rated movies overall
    remaining_movies = movie_predictions.drop(top_movies.index)
    top_movies = top_movies.append(remaining_movies.nlargest(5 - len(top_movies)))

    # Fetch movie titles, ratings, and genres
    top_5_details = movies_df.loc[top_movies.index, ['Title', 'Genres']]

    # Print movie titles, ratings, and genres
    for index in top_5_details.index:
        title = top_5_details.at[index, 'Title']
        genre = top_5_details.at[index, 'Genres']
        rating = top_movies.at[index]
        print(f"Title: {title}, Rating: {rating}, Genres: {genre}")

    # Adjust the return statement if necessary to include genres or modify as per requirement
    return top_5_details['Title'].values, top_movies.values

In [122]:
top_movies , ratings  = recommend_top_5_movies(4848, genre_matrix, 9, 18, df_temp)


Title: Hellraiser: Bloodline (1996), Rating: 5.0, Genres: Action|Horror|Sci-Fi
Title: Man Who Knew Too Little, The (1997), Rating: 5.0, Genres: Comedy|Mystery
Title: Race the Sun (1996), Rating: 5.0, Genres: Drama
Title: Jules and Jim (Jules et Jim) (1961), Rating: 5.0, Genres: Drama
Title: Apple Dumpling Gang Rides Again, The (1979), Rating: 5.0, Genres: Children's|Comedy|Western


  top_movies = top_movies.append(remaining_movies.nlargest(5 - len(top_movies)))


In [128]:
def recommend_top_5_movies_nodive(test_user_id, genre_matrix, n_singular_values, n_clusters):
    predicted_ratings = svd_kmeans_evaluation(genre_matrix, n_singular_values, n_clusters, train_ids, test_user_id)
    # converting dataframe to series
    predicted_ratings = predicted_ratings.iloc[0]

    # Initialize an empty Series for movie predictions
    movie_predictions = pd.Series(dtype='float64', index=movie_ids)

    for movie_id, rating in zip(movie_ids, predicted_ratings):
        movie_predictions.at[movie_id] = rating

    top_5_movies = movie_predictions.nlargest(5)
    top_5_predicted_ratings = top_5_movies.values

    # Fetch titles and genres for the top 5 recommended movies from df_temp
    top_5_movie_details = df_temp.loc[top_5_movies.index, ['Title', 'Genres']]

    # Print titles, ratings, and genres for the top 5 movies
    for idx in top_5_movie_details.index:
        title = top_5_movie_details.loc[idx, 'Title']
        genres = top_5_movie_details.loc[idx, 'Genres']
        rating = top_5_movies.at[idx]
        print(f"Title: {title}, Rating: {rating}, Genres: {genres}")

    return top_5_movie_details['Title'].values, top_5_predicted_ratings


In [129]:
top_movies , ratings  = recommend_top_5_movies_nodive(4848, genre_matrix, 9, 18)


Title: Jules and Jim (Jules et Jim) (1961), Rating: 5.0, Genres: Drama
Title: Apple Dumpling Gang Rides Again, The (1979), Rating: 5.0, Genres: Children's|Comedy|Western
Title: Brother's Kiss, A (1997), Rating: 5.0, Genres: Drama
Title: Man Who Knew Too Little, The (1997), Rating: 5.0, Genres: Comedy|Mystery
Title: Telling You (1998), Rating: 5.0, Genres: Comedy|Drama|Romance
