In [2]:
import pandas as pd
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


<h1> Load Data </h1>

In [3]:
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')

In [4]:
original_movie_ids = set(movies_df["movieId"])
movie_id_map = {original : new for new, original in enumerate(original_movie_ids) }
movies_df["movieId"] = movies_df["movieId"].map(movie_id_map)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
original_movie_ids = set(ratings_df["movieId"])
movie_id_map = {original : new for new, original in enumerate(original_movie_ids) }
ratings_df["movieId"] = ratings_df["movieId"].map(movie_id_map)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


<h1> Build Raiding Matrix </h1>

In [6]:


# Get the unique user IDs and movie IDs
user_ids = ratings_df['userId'].unique()
movie_ids = ratings_df['movieId'].unique()

# Create a dictionary to map movie IDs to column indices
movie_id_map = {movie_id: i for i, movie_id in enumerate(movie_ids)}





In [7]:
len(movie_ids)

59047

In [8]:

# Initialize lists to store the row indices, column indices, and ratings
row_indices = []
col_indices = []
ratings = []

# Iterate over the ratings dataframe
for _, row in ratings_df.iterrows():
    row_indices.append(int(row['userId']))
    col_indices.append(int(row['movieId']))
    ratings.append(row['rating'])

In [9]:
# Create the sparse spatial matrix
sparse_matrix = csr_matrix((ratings, (row_indices, col_indices)))
sparse_matrix.shape

(162542, 59047)

In [10]:
coo_matrix = coo_matrix((ratings, (row_indices, col_indices)))

<h1> Define training function </h1>

In [27]:
def Average(lst): 
    return sum(lst) / len(lst) 

In [25]:
csr_matrix.count_nonzero(sparse_matrix)

25000095

In [94]:
def getColumnAverage(matrix, column_index):
    col = matrix.getcol(column_index)
    non_zero_column = col[col.nonzero()]
    return Average(np.squeeze(np.asarray(non_zero_column)))

3.893707794587238
3.2515271586594023
3.142028126058963
2.85354736424891
3.0584343520573674
3.854908898649748


KeyboardInterrupt: 

In [99]:
import pandas as pd
# Calculate the column-wise average of the sparse matrix
ratings = []
for i in range(1, np.shape(coo_matrix)[1]):
    ratings.append(getColumnAverage(coo_matrix, i))
ratings

    

TypeError: iteration over a 0-d array

In [98]:
ratings

[3.893707794587238,
 3.2515271586594023,
 3.142028126058963,
 2.85354736424891,
 3.0584343520573674,
 3.854908898649748,
 3.3636663369601054,
 3.1145833333333335,
 2.992050660199407,
 3.42145763311516,
 3.6571705198920315,
 2.6245656241646618,
 3.333527696793003,
 3.4234888364494465,
 2.7190217391304348,
 3.8237068028689416,
 3.948806325713417,
 3.3843505768515074,
 2.6421677802524126,
 2.869921875,
 3.5701620505454055,
 3.3200173216412256,
 3.145691333982473,
 3.18758389261745,
 3.677304434479322,
 3.606708513142409,
 3.4039315155358274,
 4.03,
 3.936724864539434,
 3.6367924528301887,
 3.244838567977158,
 3.9057678412037236,
 3.023076923076923,
 3.6031917599186163,
 3.4792899408284024,
 3.91868681658758,
 2.993333333333333,
 2.870839936608558,
 3.4187380861608845,
 3.6267252195734003,
 3.843413033286451,
 3.0495652173913044,
 3.4499566724436743,
 2.6546241414996707,
 3.382907947260191,
 3.2634129456559364,
 4.0791663372598626,
 2.9777505385929723,
 3.5197368421052633,
 4.2843532131633

In [12]:
first_row = sparse_matrix.getrow(0)
first_row.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [13]:
for i in range(first_row.shape[1]):
    # Check if the column is null
    if first_row[:, i].nnz == 0:
        # Replace the null column with a value from weights
        first_row[:, i] = avg_ratings_df[i].iloc[0]
first_row.toarray()

  self._set_arrayXarray(i, j, x)


array([[1.53806401e-05, 1.37284210e+00, 4.84662426e-01, ...,
        1.32581117e-03, 1.84567681e-05, 1.01512225e-04]])

In [21]:
from sklearn.base import BaseEstimator

class AverageRatingModel(BaseEstimator):
    avg_ratings_df = None  
        
    def fit(self, training_ratings_matrix: csc_matrix):
        # Calculate the column-wise average of the training_ratings_matrix
        averages = sparse_matrix.nonzero().mean(axis=0) #TODO stop including 0s in averages
        self.avg_ratings_df = pd.DataFrame(averages[0])
        
    def predict(self, user_ratings_matrix: csc_matrix):
        for i in range(first_row.shape[1]):
            # Check if the column is null
            if first_row[:, i].nnz == 0:
                # Replace the null column with a value from weights
                first_row[:, i] = self.avg_ratings_df[i].iloc[0]
        
        # Return the modified user_ratings_matrix
        return user_ratings_matrix


<h1> Run Training </h1>

In [22]:
from sklearn.model_selection import GroupShuffleSplit

# Define the groups for the split based on the value of the first column
groups = sparse_matrix[:, 0].toarray().flatten()

# Create an instance of GroupShuffleSplit
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2)

train_dataset,test_dataset = next(splitter.split(X=sparse_matrix[:,1:], groups=groups))



In [23]:
avg_rat_model = AverageRatingModel()

avg_rat_model.fit(train_dataset)

AttributeError: 'tuple' object has no attribute 'mean'

<h1> Evaluate </h1>

In [17]:
# predit the unseen movies agaisnt the actual ratings
groups = sparse_matrix[:, 0].toarray().flatten()

# Create an instance of GroupShuffleSplit
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2)

train_dataset,test_dataset = next(splitter.split(X=sparse_matrix[:,1:], groups=groups))


#Root mean-squared error (RMSE)
#Boxplot of true vs predicted ratings
#Pearson's Correlation Coefficient (R2)
#Fraction of user-movie pairs with non-zero predicted ratings
#Fraction of user-movie ratings with a predicted values (recall)
#RMSE is appropriate if we want to exactly the predict the ratings of the users.  R2 is useful if we don't care about the ranges of the ratings, just that the predicted ratings rank items in a manner consistent with the user-provided ratings.  The MovieLens ratings are in increments of 0.5.  This makes it easy to visualize the relationships between the user-provided and predicted ratings using a boxplot (user-provided ratings on the X axis, predicted ratings on the Y axis).


In [18]:
predictions = avg_rat_model.predict(test_dataset)

In [19]:
predictions

array([16955], dtype=int64)