In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
import gc

In [2]:
df = pd.read_csv("random_state158.csv", usecols=['userId_x', 'movieId', 'rating'])

In [3]:
gc.collect()

0

In [4]:
df_clean= df.drop_duplicates()
#df_clean[df_clean['userId_x']==17682] # should be only 6 rows

In [5]:
def split_train_test(df, test_size=0.2, random_state=6740):
    # base split 
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    #  create a set of missing movies not in training
    missing_movies = set(df['movieId']) - set(train_df['movieId'])
    if missing_movies: # if we have missing movies
        # Add one random row per missing movie
        movies_to_add = (df[df['movieId'].isin(missing_movies)] # create the filtered df
            .groupby('movieId', as_index=False, sort=False) # group by movie id
            .apply(lambda x: x.sample(1, random_state=random_state)) # grab one sample from each group
            .reset_index(drop=True))
        # concat with the training set, we might have duplicates on the test set
        train_df = pd.concat([train_df, movies_to_add], ignore_index=True)

    # create singular keys for each row
    train_df['key'] = (
        train_df['userId_x'].astype(str)
        + '_' + train_df['movieId'].astype(str)
        + '_' + train_df['rating'].astype(str)
    )
    test_df['key'] = (
        test_df['userId_x'].astype(str)
        + '_' + test_df['movieId'].astype(str)
        + '_' + test_df['rating'].astype(str)
    )

    # Remove overlap, which in this case (random_state = 6740) is actually all the extra rows added to training
    train_keys = set(train_df['key'])
    test_keys = set(test_df['key'])
    overlap_keys = train_keys.intersection(test_keys)
    print(len(overlap_keys))
    test_df_clean = test_df[~test_df['key'].isin(overlap_keys)].reset_index(drop=True)
    test_df.info()
    print(test_df.shape)
    # clean the extra columns, not useful anymore
    train_df = train_df.drop(columns='key')
    test_df_clean= test_df_clean.drop(columns='key')
    test_df_clean.info()
    print(test_df_clean.shape)
    return train_df, test_df_clean

train_df, test_df = split_train_test(df_clean)

958
<class 'pandas.core.frame.DataFrame'>
Index: 29836 entries, 66796721 to 73091064
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   userId_x  29836 non-null  int64  
 1   movieId   29836 non-null  int64  
 2   rating    29836 non-null  float64
 3   key       29836 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 1.1+ MB
(29836, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28878 entries, 0 to 28877
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   userId_x  28878 non-null  int64  
 1   movieId   28878 non-null  int64  
 2   rating    28878 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 677.0 KB
(28878, 3)


  .apply(lambda x: x.sample(1, random_state=random_state)) # grab one sample from each group


In [6]:
unique_movies_total = df['movieId'].nunique()
print(unique_movies_total)
unique_movies_train = train_df['movieId'].nunique()
print(unique_movies_train)
unique_movies_test = test_df['movieId'].nunique()
print(unique_movies_test)

12301
12301
5527


In [7]:
train_df.head()

Unnamed: 0,userId_x,movieId,rating
0,152317,593,5.0
1,110228,58154,2.5
2,98084,7937,5.0
3,154141,2194,3.0
4,142821,61705,3.0


In [8]:
movie_counts = train_df['movieId'].value_counts().rename('num_unique_ratings')
train_df = train_df.merge(movie_counts, on='movieId', how='left')
#train_df.head()


n = 50

top_n_movies = (train_df.groupby('movieId')['num_unique_ratings'].max().sort_values(ascending=False).head(n))
top_n_movies_idx = top_n_movies.index

train_df_top_rating_counts = train_df[train_df['movieId'].isin(top_n_movies_idx)].reset_index(drop=True)

train_df_top_rating_counts

In [9]:
def top_popular_recommendations(df, top_r_count=50, recommendations = 10):
    movie_stats = (df.groupby(['movieId']).agg(movie_mean=('rating', 'mean'),rating_count=('rating', 'count')).reset_index())
    n = top_r_count
    m = recommendations
    top_rating_count_movies = movie_stats.sort_values('rating_count', ascending=False).head(n)
    top_rated_movies = top_rating_count_movies.sort_values('movie_mean', ascending = False).head(m)
    #display(top_rated_movies)

    return top_rated_movies

top_rated_movies = top_popular_recommendations(train_df)    

In [10]:
users = test_df[['userId_x']].drop_duplicates()
user_recs = users.merge(top_rated_movies, how='cross')
display(test_df)

Unnamed: 0,userId_x,movieId,rating
0,152008,2028,4.0
1,55909,1259,5.0
2,49357,832,4.0
3,89326,69644,3.0
4,116385,242204,4.5
...,...,...,...
28873,24932,5065,2.5
28874,93902,783,4.0
28875,179292,7162,3.0
28876,186428,377,3.0


In [11]:
final_df = (pd.concat([test_df, user_recs], ignore_index=True).drop_duplicates(subset=['userId_x', 'movieId'], keep=False))
final_df = final_df.rename(columns={'movie_mean': 'pred_rating'})
# example of user output
final_df[final_df['userId_x']==186428]

Unnamed: 0,userId_x,movieId,rating,pred_rating,rating_count
28876,186428,377,3.0,,
264018,186428,318,,4.373611,360.0
264019,186428,858,,4.362869,237.0
264020,186428,50,,4.298932,281.0
264021,186428,296,,4.283046,348.0
264022,186428,2959,,4.242857,280.0
264023,186428,527,,4.238014,292.0
264024,186428,1196,,4.204545,264.0
264025,186428,1193,,4.189944,179.0
264026,186428,593,,4.185185,378.0
