<a href="https://colab.research.google.com/github/jphall663/GWU_ML/blob/main/notebook/lecture_11/Lecture_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# License 
***
Copyright (C) 2017-2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

***
# Kaggle Movie Lens and Basic Collaborative Filtering

1. General imports and inits

In [None]:
# basic packages for recommendation
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

# to upload local files
import io
from google.colab import files  

SEED = 12345 # fixed random seed for better reproducibility; presumably meant for the random_state arguments left blank in later cells -- confirm

2. Import train data

In [None]:
# Google Colab-specific helper: upload files from the local computer into this session
# REQUIRES STUDENT INPUT
# upload both files: u.data and u.item
uploaded = files.upload() 

In [None]:
# 3
uploaded.keys() # what is stored in that Python object? these keys are the names used to look up file contents in the load cells below

4. Load ratings data

In [None]:
# column names to apply to the tab-separated ratings file (read_csv is given no header row)
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# the uploaded content is decoded as latin-1 and wrapped in a text buffer so pandas can parse it
ratings = pd.read_csv(io.StringIO(uploaded['u.data'].decode('latin-1')), sep='\t',
                      names=r_cols) # name in quotes here must match name in 3 above

5. Load items data

In [None]:
# column names for the pipe-separated item file: descriptive fields first, then genre names
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# decode the uploaded content as latin-1 and parse it from an in-memory text buffer
movies = pd.read_csv(
    io.StringIO(uploaded['u.item'].decode('latin-1')),  # name in quotes here must match name in 3 above
    sep='|',
    names=i_cols,
)
movies.index = movies.index + 1 # shift the default 0-based row index to start at 1; necessary for later join

6. View ratings data 
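* Sparse! Most user-movie pairs have no rating, so the data is stored here in COO (coordinate) format: one row per rating that was actually given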
* Read as: user 196 rated movie 242 with a 3
* As a sparse matrix, rows would be all users, columns would be all movies, and data would be movie ratings


In [None]:
ratings # one record per row, with columns user_id, movie_id, rating, timestamp

7. View movies data

In [None]:
movies.head() # preview only the first few rows of the movies table

8. Summarize loaded data

In [None]:
# report the number of rows in each loaded table
print('Total movies:', movies.shape[0])
print('Total ratings:', ratings.shape[0])

9. Expand COO matrix to large sparse matrix
* This is dumb and done just to use ... scikit-learn!
* For actual big data, this would be extremely inefficient and probably fail

In [None]:
# reshape the long ratings table into a wide matrix: one row per user_id, one column per movie_id;
# user/movie pairs with no rating come out as NaN and are imputed with 0
df_ratings = (
    ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)
df_ratings # NOTE: movies and users are indexed from 1, not 0!

10. Factorize ratings into:
* W matrix with all users
* H matrix with all movies

In [None]:
# Factorize the ratings matrix with non-negative matrix factorization.
# The NMF call is intentionally incomplete: n_components and random_state must be filled in before this cell will run.
# NOTE(review): SEED from the imports cell is presumably the intended random_state -- confirm
mf_model = NMF(n_components=, init='random', random_state=, max_iter=1000) # REQUIRES STUDENT INPUT
W = mf_model.fit_transform(df_ratings) # fit the model and return the transformed rows of df_ratings
H = mf_model.components_ # the fitted components, taken from the model after fitting
print(W.shape) # W has all users
print(H.shape) # H has all movies

11. Fit many small clusters in W

In [None]:
# The call is intentionally incomplete: n_clusters, random_state, and the data passed to fit() must be filled in.
# NOTE(review): a later cell calls kmeans_model.predict(W), so fit() presumably should be given W -- confirm
kmeans_model = KMeans(n_clusters=, random_state=).fit() # REQUIRES STUDENT INPUT: make many small clusters of users

12. Add cluster labels

In [None]:
# name each column of W, then tag every row with its predicted cluster and order rows by cluster
cols = ['archetype_' + str(k) for k in range(W.shape[1])]
W_df = (
    pd.DataFrame(W, columns=cols)
    .assign(cluster=kmeans_model.predict(W))
    .sort_values(by='cluster')
)
W_df # scroll to right to see cluster labels

13. Extract users from first cluster

In [None]:
# rows of W_df labelled as cluster 0; the row index of W corresponds to user ids - 1,
# so add 1 to turn row positions back into user ids
in_cluster_0 = W_df['cluster'] == 0
cluster_0_user_ids = sorted(W_df.loc[in_cluster_0].index + 1)
cluster_0_user_ids # see users ids in cluster 0

14. Extract movie IDs for cluster 0, sorted by total rating (the sum of the ratings given by cluster 0 users)

In [None]:
# rows of the ratings matrix that belong to cluster 0 users
cluster_0_ratings = df_ratings.loc[cluster_0_user_ids]
cluster_0_movie_ids = (
    cluster_0_ratings
    .sum(axis=0)                                        # total the cluster's ratings for each movie
    .to_frame(name='total_ratings')
    .sort_values(by='total_ratings', ascending=False)   # highest totals first
    .loc[lambda totals: totals['total_ratings'] > 0]    # drop movies with 0 ratings (unwatched movies)
)
cluster_0_movie_ids

15. Join name information and see most popular movie titles in cluster 0

In [None]:
# attach title information to the cluster's ranked movies by joining on movie_id (original note: action movies!)
cluster_0_movie_ids.join(
    movies[['movie_id', 'title']],
    on='movie_id',
    how='left',
    lsuffix='_L',
    rsuffix='_R',
)

16. Determine which movies user 5 has seen
* User 5 is in cluster 0

In [None]:
# ratings row for user 5 across all movies
user_5_ratings = df_ratings.loc[5]
# keep only movie ids with a rating above 0, i.e. drop unrated, unwatched movies
user_5_movie_ids = list(user_5_ratings[user_5_ratings > 0].index)
print(user_5_movie_ids) # movies user 5 has watched and rated
print(len(user_5_movie_ids))

17. See recommended movie titles for user 5

In [None]:
# remove the movies user 5 has already seen from the cluster 0 ranking
recs = cluster_0_movie_ids.drop(index=user_5_movie_ids)
# join to title information and show the best recommended titles
(
    recs
    .join(movies[['movie_id', 'title']], on='movie_id', how='left', lsuffix='_L', rsuffix='_R')
    .head()
)