In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
import numpy as np
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
import math
import time
# Wall-clock reference point; the last cell subtracts it from time.time() to report the total run time.
start_time = time.time()

### Read the dataset : Amazon Movies

In [None]:
# Load the raw ratings CSV into a DataFrame.
# NOTE(review): read_csv treats the first line of the file as the header by default, and the
# next cell overwrites the column names. Confirm the file really has a header row; if it
# does not, the first rating is consumed as column labels (header=None would be needed).
amazon_movies = pd.read_csv("ratings_Movies_and_TV.csv")

In [None]:
# Name the four columns; every later cell refers to the data by these labels.
amazon_movies.columns = ["User_Id", "Item_Id", "Rating", "TimeStamp"]

### Frequency plot for different types of ratings

In [None]:
# Bar chart of how many times each rating value occurs (most frequent first)
plt.rc("font", size=15)
rating_counts = amazon_movies["Rating"].value_counts()
rating_counts.plot.bar()
plt.show()

### Taking train parameters:

In [None]:
# Restore the training-subset sizes previously saved with IPython's %store magic.
# item_count caps how many of the most-rated items are kept and user_count how many of the
# most-active users are kept (both are used as slice bounds in the cells below).
# NOTE(review): these must have been stored beforehand by another session/notebook;
# otherwise the names are undefined when used below -- confirm where they are written.
%store -r item_count
%store -r user_count

In [None]:
# Per-item mean rating and rating count, ordered by count (descending);
# the first item_count rows are the most frequently rated items.
average_rating = (
    amazon_movies.groupby('Item_Id')['Rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Rating', 'count': 'ratingCount'})
    .sort_values('ratingCount', ascending=False)
    .reset_index()
)
frequent_rated_items = average_rating[:item_count]

In [None]:
# Subset the data to include only the most frequently rated items.
# The boolean Series returned by isin() is used directly as the row mask; converting it to a
# Python list first only adds a full extra pass (and one Python object per row) on a large frame.
amazon_movies_item_subset = amazon_movies[amazon_movies['Item_Id'].isin(frequent_rated_items['Item_Id'])]

In [None]:
# Count how many of the retained items each user rated, order users by that count
# (descending) and keep the user_count most active ones.
user_rated_most = (
    amazon_movies_item_subset.groupby('User_Id')['Item_Id']
    .count()
    .to_frame()
    .sort_values('Item_Id', ascending=False)
    .reset_index()
)
user_rated_most.columns = ["User_Id", "rated_items"]
top_user_rated_most = user_rated_most[:user_count]

In [None]:
# Subset the data to include only the users that most frequently rate items.
# The isin() mask is applied as a boolean Series; no intermediate Python list is built.
amazon_movies_user_subset = amazon_movies_item_subset[
    amazon_movies_item_subset['User_Id'].isin(top_user_rated_most['User_Id'])
]

### Test Dataset Sampling

In [None]:
#Do random sampling of subset of data to create a test set
import random
def create_testdataset(dataset,k):
    """Draw k random rows from ``dataset`` to serve as the test set.

    Rows are drawn independently with ``random.choice`` over the index labels, so the
    same row may be selected more than once.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Train set to sample from; must contain the columns "User_Id", "Item_Id"
        and "Rating".
    k : int
        Number of user-item combinations for which ratings will be predicted.

    Returns
    -------
    pandas.DataFrame
        k rows with the columns ["User_Id", "Item_Id", "Rating"] and a fresh
        0..k-1 integer index (empty, with the same columns, when k is 0).

    Raises
    ------
    IndexError
        If ``dataset`` is empty and k > 0 (raised by ``random.choice``).
    """
    columns = ["User_Id", "Item_Id", "Rating"]
    # Materialise the candidate labels once instead of on every draw.
    candidate_labels = dataset.index.values.tolist()
    picked_labels = [random.choice(candidate_labels) for _ in range(k)]
    # A single label-based selection replaces growing a frame row by row with
    # DataFrame.append, which is quadratic and no longer exists in pandas 2.x.
    return dataset.loc[picked_labels, columns].reset_index(drop=True)


### Taking test parameters:

In [None]:
# Restore the requested test-set size (test_rows) saved earlier with %store, then sample
# that many rows from the train subset.
%store -r test_rows
test_df = create_testdataset(amazon_movies_user_subset,test_rows)
# reset_index() gives the rows consecutive integer labels, which later cells use to look
# rows up (test_df.User_Id[i]); the previous index is kept as an extra "index" column.
test_df = test_df.reset_index()

In [None]:
# Report how many distinct items and users remain in the train set
n_unique_items = len(amazon_movies_user_subset["Item_Id"].unique())
n_unique_users = len(amazon_movies_user_subset["User_Id"].unique())
print("Unique Items:", n_unique_items)
print("Unique Users:", n_unique_users)

### Check sparsity of user_item matrix

In [None]:
# Pivot the train ratings into a user x item matrix (NaN where a user has not rated an item),
# and keep sparse copies of both the raw and the zero-filled values.
user_item_pivot = amazon_movies_user_subset.pivot(
    index="User_Id", columns="Item_Id", values="Rating"
)
userId, itemId = user_item_pivot.index, user_item_pivot.columns
user_item_pivot_filled_zeros = user_item_pivot.fillna(0)
user_item_matrix = csr_matrix(user_item_pivot.values)
user_item_matrix_filled_zeros = csr_matrix(user_item_pivot_filled_zeros.values)

In [None]:
# Remove the ratings of the test-set rows from the train matrix.
# .at writes straight into user_item_pivot. Chained indexing (.loc[user][item] = ...)
# assigns into an intermediate row object, and pandas does not guarantee that such a
# write propagates back to the frame.
# NOTE(review): user_item_matrix_filled_zeros was built in the previous cell, before this
# masking, so it still contains the test ratings -- confirm whether that is intended.
for user_id, item_id in zip(test_df.User_Id, test_df.Item_Id):
    user_item_pivot.at[user_id, item_id] = np.nan

In [None]:
# Sparsity = share of user-item cells without a rating (NaN), accumulated per item column
cell_count = np.prod(user_item_pivot.shape)
missing = pd.DataFrame(np.isnan(user_item_pivot).sum() / cell_count).reset_index()
missing.columns = ["Item_Id", "percent_missing"]
sparsity_percentage = missing.percent_missing.sum() * 100
print("Sparsity Percentage:", sparsity_percentage)

### Use Birch Clustering to create peer group for each user

In [None]:
# Grid-search setup for Birch: shuffled 3-fold CV over branching factor and cluster count
from sklearn.cluster import Birch
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import metrics

RAND_STATE = 50  # fixed seed so the fold split is reproducible
folds = 3
k_fold = KFold(folds, shuffle=True, random_state=RAND_STATE)
hyperparams = dict(
    branching_factor=[20, 50, 70, 100],
    n_clusters=[5, 10, 50, 70, 100, 200],
)

bir = Birch()  # default-parameter estimator; the grid search supplies the tuned parameters

def silhouette_score(estimator, X):
    """Scorer for the grid search: cluster X with the estimator and return its silhouette.

    The estimator is fitted on X via ``fit_predict`` and the resulting labels are
    scored with ``metrics.silhouette_score``.
    """
    labels = estimator.fit_predict(X)
    return metrics.silhouette_score(X, labels)

# Exhaustive search over the grid, scored per fold by the custom silhouette scorer
ensemble = GridSearchCV(bir, hyperparams, scoring=silhouette_score, cv=k_fold)
ensemble.fit(user_item_matrix_filled_zeros)
print(ensemble.best_params_)

In [None]:
# Fit Birch with the best grid-search parameters, then map every cluster id to the
# row positions of the users assigned to it.
import numpy as np
from sklearn.cluster import Birch
best_params = ensemble.best_params_
estimator = Birch(
    branching_factor=best_params["branching_factor"],
    n_clusters=best_params["n_clusters"],
)
estimator.fit(user_item_matrix_filled_zeros)
cluster_labels = estimator.labels_
neighbors = {
    cluster_id: np.where(cluster_labels == cluster_id)[0]
    for cluster_id in range(estimator.n_clusters)
}

In [None]:
def get_neighbors2(query_index,user_item_matrix_filled_zeros):
    """Return the peer group of a user: the other members of the user's Birch cluster.

    Parameters
    ----------
    query_index : int
        Positional row number of the target user (the positions stored in the
        module-level ``neighbors`` mapping).
    user_item_matrix_filled_zeros : pandas.DataFrame
        Frame whose ``index`` translates row positions into user ids; only its
        index is read.

    Returns
    -------
    list
        Index labels of all other users in the same cluster, or an empty list when
        the position belongs to no cluster (so callers can always iterate the result).
    """
    for members in neighbors.values():
        if query_index in members:
            return [
                user_item_matrix_filled_zeros.index[position]
                for position in members
                if position != query_index
            ]
    return []

### Normalize user-item Matrix: Subtracting the Mean

In [None]:
# Per-user mean rating over the item columns only. Excluding an already existing 'Mean'
# column keeps this cell idempotent: re-running it does not average the previous mean in.
user_item_pivot['Mean'] = user_item_pivot.drop(columns='Mean', errors='ignore').mean(axis=1)
user_item_pivot.head()

In [None]:
# Mean-centre the ratings in a separate frame: subtract each user's 'Mean' from every
# column of that user's row (row-wise alignment on the User_Id index).
user_item_pivot2 = user_item_pivot.sub(user_item_pivot['Mean'], axis=0)

In [None]:
# Put the per-user mean back (the subtraction zeroed that column) so it travels with the
# centred ratings.
user_item_pivot2.loc[:, 'Mean'] = user_item_pivot['Mean']
user_item_pivot2.head()

### Calculate similarity matrix using Pearson Similarity

In [None]:
# User-user similarity: dot product of the mean-centred rating rows divided by the product
# of their Euclidean norms (unrated cells count as 0); undefined 0/0 entries become 0.
user_item_normalized = user_item_pivot2.fillna(0).iloc[:, :-1]  # all columns but the last ('Mean')
row_norms = np.sqrt(np.sum(np.square(user_item_normalized), axis=1))
sim_denom_sqrt = np.array(row_norms)[np.newaxis]  # row vector of the norms
similarity_denom = sim_denom_sqrt.T * sim_denom_sqrt  # broadcast to the outer product
pearson_similarity = (
    user_item_normalized.dot(user_item_normalized.T) / similarity_denom
).fillna(0)

### Fill the predicted user-item matrix : Neighborhood-based

In [None]:
# Scaffold for the predictions: same user x item layout as the train pivot, holding 0
# for every user-item pair present in the train subset and NaN elsewhere.
user_item_df = amazon_movies_user_subset[["User_Id", "Item_Id"]].assign(Rating=0)
user_item_predicted = user_item_df.pivot(index="User_Id", columns="Item_Id", values="Rating")

In [None]:
# Predict every rating that is missing in the user-item matrix from the ratings of the
# users in the target user's peer set, weighted by the user-user similarity:
#   prediction = mean(target) + sum(sim * centred neighbour rating) / sum(|sim|)
# and fall back to the target user's mean when no neighbour rated the item.
for i in range(user_item_pivot2.shape[0]):
    target_user = user_item_pivot2.index[i]
    target_row = user_item_pivot2.loc[target_user]
    mean_target_user = target_row['Mean']
    # Items the target user has not rated; the helper 'Mean' column is not an item and
    # must never be written into the prediction matrix.
    target_ratings = target_row.drop('Mean')
    unrated_items = target_ratings[target_ratings.isna()].index
    # get_neighbors2 expects the positional row number; treat a missing peer group
    # (None) as "no neighbours" instead of failing in the loop below.
    nearest_neighbor = get_neighbors2(i, user_item_pivot2) or []
    target_similarity = pearson_similarity.loc[target_user]

    for j in unrated_items:
        sum_rating_nn = 0
        similarity_nn = 0
        for k in nearest_neighbor:
            neighbor_rating = user_item_pivot2.at[k, j]
            if not math.isnan(neighbor_rating):
                similarity = target_similarity.at[k]
                sum_rating_nn += similarity * neighbor_rating
                similarity_nn += abs(similarity)

        # .at writes directly into user_item_predicted; a chained .loc[user][item] = ...
        # assignment is not guaranteed to reach the frame.
        if similarity_nn != 0:
            user_item_predicted.at[target_user, j] = mean_target_user + (sum_rating_nn / similarity_nn)
        else:
            user_item_predicted.at[target_user, j] = mean_target_user
            

In [None]:
def print_ratings(test_df,user_item_predicted):
    """Print predicted vs. original rating for every test row and collect the predictions.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test set with the columns User_Id, Item_Id and Rating, labelled 0..n-1.
    user_item_predicted : pandas.DataFrame
        Predicted ratings, indexed by user id with one column per item id.

    Returns
    -------
    numpy.ndarray
        The predicted rating of each test row, in test-set order.
    """
    predictions = []
    for row_label in range(test_df.shape[0]):
        user = test_df.User_Id[row_label]
        print("User ID:", user)
        predicted = user_item_predicted.loc[user][test_df.Item_Id[row_label]]
        print("Predicted Ratings:", predicted)
        print("Original Ratings:", test_df.Rating[row_label])
        print("\n")
        predictions.append(predicted)

    return np.array(predictions)
    

### Accuracy Metrics

#### Root Mean Square Error

In [None]:
def calculate_rmse(predicted_rating,observed_rating):
    """Root mean square error between predicted and observed rating arrays."""
    residuals = predicted_rating - observed_rating
    mean_squared_error = np.sum(np.square(residuals)) / len(observed_rating)
    return np.sqrt(mean_squared_error)

#### Mean Absolute Error

In [None]:
def calculate_mae(predicted_rating,observed_rating):
    """Mean absolute error between predicted and observed rating arrays."""
    absolute_errors = np.abs(predicted_rating - observed_rating)
    return np.sum(absolute_errors) / len(observed_rating)

#### Observed and Predicted Ratings

In [None]:
# Print observed vs. predicted ratings for the test rows, then the two error metrics
user_item_predicted_array = print_ratings(test_df, user_item_predicted)
test_ratings_array = np.array(test_df.Rating)
rmse = calculate_rmse(user_item_predicted_array, test_ratings_array)
mae = calculate_mae(user_item_predicted_array, test_ratings_array)
print("Root Mean Square Error for Test Data:", rmse)
print("Mean Absolute Error for Test Data:", mae)

#### Item Coverage

In [None]:
def calculate_item_coverage(k,user_item_predicted):
    """Item coverage: fraction of all items that appear in at least one user's top-k list.

    Parameters
    ----------
    k : int
        Length of the recommendation list built for every user.
    user_item_predicted : pandas.DataFrame
        Predicted ratings, one row per user and one column per item.

    Returns
    -------
    float
        Number of distinct recommended items divided by the number of item columns.
    """
    recommended = []
    for target_user in user_item_predicted.index:
        recommended += top_k_items_recommended(k, target_user, user_item_predicted)

    distinct_items = np.unique(np.array(recommended))
    return len(distinct_items) / user_item_predicted.shape[1]

In [None]:
def top_k_items_recommended(k,userid,user_item_predicted):
    """Return the ids of the k items with the highest predicted rating for one user.

    The columns of ``user_item_predicted`` must be named "Item_Id"; the row of
    ``userid`` is sorted by predicted rating in descending order and the first k
    item ids are returned as a list.
    """
    user_scores = user_item_predicted.loc[userid].reset_index()
    ranked = user_scores.sort_values(by=userid, ascending=False)
    return ranked.iloc[:k]["Item_Id"].tolist()

#### Catalog Coverage

In [None]:
def calculate_catalog_coverage(k,user_item_predicted):
    """Catalog coverage: recommended user-item pairs over all possible user-item pairs.

    Every user receives k recommendations, so the numerator is ``n_users * k`` and the
    denominator is ``n_users * n_items`` (the shape of ``user_item_predicted``).
    """
    n_users, n_items = user_item_predicted.shape
    recommended_pairs = n_users * k
    possible_pairs = n_users * n_items
    return recommended_pairs / possible_pairs

In [None]:
# Item coverage of the top-5 recommendation lists
item_coverage = calculate_item_coverage(5, user_item_predicted)
print("Item Coverage:", item_coverage)

In [None]:
# Catalog coverage of the top-5 recommendation lists
catalog_coverage = calculate_catalog_coverage(5, user_item_predicted)
print("Catalog Coverage:", catalog_coverage)

In [None]:
# Seconds elapsed since start_time was taken in the first cell
elapsed_seconds = time.time() - start_time
print("The time taken to run with these parameters:", elapsed_seconds)