# Machine Learning Based Recommendation Systems
## Evaluating Recommendation Systems

In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.decomposition import TruncatedSVD

The MovieLens dataset was collected by the GroupLens Research Project at the University of Minnesota. You can download the dataset for this demostration at the following URL: https://grouplens.org/datasets/movielens/100k/

It was originally created by: [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

### Preparing the data

In [2]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
frame = pd.read_csv('dataset/ml-100k/u.data', sep='\t', names=columns)
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('dataset/ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie title']]
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
combined_movies_data = pd.merge(frame, movie_names, on='item_id')
combined_movies_data['timestamp'] = combined_movies_data.timestamp.astype(dtype = 'datetime64[s]')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,1997-12-04 15:55:49,Kolya (1996)
1,63,242,3,1997-10-01 23:06:30,Kolya (1996)
2,226,242,5,1998-01-04 04:37:51,Kolya (1996)
3,154,242,3,1997-11-10 05:03:55,Kolya (1996)
4,306,242,5,1997-10-10 17:16:33,Kolya (1996)


In [5]:
combined_movies_data.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [6]:
filter = combined_movies_data['item_id']==50
combined_movies_data[filter]['movie title'].unique()

array(['Star Wars (1977)'], dtype=object)

### Building a Utility Matrix

In [7]:
#pivot table with 0 instead of NaN
df = combined_movies_data.pivot_table(values='rating', index='user_id', columns='item_id', fill_value=0)
#pivot tables with NaN
df1 = combined_movies_data.pivot_table(values='rating', index='user_id', columns='item_id')
df2 = combined_movies_data.pivot_table(values='rating', index='user_id', columns='item_id')
df1.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [31]:
from sklearn.model_selection import train_test_split
# adding item average at end of table
df2 = df2.append(pd.DataFrame([df2.mean(axis = 0).values], index = ['item_average'], columns = df2.columns))
# adding user average at right side of table
df2['user_average'] = df2.mean(axis=1)

In [32]:
#filling NaN with user average and deleting user average and item average
rating_user_wise = df2
for i, b in zip(list(df2.index), list(df2['user_average'])):
    rating_user_wise.loc[i,:] = df2.loc[i,:].fillna(b)
rating_user_wise = rating_user_wise.drop(['user_average'], axis = 1)
rating_user_wise = rating_user_wise.drop(['item_average'])

### train test spliting

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(rating_user_wise, test_size=0.2, random_state=0)


In [34]:
from sklearn.decomposition import PCA
# dimesion reduction by PCA and making similarity matrix by pearson
pca = PCA(n_components=8)
resultant_matrix = pca.fit_transform(rating_user_wise)  
corr_mat = np.corrcoef(resultant_matrix)
corr_mat = pd.DataFrame(corr_mat)

### calculating evalution for diffrent clustering in kmeans 

In [35]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.cluster import KMeans
#iterating cluster counts in kmeans
for Clus_count in range(3,16,2):
    kmeans = KMeans(n_clusters = Clus_count).fit(X_train)
    X_train['cluster'] = kmeans.labels_
    X_test['cluster'] = kmeans.predict(X_test)
    grouped = X_train.groupby('cluster')
    tp = 0
    tn = 0
    fn = 0
    fp = 0
    #iterating user for every items in test part
    for index, row in X_test.iterrows():
        similars = pd.Series(grouped.get_group(row['cluster']).index)
        count = 1
        for i in row.drop('cluster'):
            if df.loc[index, count] == 0:
                count+=1
            else:
                #calculating prediction by pearson similarity
                similars_P = pd.Series(df1.loc[similars][count].dropna().index)
                similars_P_in_corr_mat = similars_P.apply(lambda x:x-1)
                predict_P_A = df2.loc[index, 'user_average'] + ((np.sum(corr_mat.loc[index-1, similars_P_in_corr_mat] * (df1.loc[similars_P, count] - df2.loc[similars_P, 'user_average']))) / np.sum(corr_mat.loc[index-1, similars_P_in_corr_mat]))
                actual_P_A = i
                #evaluating
                if actual_P_A >= 2.9999 and predict_P_A >= 2.9999:
                    tp+=1
                elif actual_P_A <= 2.9999 and predict_P_A >= 2.9999:
                    fp+=1
                elif actual_P_A >= 2.9999 and predict_P_A <= 2.9999:
                    fn+=1
                elif actual_P_A <= 2.9999 and predict_P_A <= 2.9999:
                    tn+=1
                count+=1
    accuracy = (tp + tn )/ (tn + tp + fp + fn)
    percision = tp / (fp + tp)
    recall = tp / (fn + tp)
    print("for",Clus_count ,"Cluster in Kmeans:")
    print("   accuracy is {}". format(accuracy))
    print("   percision is {}". format(percision))
    print("   recall is {}". format(recall))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 3 Cluster in Kmeans:
   accuracy is 0.7846979865771813
   percision is 0.8415424101686629
   recall is 0.9067026599947327


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 5 Cluster in Kmeans:
   accuracy is 0.7984889368591473
   percision is 0.8414297709008036
   recall is 0.9279100529100529


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 7 Cluster in Kmeans:
   accuracy is 0.7969275865812615
   percision is 0.8419524587189258
   recall is 0.9250398724082934


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 9 Cluster in Kmeans:
   accuracy is 0.7998902305159166
   percision is 0.8421148145907689
   recall is 0.9307347730159792


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 11 Cluster in Kmeans:
   accuracy is 0.7996333740695478
   percision is 0.8425047438330171
   recall is 0.9301256926611704


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 13 Cluster in Kmeans:
   accuracy is 0.7994035126477411
   percision is 0.8428667770216418
   recall is 0.9278531416616049


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


for 15 Cluster in Kmeans:
   accuracy is 0.7991452991452992
   percision is 0.8428571428571429
   recall is 0.9276855303287022
