# Collaborati Filtering Use KNN

**Libraries**

In [1]:
import pandas as pd
import numpy as np

## Read Datasets

In [26]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [27]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [28]:
# drop genres cols
movies.drop('genres', axis =1, inplace=True)

In [29]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [30]:
#drop timestamp col
ratings.drop('timestamp', axis =1 , inplace =True)

In [31]:
ratings.shape, movies.shape

((100836, 3), (9742, 2))

In [32]:
# join or merge all datasets this is for add title col
df = pd.merge(ratings, movies, on='movieId')

In [33]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [34]:
df.shape

(100836, 4)

In [35]:
#check missing values
df.isna().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

In [37]:
#create col total Retings count
ratings_counts= df.groupby('title', as_index=False)['rating'].count()
ratings_counts.columns = ['title', 'ratings_counts']
ratings_counts.head()

Unnamed: 0,title,ratings_counts
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [38]:
# Merge df and rating_counts to add ratin_count col

df_rating_counts = pd.merge(df, ratings_counts, left_on='title', right_on='title', how='left')
df_rating_counts.head()

Unnamed: 0,userId,movieId,rating,title,ratings_counts
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [39]:
df_rating_counts.shape

(100836, 5)

In [40]:
#filter data base of popularity thres of ratings_counts col
popularity_thres = 50
popularity_ratings = df_rating_counts.query('ratings_counts >=@popularity_thres')
popularity_ratings.head()

Unnamed: 0,userId,movieId,rating,title,ratings_counts
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215


In [41]:
popularity_ratings.shape

(41362, 5)

In [42]:
# create a pivot matrix 
movies_features = pd.pivot_table(popularity_ratings, index='title', columns='userId', values = 'rating').fillna(0)
movies_features

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0
You've Got Mail (1998),0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Young Frankenstein (1974),5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Zombieland (2009),0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


## Collaborative Filtering

In [43]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [44]:
X = csr_matrix(movies_features.values)

### Model

In [46]:
knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn.fit(X)

NearestNeighbors(algorithm='brute', metric='cosine')

#### Precitions

In [50]:
# take data to predict or recomendations Base on userid 
ex = np.random.choice(movies_features.shape[0])
ex

413

In [54]:
#recomendation-predict
distances, indices = knn.kneighbors(movies_features.iloc[ex,:].values.reshape(1,-1),n_neighbors=6)

In [55]:
distances

array([[4.44089210e-16, 4.05856894e-01, 4.43625811e-01, 4.50832326e-01,
        4.59440140e-01, 4.63260328e-01]])

In [56]:
indices

array([[413, 326, 154, 101, 318, 424]], dtype=int64)

In [60]:
#loop data and predictions to view the title

for i in range(0, len(distances.flatten())):
    if i ==0:
        print('Recomendations for {0}:\n'.format(movies_features.index[ex]))
    else:
        print('{0}: {1}, with distances of {2}:'.format(i, movies_features.index[indices.flatten()[i]], distances.flatten()[i]))

Recomendations for Trainspotting (1996):

1: Reservoir Dogs (1992), with distances of 0.405856894472298:
2: Fight Club (1999), with distances of 0.4436258105824876:
3: Clockwork Orange, A (1971), with distances of 0.4508323261772147:
4: Pulp Fiction (1994), with distances of 0.4594401404544901:
5: Usual Suspects, The (1995), with distances of 0.46326032804497674:


In [61]:
distances.flatten()

array([4.44089210e-16, 4.05856894e-01, 4.43625811e-01, 4.50832326e-01,
       4.59440140e-01, 4.63260328e-01])

In [62]:
indices.flatten()

array([413, 326, 154, 101, 318, 424], dtype=int64)