<a href="https://www.kaggle.com/code/nirmit27/netflix-recommendation-system?scriptVersionId=171562510" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os

import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

### Importing the datasets

In [2]:
titles = pd.read_csv('/kaggle/input/movie-titles/movie_titles.csv')

titles.head()

Unnamed: 0,MovieID,Title
0,1,Dinosaur Planet
1,2,Isle of Man TT 2004 Review
2,3,Character
3,4,Paula Abdul's Get Up & Dance
4,5,The Rise and Fall of ECW


In [3]:
files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if "combined_data" in os.path.join(dirname, filename):
            files.append(os.path.join(dirname, filename))            
files

['/kaggle/input/netflix-prize-data/combined_data_3.txt',
 '/kaggle/input/netflix-prize-data/combined_data_4.txt',
 '/kaggle/input/netflix-prize-data/combined_data_1.txt',
 '/kaggle/input/netflix-prize-data/combined_data_2.txt']

### Combining the **four** `Movie Rating` files

In [4]:
data = open('final_data.csv', 'w')

data.write(','.join(['movie_id', 'customer_id', 'rating', 'date']))
data.write('\n')

for i in files:
    with open(i) as f:
        
        for line in f:
            line = line.strip()
            
            if line.endswith(':'):
                movie_id = line.replace(':', '')
            else:
                row = [x for x in line.split(',')]
                row.insert(0, movie_id)
                
                data.write(','.join(row))
                data.write('\n')
                
data.close()

In [5]:
df_all = pd.read_csv('/kaggle/working/final_data.csv')

df_all.head()

Unnamed: 0,movie_id,customer_id,rating,date
0,9211,1277134,1,2003-12-02
1,9211,2435457,2,2005-06-01
2,9211,2338545,3,2001-02-17
3,9211,2218269,1,2002-12-27
4,9211,441153,4,2002-10-11


In [6]:
df_all.date = pd.to_datetime(df_all.date)
df_all.sort_values(by='date', inplace=True)

In [7]:
# df_all.head()
df_all.shape

(100480507, 4)

## Data Preprocessing

In [8]:
rating_count = df_all.groupby('customer_id')['rating'].count().sort_values(ascending=False)

rating_count.head()

customer_id
305344     17653
387418     17436
2439493    16565
1664010    15813
2118461    14831
Name: rating, dtype: int64

### Filtering **Customers**
We can observe that `75 percentile` of customers have only watched around **300** movies, so, we will remove those customers who have watched **too many** movies as `outliers`.

In [9]:
rating_count.describe()

count    480189.000000
mean        209.251997
std         302.339155
min           1.000000
25%          39.000000
50%          96.000000
75%         259.000000
max       17653.000000
Name: rating, dtype: float64

In [10]:
q1 = rating_count.quantile(0.25)
q3 = rating_count.quantile(0.75)

iqr = q3 - q1
upper_bound = q3 + iqr * 1.5

filtered_df = df_all[~df_all['customer_id'].isin([x for x in rating_count.index if rating_count[x] > upper_bound])]

filtered_df.shape

(60430880, 4)

In [11]:
remaining = filtered_df.groupby('customer_id')['rating'].count().sort_values(ascending=False)

remaining.head()

customer_id
2598861    589
1729808    589
2594647    589
1732566    589
853517     589
Name: rating, dtype: int64

In [12]:
filtered_df.to_csv('final_data.csv')

### **Sparse** Matrix
Since `75 percentile` of the customers have ONLY rated **~250 movies**, while the actual number of movies is way larger than that, we can create a **sparse** matrix for holding the reviews.

In [13]:
r = filtered_df.rating.values
c_id = filtered_df.customer_id.values
m_id = filtered_df.movie_id.values

user_movie_matrix = sparse.csr_matrix((r, (c_id, m_id)))
sparse.save_npz('/kaggle/working/user_movie.npz',user_movie_matrix)

In [14]:
um_df = sparse.load_npz('/kaggle/working/user_movie.npz')

um_df

<2649430x17771 sparse matrix of type '<class 'numpy.int64'>'
	with 60430880 stored elements in Compressed Sparse Row format>

## **Item-based** Collaborative Filtering

### Computing the `cosine_similarity` scores

In [15]:
movie_movie_score = cos_sim(X=um_df.T, dense_output=True)

movie_movie_score

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 2.27158160e-03, ...,
        5.94204676e-03, 1.39719439e-03, 4.81246303e-04],
       [0.00000000e+00, 2.27158160e-03, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 5.94204676e-03, 0.00000000e+00, ...,
        1.00000000e+00, 2.72904991e-03, 7.14390454e-02],
       [0.00000000e+00, 1.39719439e-03, 0.00000000e+00, ...,
        2.72904991e-03, 1.00000000e+00, 5.09659313e-03],
       [0.00000000e+00, 4.81246303e-04, 0.00000000e+00, ...,
        7.14390454e-02, 5.09659313e-03, 1.00000000e+00]])

### Recommendation function

In [16]:
def recommend(scores, k, index):
    return np.argsort(-movie_movie_score[index])[1:k+1]

In [17]:
index = titles[titles['Title'] == 'The Volcano Disaster']['MovieID'].values[0]
res = recommend(movie_movie_score, 5, index)

for i in res:
    print(titles[titles['MovieID'] == i]['Title'].values[0])

Earthquake: Nature Unleashed
Tornado
I Got Five on It
The Off Season
Killer Flood: The Day the Dam Broke
