In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

/kaggle/input/netflix-prize-data/combined_data_3.txt
/kaggle/input/netflix-prize-data/movie_titles.csv
/kaggle/input/netflix-prize-data/combined_data_4.txt
/kaggle/input/netflix-prize-data/combined_data_1.txt
/kaggle/input/netflix-prize-data/README
/kaggle/input/netflix-prize-data/probe.txt
/kaggle/input/netflix-prize-data/combined_data_2.txt
/kaggle/input/netflix-prize-data/qualifying.txt
/kaggle/input/updated-data-3/movie_titles_new.csv


## _Importing the datasets_

## Movie titles

In [2]:
# Location of the movie-title table on the Kaggle input mount.
path = '/kaggle/input/updated-data-3/movie_titles_new.csv'

# header=None: the first line of the file is read as data, so the three
# column names are supplied here.
df = pd.read_csv(
    path,
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
    encoding="ISO-8859-1",
)

df.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


## User Reviews
> #### from the `.txt` files

In [3]:
def read_combined_data(file_path):
    """Parse one ``combined_data_*.txt`` ratings file into a DataFrame.

    The parser recognises two kinds of non-empty lines:

    * a header line ending in ``:`` -- the text before the colon becomes the
      movie id for every rating line that follows, until the next header;
    * a rating line with at least three comma-separated fields, read as
      ``customer_id, rating, date`` (extra fields are ignored).

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per accepted rating line, in file order, with columns
        ``Movie_ID`` (str), ``Customer_ID`` (str), ``Rating`` (float) and
        ``Date`` (str, exactly as written in the file).

    Notes
    -----
    * Lines with fewer than three fields are skipped silently.
    * Lines whose rating cannot be converted to ``float`` are reported with
      ``print`` and skipped.
    * Rating lines that appear before any header get ``None`` as movie id.
    """
    movie_ids, cust_ids, ratings, dates = [], [], [], []
    current_movie_id = None

    with open(file_path, 'r') as f:
        # Iterate over the handle rather than calling readlines(): the parsed
        # columns already hold every value, so keeping a second full copy of
        # the raw file in memory is pure overhead on files of this size.
        for raw_line in f:
            line = raw_line.strip()

            if not line:
                continue

            if line.endswith(':'):
                current_movie_id = line[:-1].strip()
                continue

            data = line.split(',')
            if len(data) < 3:
                continue

            cust_id = data[0].strip()
            rating = data[1].strip()
            date = data[2].strip()

            try:
                rating = float(rating)
            except ValueError:
                print(f"\nSkipping invalid rating (not a valid float value) : '{rating}'\n")
                continue

            movie_ids.append(current_movie_id)
            cust_ids.append(cust_id)
            ratings.append(rating)
            dates.append(date)

    return pd.DataFrame({
        'Movie_ID' : movie_ids,
        'Customer_ID' : cust_ids,
        'Rating' : ratings,
        'Date' : dates
    })

In [4]:
# First of the four combined_data_*.txt rating files in the dataset.
file_path = '/kaggle/input/netflix-prize-data/combined_data_1.txt'

# No empty-DataFrame placeholder: if the load raises, `df1` must stay
# undefined so the failure surfaces here instead of as confusing errors on an
# empty frame in later cells.
df1 = read_combined_data(file_path)

df1.head()

Unnamed: 0,Movie_ID,Customer_ID,Rating,Date
0,1,1488844,3.0,2005-09-06
1,1,822109,5.0,2005-05-13
2,1,885013,4.0,2005-10-19
3,1,30878,4.0,2005-12-26
4,1,823519,3.0,2004-05-03


# __Data Cleaning__

## Movie Titles
- Dropping `Year` column

In [5]:
# Rebind instead of mutating in place, and tolerate an already-missing column:
# the in-place version raised KeyError when this cell was executed a second
# time, because 'Year' had been removed by the first run.
df = df.drop(columns=['Year'], errors='ignore')

df.head()

Unnamed: 0,Movie_Id,Name
0,1,Dinosaur Planet
1,2,Isle of Man TT 2004 Review
2,3,Character
3,4,Paula Abdul's Get Up & Dance
4,5,The Rise and Fall of ECW


- Dropping **duplicate titles**

In [6]:
# Keep only the first row seen for each distinct title (pandas' default
# keep='first'); later rows with the same Name are discarded together with
# their Movie_Id.
df = df.drop_duplicates(subset=['Name'])

df.head()

Unnamed: 0,Movie_Id,Name
0,1,Dinosaur Planet
1,2,Isle of Man TT 2004 Review
2,3,Character
3,4,Paula Abdul's Get Up & Dance
4,5,The Rise and Fall of ECW


- Checking for **missing values**

In [7]:
# Per-column count of missing entries in the titles table.
df.isnull().sum()

Movie_Id    0
Name        0
dtype: int64

## User Reviews

- Dropping the `Date` column

In [8]:
# Rebind instead of mutating in place, and tolerate an already-missing column:
# the in-place version raised KeyError when this cell was executed a second
# time, because 'Date' had been removed by the first run.
df1 = df1.drop(columns=['Date'], errors='ignore')

df1.head()

Unnamed: 0,Movie_ID,Customer_ID,Rating
0,1,1488844,3.0
1,1,822109,5.0
2,1,885013,4.0
3,1,30878,4.0
4,1,823519,3.0


- **Count** of unique movies, users and rating values

In [9]:
# Number of distinct values in each column (movies, customers, rating levels).
df1.nunique(axis=0)

Movie_ID         4499
Customer_ID    470758
Rating              5
dtype: int64

- Checking if any user has rated the **same movie more than once**

No such users found.

In [10]:
# Count the ratings per (movie, customer) pair, then tabulate how often each
# count occurs; a single bucket labelled 1 means no pair was rated twice.
(
    df1
    .groupby(['Movie_ID', 'Customer_ID'])['Rating']
    .count()
    .value_counts()
)

Rating
1    24053764
Name: count, dtype: int64

- **Minimum** and **Maximum** rating

In [11]:
# Extremes of the observed ratings.
min_rating = df1.loc[:, 'Rating'].min()
max_rating = df1.loc[:, 'Rating'].max()

print(f"Maximum Rating : {max_rating}\nMinimum Rating : {min_rating}")

Maximum Rating : 5.0
Minimum Rating : 1.0


# __Collaborative Filtering__

## Item-based Collaborative Filtering
### For making recommendations based on _item-item interactions_

In [12]:
# Non-null entries per customer in every remaining column, busiest raters first.
df1_ = (
    df1
    .groupby('Customer_ID')
    .count()
    .sort_values(by='Movie_ID', ascending=False)
)

df1_.head()

Unnamed: 0_level_0,Movie_ID,Rating
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
305344,4467,4467
387418,4422,4422
2439493,4195,4195
1664010,4019,4019
2118461,3769,3769


### User Criterion
- Keeping only the `users` who have rated *more than 100* movies

In [13]:
# Customer ids whose rating count is strictly greater than 100.
active_users = df1_.loc[df1_['Movie_ID'].gt(100)].index

print(f"No. of active users = {len(active_users)}")

No. of active users = 70270


### Keeping only the ratings of active users

In [14]:
# Keep only the rating rows written by an active user.
filtered_users = df1.loc[df1['Customer_ID'].isin(active_users)]

filtered_users

Unnamed: 0,Movie_ID,Customer_ID,Rating
0,1,1488844,3.0
3,1,30878,4.0
4,1,823519,3.0
5,1,893988,3.0
7,1,1248029,3.0
...,...,...,...
24053756,4499,2219917,3.0
24053757,4499,1796454,1.0
24053759,4499,2591364,2.0
24053762,4499,988963,3.0


### Movie Criterion
- Keeping only the `movies` which have *more than 100* ratings from active users

In [15]:
# Ratings per movie, counted among the active users only.
df1__ = filtered_users.groupby('Movie_ID').count()

# Movie ids with strictly more than 100 such ratings.
top_movies_id = df1__.loc[df1__['Rating'].gt(100)].index

print(f"No. of top movies = {len(top_movies_id)}")

No. of top movies = 3455


### Keeping only the ratings of top movies

In [16]:
# Keep only the rows (already restricted to active users) about a top movie.
filtered_movies = filtered_users.loc[filtered_users['Movie_ID'].isin(top_movies_id)]

filtered_movies

Unnamed: 0,Movie_ID,Customer_ID,Rating
0,1,1488844,3.0
3,1,30878,4.0
4,1,823519,3.0
5,1,893988,3.0
7,1,1248029,3.0
...,...,...,...
24053756,4499,2219917,3.0
24053757,4499,1796454,1.0
24053759,4499,2591364,2.0
24053762,4499,988963,3.0


### User - Movie matrix
Pivoting the `filtered_movies` dataframe

In [17]:
# One row per movie, one column per customer; cells hold the rating and pairs
# without a rating are filled with 0.0.
user_movie_interact = (
    filtered_movies
    .pivot_table(values='Rating', index=['Movie_ID'], columns='Customer_ID')
    .fillna(0.0)
)

user_movie_interact

Customer_ID,1000033,1000062,1000079,1000084,1000095,1000192,1000301,1000328,1000380,1000387,...,999598,999601,999663,999693,999756,999768,999836,999901,99993,999944
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,3.0,2.0,1.0,2.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Finding the `Cosine Similarity` scores
The **cosine similarity** scores of `3455` movies **against** `3455` movies

In [18]:
# Pairwise cosine similarity between the rows (movies) of the rating matrix.
cs_scores = cosine_similarity(X=user_movie_interact)

cs_scores.shape

(3455, 3455)

### Converting `cs_scores` into a `DataFrame`

In [19]:
# Label both axes of the similarity matrix with the movie ids of the pivot.
cs_df = pd.DataFrame(
    data=cs_scores,
    index=user_movie_interact.index,
    columns=user_movie_interact.index,
)

cs_df

Movie_ID,1,10,1000,1001,1004,1005,1006,1008,101,1011,...,989,990,991,992,993,994,996,997,998,999
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.024121,0.032351,0.023647,0.031895,0.037316,0.022510,0.063522,0.024290,0.044160,...,0.022131,0.044583,0.026667,0.050610,0.055554,0.046915,0.034023,0.056882,0.018371,0.027939
10,0.024121,1.000000,0.026594,0.026056,0.022484,0.043160,0.027964,0.041542,0.027663,0.018241,...,0.035222,0.024558,0.026681,0.020163,0.017164,0.023804,0.041277,0.045653,0.019785,0.030996
1000,0.032351,0.026594,1.000000,0.023731,0.042835,0.027059,0.010408,0.023330,0.028194,0.038563,...,0.015110,0.015691,0.023399,0.023584,0.036557,0.059851,0.024653,0.040100,0.016513,0.019418
1001,0.023647,0.026056,0.023731,1.000000,0.052092,0.037686,0.071612,0.028791,0.027692,0.085412,...,0.114364,0.098505,0.011330,0.055853,0.065251,0.065415,0.080905,0.023978,0.035790,0.082351
1004,0.031895,0.022484,0.042835,0.052092,1.000000,0.020305,0.010660,0.017631,0.023693,0.068839,...,0.075024,0.024809,0.024756,0.034760,0.040787,0.059265,0.015814,0.035792,0.012497,0.042057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,0.046915,0.023804,0.059851,0.065415,0.059265,0.025242,0.020079,0.025014,0.024499,0.210286,...,0.110954,0.056470,0.030145,0.133327,0.132324,1.000000,0.096299,0.024175,0.028260,0.059319
996,0.034023,0.041277,0.024653,0.080905,0.015814,0.038480,0.054434,0.044912,0.049933,0.063128,...,0.091207,0.129480,0.053533,0.089677,0.080774,0.096299,1.000000,0.014820,0.025424,0.026938
997,0.056882,0.045653,0.040100,0.023978,0.035792,0.026543,0.031835,0.052397,0.040818,0.027152,...,0.016042,0.054047,0.031410,0.062287,0.055035,0.024175,0.014820,1.000000,0.004235,0.029890
998,0.018371,0.019785,0.016513,0.035790,0.012497,0.009530,0.012414,0.028626,0.015260,0.034057,...,0.045438,0.017465,0.015417,0.009462,0.021713,0.028260,0.025424,0.004235,1.000000,0.042100


### Scaling the ratings
For example, if a user has rated a movie `3`, then the similarity scores of the other movies are multiplied, i.e. `scaled`, by `3` as well

In [20]:
# Movie ids are string labels, so '1' picks the column by label. The first row
# is skipped by position before the scores are multiplied by the example
# rating of 3.
sims = cs_df['1'].iloc[1:] * 3
sims

Movie_ID
10      0.072363
1000    0.097052
1001    0.070942
1004    0.095684
1005    0.111947
          ...   
994     0.140745
996     0.102068
997     0.170645
998     0.055113
999     0.083817
Name: 1, Length: 3454, dtype: float64

### Recommender function
This function takes a movie **title** and the **rating** given to it, and returns the **top 5** movie recommendations sorted in **descending** order by `similarity score`, where each score is `scaled` by `(rating - 2.5)`

In [127]:
def recommend(movie_title, movie_rating):
    """Return up to five recommendations for a movie the user has rated.

    Reads two module-level frames: ``df`` (columns ``Movie_Id`` and ``Name``)
    to translate between titles and ids, and ``cs_df`` (the movie-by-movie
    cosine-similarity table whose labels are movie ids as strings).

    Parameters
    ----------
    movie_title : str
        Title to look up; it must match a value of ``df['Name']`` exactly.
    movie_rating : float
        Rating the user gave that movie. Every similarity score is multiplied
        by ``movie_rating - 2.5``, so ratings below 2.5 flip the ordering.
        NOTE(review): the observed ratings run from 1 to 5, whose midpoint is
        3 -- confirm that 2.5 is the intended offset.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` and ``Sim_Score``, highest ``Sim_Score`` first. Fewer
        than five rows come back when a candidate id has no row in ``df``.

    Raises
    ------
    IndexError
        If no row of ``df`` carries ``movie_title``.
    KeyError
        If the movie has no column in ``cs_df``.
    """
    matches = df.loc[df['Name'] == movie_title, 'Movie_Id']
    if matches.empty:
        raise IndexError(f"No movie titled {movie_title!r} in the titles table")
    movie_id = str(matches.values[0])

    if movie_id not in cs_df.columns:
        raise KeyError(f"Movie id {movie_id} ({movie_title!r}) has no similarity scores")

    res = cs_df[movie_id] * (movie_rating - 2.5)

    # Remove the queried movie by label. Slicing off the first row after the
    # sort only worked for ratings above 2.5; below that the movie's own score
    # is the lowest, so the slice discarded the best candidate instead.
    res = res.drop(labels=movie_id, errors='ignore')
    res.index = res.index.astype(int)

    res = res.rename('Sim_Score').rename_axis('Movie_Id').reset_index()
    res = res.sort_values(by='Sim_Score', ascending=False).head(5)

    # Inner join on the id attaches the titles and keeps the ranking order.
    res = res.merge(df, on='Movie_Id')

    return res[['Name', 'Sim_Score']]

In [129]:
# Split on the LAST comma only, so titles that themselves contain a comma are
# kept whole; surrounding whitespace is trimmed from both parts.
title_, rating_ = (
    part.strip()
    for part in input("Enter the movie title and rating : ").rsplit(',', 1)
)

recommend(title_, float(rating_))

Enter the movie title and rating :  10 Things I Hate About You, 5


Unnamed: 0,Name,Sim_Score
0,What Women Want,1.531093
1,50 First Dates,1.530237
2,Pirates of the Caribbean: The Curse of the Bla...,1.522726
3,Ever After: A Cinderella Story,1.522016
4,Mean Girls,1.521931


# __Metrics__
### Checking the **accuracy** of the `cosine similarity` scores

# __Output__

### Taking the similarity scores and movie titles as output

In [130]:
# Write the similarity table and the cleaned titles table to the working
# directory; both files include the frame's index as their first column.
cs_df.to_csv(path_or_buf='item_based_sim_scores.csv')
df.to_csv(path_or_buf='movie_titles.csv')