<a href="https://colab.research.google.com/github/pccastros/movie_recomendation_system/blob/master/movie_recomendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import movie dataset**

In [1]:
!wget -O moviedataset.zip https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
!unzip -o -j moviedataset.zip 

--2021-08-14 23:02:31--  https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/moviedataset.zip
Resolving s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)... 67.228.254.196
Connecting to s3-api.us-geo.objectstorage.softlayer.net (s3-api.us-geo.objectstorage.softlayer.net)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 160301210 (153M) [application/zip]
Saving to: ‘moviedataset.zip’


2021-08-14 23:02:36 (31.9 MB/s) - ‘moviedataset.zip’ saved [160301210/160301210]

Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: tags.csv                


**Import Libraries**

In [2]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

**Import and clean data**

In [3]:
# Read movies csv
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
## Delete year in column title
movies['title'] = movies['title'].map(lambda x: x[:-7])

## Drop genres column 
movies.drop('genres', 1, inplace=True)

movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story
1,2,Jumanji
2,3,Grumpier Old Men
3,4,Waiting to Exhale
4,5,Father of the Bride Part II


In [5]:
## Import ratings csv
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [6]:
## Drop timestamp column
ratings.drop('timestamp', 1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


**User Input**

In [7]:
## User input
userRating = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userRating)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [8]:
#Filtering out the input movies by title
inputIdMovie = movies[movies['title'].isin(inputMovies['title'].tolist())]

## Merge input movies and movies dataset
inputMovies = pd.merge(inputIdMovie, inputMovies)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


Tomamos del raing el numero de personas que calificaron las mismas pelis

**Trainning Dataset**

In [9]:
## Get rating from each in movie dataset
moviesData = ratings[ratings['movieId'].isin(inputMovies['movieId'].tolist())]
moviesData.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


grupar por userId

In [10]:
## Group movies data by user Id
usersData = moviesData.groupby(['userId'])

## Sort by user Id
usersData = sorted(usersData,  key=lambda x: len(x[1]), reverse=True)

## Print 3 users ratings for example
usersData[:3]


[(75,       userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0), (106,       userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5), (686,        userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

**Pearson Correlation**

In [11]:
## Take first 100 users
usersData = usersData[0:100]

In [12]:
pearsonCorr = {}

#For every user group in our subset
for idUser, moviesRatings in usersData:

    ## user input ratings
    inRating = inputMovies[inputMovies['movieId'].isin(moviesRatings['movieId'].tolist())]
    inRatingList = inRating['rating'].tolist()

    ## users inputs
    moviesRatings = moviesRatings.sort_values(by='movieId')
    ratingList = moviesRatings['rating'].tolist()

    ## calculate correlation 
    corr = np.corrcoef(inRatingList, ratingList)[0, 1]
    pearsonCorr[idUser] = corr

pearsonCorr


{75: 0.8272781516947569,
 106: 0.5860090386731193,
 686: 0.8320502943378437,
 815: 0.5765566601970551,
 1040: 0.9434563530497266,
 1130: 0.28915746598312014,
 1502: 0.8770580193070293,
 1599: 0.43852900965351466,
 1625: 0.7161148740394329,
 1950: 0.17902871850985821,
 2065: 0.43852900965351466,
 2128: 0.5860090386731193,
 2432: 0.1386750490563073,
 2791: 0.8770580193070293,
 2839: 0.8204126541423671,
 2948: -0.11720180773462387,
 3025: 0.45124262819714017,
 3040: 0.895143592549291,
 3186: 0.6784622064861936,
 3271: 0.2698959481797065,
 3429: 0.0,
 3734: -0.1504142093990467,
 4099: 0.05860090386731193,
 4208: 0.29417420270727607,
 4282: -0.43852900965351466,
 4292: 0.6564386345361466,
 4415: -0.11183835382312352,
 4586: -0.9024852563942803,
 4725: -0.08006407690254358,
 4818: 0.4885967564883422,
 5104: 0.7674257668936506,
 5165: -0.43852900965351466,
 5547: 0.17200522903844537,
 6082: -0.047287799241095906,
 6207: 0.9615384615384617,
 6366: 0.657793514480272,
 6482: 0.0,
 6530: -0.35160

In [13]:
## Convert pearson correlation dict into Dataframe
corrDF = pd.DataFrame.from_dict(pearsonCorr, orient='index')
corrDF.columns = ['similarityIndex']
corrDF['userId'] = corrDF.index
corrDF.index = range(len(corrDF))
corrDF.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [14]:
## Get the first 50 best similarity index
bestSimil = corrDF.sort_values(by='similarityIndex', ascending=False)[0:50]
bestSimil.head()

Unnamed: 0,similarityIndex,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040


In [15]:
## Get best similar users ratings from movies
bestSimilRatings = bestSimil.merge(ratings, left_on='userId', right_on='userId', how='inner')
bestSimilRatings.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


In [16]:
#Multiplies the similarity by the user's ratings
bestSimilRatings['weightedRating'] = bestSimilRatings['similarityIndex']*bestSimilRatings['rating']
bestSimilRatings.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196


In [17]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = bestSimilRatings.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


In [18]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.0,4
5,2.316058,5


In [19]:
## Get first 10 movies
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
26801,5.0,26801
3539,5.0,3539
1902,5.0,1902
6660,5.0,6660
6668,5.0,6668
121,5.0,121
3851,5.0,3851
90531,5.0,90531
3776,5.0,3776
3775,5.0,3775


In [20]:
movies.loc[movies['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title
119,121,"Boys of St. Vincent, The"
1819,1902,Dream for an Insomniac
3449,3539,"Filth and the Fury, The"
3685,3775,Make Mine Music
3686,3776,Melody Time
3759,3851,I'm the One That I Want
6551,6660,"Red Shoes, The"
6559,6668,"Road Home, The (Wo de fu qin mu qin)"
9064,26801,Dragon Inn (Sun lung moon hak chan)
18106,90531,Shame
