In [1]:
# Import all necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise.prediction_algorithms import *
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import numpy as np
import datetime as dt

In [3]:
# Read in the 'links.csv'.
df_links = pd.read_csv("Data/links.csv")
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
# Checking the data.
df_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [8]:
# Checking the movie ID column for duplicates.
df_links['movieId'].duplicated().sum()

0

# Movies.csv

In [9]:
# Read in the 'movies.csv'.
df_movies = pd.read_csv("Data/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Checking the data.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [11]:
# Checking the movie ID column for duplicates.
df_movies['movieId'].duplicated().sum()

0

# Rating.csv

In [12]:
# Read in the 'ratings.csv'.
df_ratings = pd.read_csv("Data/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [13]:
# Checking the data
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [14]:
# Checking the movie ID column for duplicates.
df_ratings['movieId'].duplicated().sum()

91112

In [15]:
# Checking the number of user ID's
df_ratings['userId']

0           1
1           1
2           1
3           1
4           1
         ... 
100831    610
100832    610
100833    610
100834    610
100835    610
Name: userId, Length: 100836, dtype: int64

In [16]:
# Double checking user ID's by counting the amount of users.
print(len(df_ratings['userId'].value_counts() > 1))

610


In [17]:
# Looking at the denomination breakdown of ratings.
print((df_ratings['rating'].value_counts()))

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64


# Tags.csv

In [18]:
# Read in 'tags.csv'.
df_tags = pd.read_csv("Data/tags.csv")
df_tags.head()


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [19]:
# Checking the tags data.
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [20]:
# Checking the movie ID column for duplicates.
df_tags['movieId'].duplicated().sum()

2111

In [21]:
# Checking how many moveId's have a value count equal to 1.
df_tagid = df_tags['movieId']
df_tagid = df_tagid.to_frame()
print(len(df_tagid['movieId'].value_counts()==1))


1572


In [22]:
# Counting the number of userId's
print(len(df_tags['userId'].value_counts() > 1))

58


# Merging Dataframes

In [23]:
# Merge 'links' and 'movie' dataframes together and check new dataframe.
movies_df = df_links.merge(df_movies, on='movieId')
movies_df = movies_df.dropna()
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9734 entries, 0 to 9741
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9734 non-null   int64  
 1   imdbId   9734 non-null   int64  
 2   tmdbId   9734 non-null   float64
 3   title    9734 non-null   object 
 4   genres   9734 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 456.3+ KB


In [24]:
# Check to see if there are any movieId duplicates.
movies_df['movieId'].duplicated().sum()

0

In [25]:
# Merge new dataframe with the 'ratings.csv'
movie_df = movies_df.merge(df_ratings, on='movieId')
movie_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100823 entries, 0 to 100822
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100823 non-null  int64  
 1   imdbId     100823 non-null  int64  
 2   tmdbId     100823 non-null  float64
 3   title      100823 non-null  object 
 4   genres     100823 non-null  object 
 5   userId     100823 non-null  int64  
 6   rating     100823 non-null  float64
 7   timestamp  100823 non-null  int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 6.9+ MB


In [26]:
# Checking to see how many movieIds are duplicated
movie_df['movieId'].duplicated().sum()


91107

In [27]:
print("In this merged dataset we have", len(movie_df['movieId']) - movie_df['movieId'].duplicated().sum(), "individual movie titles.")

In this merged dataset we have 9716 individual movie titles.


In [28]:
movie_df.describe()

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,timestamp
count,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0
mean,19435.437737,351570.4,20105.462633,326.130823,3.501637,1205945000.0
std,35532.291269,622092.2,53274.14362,182.618176,1.04243,216261300.0
min,1.0,417.0,2.0,1.0,0.5,828124600.0
25%,1199.0,99685.0,712.0,177.0,3.0,1019124000.0
50%,2991.0,118771.0,6957.0,325.0,3.5,1186087000.0
75%,8120.0,314979.0,11635.0,477.0,4.0,1435994000.0
max,193609.0,8391976.0,525662.0,610.0,5.0,1537799000.0


In [29]:
# We are dropping the 'imdbId' and 'tmdbId' columns because they point to information 
# we do not have access to.
df_cleaned = movie_df.drop(columns= ['imdbId', 'tmdbId'], axis=1)


In [30]:
# Checking the new dataframe.
df_cleaned.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 100823 entries, 0 to 100822
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100823 non-null  int64  
 1   title      100823 non-null  object 
 2   genres     100823 non-null  object 
 3   userId     100823 non-null  int64  
 4   rating     100823 non-null  float64
 5   timestamp  100823 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [31]:
# Checking the value counts for different genre combinations.
df_cleaned['genres'].value_counts()


Comedy                                      7194
Drama                                       6290
Comedy|Romance                              3967
Comedy|Drama|Romance                        3000
Comedy|Drama                                2851
                                            ... 
Horror|Sci-Fi|Western                          1
Comedy|Crime|Horror|Mystery|Thriller           1
Crime|Drama|Film-Noir|Romance|Thriller         1
Animation|Children|Comedy|Drama                1
Animation|Children|Comedy|Musical|Sci-Fi       1
Name: genres, Length: 951, dtype: int64