In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Read the four CSV files (resolved relative to the working directory) into DataFrames.
link_df, movie_df, rating_df, tag_df = map(
    pd.read_csv,
    ('links.csv', 'movies.csv', 'ratings.csv', 'tags.csv'),
)


In [3]:
# Preview the first rows of the links table.
link_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Summarise the links table: column dtypes and non-null counts.
link_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [8]:
# Check whether any row of the links table is duplicated.
link_df.duplicated().any()
# Result: there are no duplicate rows.

False

In [9]:
# Count the missing values in each column of the links table.
link_df.isnull().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

### Rating dataset

In [4]:
# Preview the first rows of the ratings table.
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Summarise the ratings table: column dtypes and non-null counts.
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [12]:
# Check whether any row of the ratings table is duplicated.
rating_df.duplicated().any()

False

In [13]:
# Count the missing values in each column of the ratings table.
rating_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [5]:
# Preview the first rows of the movies table.
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
# Summarise the movies table: column dtypes and non-null counts.
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [15]:
# Check whether any row of the movies table is duplicated.
movie_df.duplicated().any()

False

In [16]:
# Count the missing values in each column of the movies table.
movie_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [19]:
# Flag titles that contain anything other than ASCII letters, digits or whitespace.
# The pattern is a raw string so `\s` reaches the regex engine as written instead of
# being parsed as an (invalid) Python string escape.
special_charcter = movie_df['title'].str.contains(r'[^a-zA-Z0-9\s]', regex=True)
# Display the rows whose title matched.
movie_df[special_charcter]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [22]:
# Flag genre strings that contain anything other than ASCII letters, digits or whitespace.
# The pattern is a raw string so `\s` reaches the regex engine as written instead of
# being parsed as an (invalid) Python string escape.
special_characters = movie_df['genres'].str.contains(r'[^a-zA-Z0-9\s]', regex=True)
# Display the rows whose genres matched.
movie_df[special_characters]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,6,Heat (1995),Action|Crime|Thriller
...,...,...,...
9733,193567,anohana: The Flower We Saw That Day - The Movi...,Animation|Drama
9734,193571,Silver Spoon (2014),Comedy|Drama
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy


In [34]:
# Text cleaner used for the title and genres columns.
def remove_special_characters(text: str) -> str:
    """Replace special characters with spaces and tidy the resulting whitespace.

    Every character that is not an ASCII letter (a-z, A-Z), a digit or
    whitespace is turned into a space, so separators such as ``|`` still keep
    neighbouring words apart. Runs of whitespace are then collapsed to a single
    space and the ends are trimmed, so ``"Toy Story (1995)"`` becomes
    ``"Toy Story 1995"`` rather than ``"Toy Story  1995 "``.

    Note: letters outside a-z/A-Z (for example accented characters) are not in
    the allowed set and are therefore replaced as well.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string if nothing but special characters
        and whitespace was present.
    """
    spaced = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # split() with no arguments drops leading/trailing whitespace and treats any
    # run of whitespace as one separator, so joining yields single spaces only.
    return ' '.join(spaced.split())

In [35]:
# Clean both text columns of movie_df with remove_special_characters.
for text_column in ('title', 'genres'):
    movie_df[text_column] = movie_df[text_column].apply(remove_special_characters)

In [36]:
# Re-check the genres column: an empty result means no special characters remain.
# The pattern is a raw string so `\s` reaches the regex engine as written instead of
# being parsed as an (invalid) Python string escape.
special_characters = movie_df['genres'].str.contains(r'[^a-zA-Z0-9\s]', regex=True)
# Display any rows that still match.
movie_df[special_characters]

Unnamed: 0,movieId,title,genres


In [37]:
# Re-check the title column: an empty result means no special characters remain.
# The pattern is a raw string so `\s` reaches the regex engine as written instead of
# being parsed as an (invalid) Python string escape.
special_charcter = movie_df['title'].str.contains(r'[^a-zA-Z0-9\s]', regex=True)
# Display any rows that still match.
movie_df[special_charcter]

Unnamed: 0,movieId,title,genres


In [38]:
# Preview the movies table after the title and genres columns were cleaned.
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji 1995,Adventure Children Fantasy
2,3,Grumpier Old Men 1995,Comedy Romance
3,4,Waiting to Exhale 1995,Comedy Drama Romance
4,5,Father of the Bride Part II 1995,Comedy


### Merging the movie and rating data

In [40]:
# Strip surrounding whitespace from the column labels of both frames
# (done ahead of the merge on 'movieId').
for frame in (movie_df, rating_df):
    frame.columns = frame.columns.str.strip()

In [43]:
# Inner-join the ratings onto the movies via the shared 'movieId' column.
df = movie_df.merge(rating_df, how='inner', on='movieId')

In [44]:
# Preview the first rows of the merged movie/rating table.
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,1,4.0,964982703
1,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,5,4.0,847434962
2,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,7,4.5,1106635946
3,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,15,2.5,1510577970
4,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,17,4.5,1305696483


In [45]:
# Summarise the merged table: column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [46]:
# Check whether any row of the merged table is duplicated.
df.duplicated().any()

False

In [47]:
# Count the missing values in each column of the merged table.
df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [51]:
# Drop the column that is not needed in the exported data.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because
# 'userId' has already been removed.
df = df.drop(columns='userId', errors='ignore')

In [52]:
# Preview the merged table after dropping 'userId'.
df.head()

Unnamed: 0,movieId,title,genres,rating,timestamp
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,4.0,964982703
1,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,4.0,847434962
2,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,4.5,1106635946
3,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,2.5,1510577970
4,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy,4.5,1305696483


In [54]:
# Export the merged table as line-delimited JSON: one record (row) per line.
df.to_json(path_or_buf='movie_rating.json', orient='records', lines=True)