In [1]:
import pandas as pd
import numpy as np

In [2]:
df_movies = pd.read_csv("dataset/movies.csv")
df_tags = pd.read_csv("dataset/tags.csv")
df_ratings = pd.read_csv("dataset/ratings.csv")
df_links = pd.read_csv("dataset/links.csv")

In [3]:
df_movies['genres'] = df_movies['genres'].str.replace('|', ' ', regex=False)

In [4]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
df_movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
df_movies.shape

(9742, 3)

In [7]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
df_tags.isnull().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [9]:
df_tags = df_tags.drop(columns=['timestamp','userId'])

In [10]:
df_tags.shape

(3683, 2)

In [11]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
df_ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [13]:
df_ratings = df_ratings.drop(columns=['timestamp','userId'])

In [14]:
df_ratings.shape

(100836, 2)

In [15]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [16]:
df_links.isnull().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

In [17]:
df_links.shape

(9742, 3)

In [18]:
merged_df = pd.merge(df_movies, df_tags, on='movieId', how='outer')

merged_df = pd.merge(merged_df, df_ratings, on='movieId', how='outer')

merged_df = pd.merge(merged_df, df_links, on='movieId', how='outer')

merged_df.head()

Unnamed: 0,movieId,title,genres,tag,rating,imdbId,tmdbId
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.0,114709,862.0
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.0,114709,862.0
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.5,114709,862.0
3,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,2.5,114709,862.0
4,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,4.5,114709,862.0


In [19]:
merged_df = merged_df.groupby(['movieId', 'title'], as_index=False).agg({
    'tag': lambda x: ' '.join(sorted(set(x.dropna().astype(str)))),
    'rating': 'mean',                                    
    'genres': 'first',
    'imdbId': 'first',
    'tmdbId': 'first',
})
merged_df['rating'] = merged_df['rating'].round(2)

In [20]:
merged_df['content'] = merged_df['genres'].fillna('') + ' ' + merged_df['tag'].fillna('')
merged_df = merged_df.drop(columns=['genres','tag'])

In [21]:
merged_df.head()

Unnamed: 0,movieId,title,rating,imdbId,tmdbId,content
0,1,Toy Story (1995),3.92,114709,862.0,Adventure Animation Children Comedy Fantasy fu...
1,2,Jumanji (1995),3.43,113497,8844.0,Adventure Children Fantasy Robin Williams fant...
2,3,Grumpier Old Men (1995),3.26,113228,15602.0,Comedy Romance moldy old
3,4,Waiting to Exhale (1995),2.36,114885,31357.0,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),3.07,113041,11862.0,Comedy pregnancy remake


In [22]:
merged_df.to_csv("cleaned_movies.csv", index=False)