# Preprocessing data

## 1. Handling missing values

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the movies table from the ml-32m folder (path is relative to the working directory).
movies = pd.read_csv('ml-32m/movies.csv')

In [7]:
# Preview the first two rows of the movies table.
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [8]:
# Load the ratings table from the ml-32m folder (path is relative to the working directory).
ratings = pd.read_csv('ml-32m/ratings.csv')

In [9]:
# Preview the first two rows of the ratings table.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228


In [10]:
# Load the tags table from the ml-32m folder (path is relative to the working directory).
tags = pd.read_csv('ml-32m/tags.csv')

In [11]:
# Preview the first two rows of the tags table.
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297


In [12]:
# Load the links table from the ml-32m folder (path is relative to the working directory).
links = pd.read_csv('ml-32m/links.csv')

In [13]:
# Preview the first two rows of the links table.
links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [14]:
# Missing-value count for every column of the movies table.
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [15]:
# Missing-value count for every column of the tags table.
tags.isna().sum()

userId        0
movieId       0
tag          17
timestamp     0
dtype: int64

In [16]:
# Replace missing tag text with the placeholder "Unknown".
# Rebinding the result (instead of inplace=True) keeps the cell safe to re-run
# and avoids mutating a frame that earlier cells have already displayed.
# NOTE(review): a tag whose real text is "Unknown" would be indistinguishable
# from a filled-in value afterwards.
tags = tags.fillna({'tag': "Unknown"})

In [17]:
# Re-count missing values per column of the tags table.
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

In [18]:
# Show the first two rows whose tag equals the "Unknown" placeholder.
tags.loc[tags['tag'].eq('Unknown')].head(2)

Unnamed: 0,userId,movieId,tag,timestamp
185377,27046,33826,Unknown,1221450908
1394089,89369,281500,Unknown,1670942104


In [19]:
# Missing-value count for every column of the ratings table.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [20]:
# Missing-value count for every column of the links table.
links.isna().sum()

movieId      0
imdbId       0
tmdbId     124
dtype: int64

In [21]:
# Option A: build a copy of `links` in which missing tmdbId values become 0,
# which lets the column be cast to integers; then confirm no nulls remain.
cleaned_links = links.assign(tmdbId=links['tmdbId'].fillna(0).astype(int))
cleaned_links.isna().sum()

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [22]:
# Option B starts from a fresh copy of `links`; first list the rows that
# have a missing value in at least one column.
cleaned_links_2 = links.copy()
missing_value_records = cleaned_links_2.loc[cleaned_links_2.isna().any(axis="columns")]
missing_value_records.head()

Unnamed: 0,movieId,imdbId,tmdbId
706,721,114103,
715,730,125877,
754,770,38426,
775,791,113610,
1080,1107,102336,


In [23]:
# Drop every row whose tmdbId is missing, then confirm no nulls remain.
# The result is rebound rather than dropped with inplace=True, so the cell
# stays chainable and does not mutate the frame behind the reader's back.
cleaned_links_2 = cleaned_links_2.dropna(subset=['tmdbId'])
cleaned_links_2.isnull().sum()

movieId    0
imdbId     0
tmdbId     0
dtype: int64

In [24]:
# Re-select rows with any missing value; shown to check what is left after the drop.
missing_value_records = cleaned_links_2.loc[cleaned_links_2.isna().any(axis="columns")]
missing_value_records.head()

Unnamed: 0,movieId,imdbId,tmdbId


## 2. Removing duplicates

In [None]:
# Print, per table, the number of rows that exactly repeat an earlier row.
for label, frame in (("Movies:", movies), ("Ratings:", ratings), ("Tags:", tags), ("Links:", links)):
    print(label, frame.duplicated().sum())

Movies: 0


In [None]:
# Remove fully duplicated rows from each table.
# Each frame is rebound to the de-duplicated result instead of being mutated
# with inplace=True, which keeps the cell idempotent and chain-friendly.
movies = movies.drop_duplicates()
ratings = ratings.drop_duplicates()
tags = tags.drop_duplicates()
links = links.drop_duplicates()

In [None]:
# Count ratings rows that repeat an earlier (userId, movieId) pair,
# ignoring the rating and timestamp columns.
ratings[['userId', 'movieId']].duplicated().sum()

## 3. Transforming data

#### Extract year from movie title

In [37]:
import re

In [103]:
# Worked example: find a four-digit year wrapped in parentheses.
# Index 0 of the match is the whole matched text, parentheses included.
text = "The God Father (1972)"
pattern = r'\((\d{4})\)'
match = re.compile(pattern).search(text)
match[0]

'(1972)'

In [104]:
# Index 1 is the first capture group: the digits without the parentheses.
match[1]

'1972'

In [118]:
# Count the movie titles that do not contain a parenthesised four-digit year.
# The vectorised `str.contains` replaces the per-row Python lambda. The pattern
# carries no capture group because only presence is tested, and `na=False`
# counts a missing title as "no year" instead of raising.
(~movies['title'].str.contains(r"\(\d{4}\)", regex=True, na=False)).sum()

np.int64(615)

In [120]:
# Start from a copy of `movies`; the copy is the frame that gets the "year"
# column extracted from "title" (NaN where a title has no year).
movies_with_year = movies.copy()

In [122]:
# Extract the first parenthesised four-digit number in each title as the year.
# `str.extract` evaluates the regex once per title (the lambda ran it twice)
# and yields NaN where nothing matches. The title is read from the same frame
# that receives the new column.
movies_with_year['year'] = movies_with_year['title'].str.extract(r"\((\d{4})\)", expand=False)
movies_with_year.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [124]:
# Per-title flag (1 = no parenthesised year found, 0 = year present); show the first five.
movies_with_year['title'].map(lambda title: int(re.search(r"\((\d{4})\)", title) is None)).head(5)

0    0
1    0
2    0
3    0
4    0
Name: title, dtype: int64

In [132]:
# real change
movies['year'] = movies['title'].apply(lambda x: re.search(r"\((\d{4})\)", x).group(1) if re.search(r"\((\d{4})\)", x) else np.nan)
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [133]:
# Preview the ratings table; `timestamp` is still an integer column at this point.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


####  Convert timestamps to a readable datetime format.

In [134]:
# Trial run on a copy: interpret `timestamp` as seconds since the Unix epoch
# and convert it to datetimes, leaving `ratings` untouched.
new_ratings = ratings.assign(timestamp=pd.to_datetime(ratings['timestamp'], unit='s'))
new_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,1999-12-03 19:24:37
1,1,25,1.0,1999-12-03 19:43:48
2,1,29,2.0,1999-11-22 00:36:16
3,1,30,5.0,1999-12-03 19:24:37
4,1,32,5.0,1999-11-22 00:00:58


In [1]:
# Convert the epoch-second `timestamp` column of both tables to datetimes in place.
# NOTE(review): the recorded output of this cell is a NameError raised on a fresh
# kernel; run the import and load cells first, then re-run this cell.
for frame in (ratings, tags):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'], unit='s')

NameError: name 'pd' is not defined

In [137]:
# Preview the first two rows of ratings to inspect the `timestamp` column.
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228


In [139]:
# Preview the first two rows of tags to inspect the `timestamp` column.
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,2020-03-01 05:01:26
1,22,79592,misogyny,2020-02-12 02:58:17


## 4. Normalizing & Scaling features

## 5. Merging datasets

## 6. Saving the cleaned data