# Step 2: Data Cleaning

Goal: Handle missing values, convert data types, and prepare for analysis.

In [7]:
import pandas as pd

#Load the dataset
df = pd.read_csv("../data/netflix_titles.csv")

In [8]:
#Check missing values
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [9]:
#Fill missing data
df['country'] = df['country'].fillna('Unknown')
df['director'] = df['director'].fillna('Unkown')
df['rating'] = df['rating'].fillna('Unkown')
df['duration'] = df['duration'].fillna('Unkown')
df['cast'] = df['cast'].fillna('Unkown')

In [10]:
#Drop rows with no date_added
df = df.dropna(subset=['date_added'])

#Trim leading and trailing spaces in date_added
df['date_added'] = df['date_added'].str.strip()

In [11]:
#Convert date_added to datetime
df['date_added'] = pd.to_datetime(df['date_added'])

#Extract year from date_added
df['year_added'] = df['date_added'].dt.year

In [12]:
#Standardize column names
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unkown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021
1,s2,TV Show,Blood & Water,Unkown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021
3,s4,TV Show,Jailbirds New Orleans,Unkown,Unkown,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021
4,s5,TV Show,Kota Factory,Unkown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021


In [13]:
#Export cleaned dataset to new csv
df.to_csv("../data/netflix_titles_cleaned.csv", index=False)

### Cleaning summary
- Filled missing country, director, rating, duration, and cast with Unknown
- Removed rows with no date_added
- Converted date_added to datetime
- Created new column year_added
- Normalized column names
- Exported cleaned data for EDA