In [None]:
import io

import pandas as pd
from google.colab import files

# Netflix titles clean-up: upload the raw CSV, clean it, and download the
# cleaned copy as "Cleaned_Netflix.csv".

# files.upload() opens the Colab upload widget and returns a dict that maps
# each uploaded file name to its raw bytes.
uploaded = files.upload()

# Parse the bytes returned by the upload rather than re-opening a fixed path:
# a file uploaded under another name (or a re-upload that Colab stored under a
# de-duplicated name) would otherwise be ignored in favour of whatever
# "netflix_titles.csv" already sits on disk. With no upload at all, fall back
# to that path exactly as before.
DEFAULT_CSV = "netflix_titles.csv"
if DEFAULT_CSV in uploaded:
    csv_source = io.BytesIO(uploaded[DEFAULT_CSV])
elif uploaded:
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    csv_source = DEFAULT_CSV

f = pd.read_csv(csv_source)

# Rename columns to lowercase & replace spaces with underscores. This runs
# before any column is accessed by name so that the lookups below also work
# for headers such as "Date Added".
f.columns = [col.lower().strip().replace(" ", "_") for col in f.columns]

# Display info before cleaning
print("Dataset shape before cleaning:", f.shape)

# Identify missing values
print("\nMissing values per column:")
print(f.isnull().sum())

# Remove duplicate rows
f = f.drop_duplicates()

# Replace missing values in these columns by 'Unknown'
fill_columns = ['director', 'cast', 'country', 'date_added', 'rating', 'duration']
f[fill_columns] = f[fill_columns].fillna("Unknown")

# Standardize text values
# NOTE(review): a dict passed to replace() matches whole cell values only, so
# any cell listing several countries is left unchanged -- confirm whether
# those cells should be standardized as well.
f['country'] = f['country'].replace({
    'United States': 'USA',
    'United Kingdom': 'UK',
    'Canada': 'CA',
    'United Arab Emirates': 'UAE',
})

# Convert 'date_added' to dd-mm-yyyy text. Rows whose date was missing (now
# 'Unknown') cannot be parsed, so they are dropped first.
f['date_added'] = f['date_added'].astype(str).str.strip()
# .copy() makes the filtered frame independent of the original, so the
# assignment below never writes into a view (no SettingWithCopyWarning).
f = f[f['date_added'] != 'Unknown'].copy()
# The column ends up holding formatted strings, not datetime values.
f['date_added'] = pd.to_datetime(f['date_added']).dt.strftime('%d-%m-%Y')

print("\nDataset shape after cleaning:", f.shape)
print("\nCleaned sample:")
print(f.head())

# Save cleaned dataset to CSV and hand it to the browser as a download
f.to_csv("Cleaned_Netflix.csv", index=False)
files.download("Cleaned_Netflix.csv")
