In [3]:
import pandas as pd

# Load the dataset
df= pd.read_csv('netflix_titles.csv')

# Identify missing values
print("Missing Values:\n", df.isnull().sum())

# Handle missing values (example: fill with 'Not Available')
df.fillna('Not Available', inplace=True)

# Verify all missing values are handled
print("\nAfter Handling Missing Values:\n", df.isnull().sum())

Missing Values:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

After Handling Missing Values:
 show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [4]:
df.drop_duplicates(inplace=True)

# Optional: Check the number of remaining rows
print("Remaining rows after removing duplicates:", len(df))

Remaining rows after removing duplicates: 8807


In [5]:
df['country'] = df['country'].str.lower().str.strip()

# Replace common country variations
df['country'] = df['country'].replace({
    'united states': 'usa',
    'united kingdom': 'uk',
    'south korea': 'korea',
    'russia': 'russian federation',
    'u.s.': 'usa',
    'u.k.': 'uk'
    # Add more mappings as needed
})

# View unique country values after standardization
print(df['country'].unique())

['usa' 'south africa' 'not available' 'india'
 'united states, ghana, burkina faso, united kingdom, germany, ethiopia'
 'uk' 'germany, czech republic' 'mexico' 'turkey' 'australia'
 'united states, india, france' 'finland' 'china, canada, united states'
 'south africa, united states, japan' 'nigeria' 'japan'
 'spain, united states' 'france' 'belgium' 'united kingdom, united states'
 'united states, united kingdom' 'france, united states' 'korea' 'spain'
 'united states, singapore' 'united kingdom, australia, france'
 'united kingdom, australia, france, united states'
 'united states, canada' 'germany, united states'
 'south africa, united states' 'united states, mexico'
 'united states, italy, france, japan'
 'united states, italy, romania, united kingdom'
 'australia, united states' 'argentina, venezuela'
 'united states, united kingdom, canada' 'china, hong kong'
 'russian federation' 'canada' 'hong kong'
 'united states, china, hong kong' 'italy, united states'
 'united states, germ

In [6]:
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [7]:
# Convert 'date_added' to datetime format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Format it to 'dd-mm-yyyy'
df['date_added'] = df['date_added'].dt.strftime('%d-%m-%Y')

# Display first few values to verify
print(df['date_added'].head())

0    25-09-2021
1    24-09-2021
2    24-09-2021
3    24-09-2021
4    24-09-2021
Name: date_added, dtype: object


In [8]:
# Rename columns: lowercase, replace spaces with underscores
df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')

# Display new column names
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [12]:
 print(df['rating'].dtype)

object


In [13]:
print(df['date_added'].dtype)

object
