# Netflix ratings analysis

## Introduction
This project aims to analyse the ratings of Netflix movies and TV shows.
### Dataset 
The dataset lists all the movies and TV shows available on Netflix as of mid-2021, along with details such as cast, directors, ratings, release year, and duration.
https://www.kaggle.com/datasets/shivamb/netflix-shows

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

### Loading data

In [11]:
# Read the raw catalogue into `df`. Failures are only reported, not re-raised,
# so when loading fails this cell does not assign `df`.
try:
    df = pd.read_csv("netflix_titles.csv")
except FileNotFoundError:
    print("File 'netflix_titles.csv' not found")
except Exception as err:
    print(f"[ERROR] An unexpected error occurred: {type(err).__name__}: {err}")
else:
    print("Data/ Loaded successfully")

Data/ Loaded successfully


### Data Overview

In [None]:
# Structural summary of the raw frame: size, column names, dtypes,
# summary statistics, and the first/last two rows.
n_rows, n_cols = df.shape
print(f"Dimensions:\t{df.shape}")
print(f"Total columns:\t{n_cols}")
print(f"Total rows:\t{n_rows}")
print(f"Column names:\n{df.columns.tolist()}")


df.info()
for preview in (df.describe(), df.head(2), df.tail(2)):
    display(preview)

### Data Cleaning
First convert columns into clean, usable types (e.g. 'date_added' from string to datetime, 'duration' split into a numeric value and a unit), then standardize text columns such as 'rating'.

In [None]:
# Keep an independent snapshot of the raw data so it can be restored later
dff = df.copy(deep=True)

In [None]:
# Column-wise cleaning

# Standardize capitalization after trimming whitespace.
# NOTE: str.title() also lower-cases inner capitals ('TV Show' -> 'Tv Show').
df['type'] = df['type'].str.strip().str.title()

# Remove extra whitespace
df['title'] = df['title'].str.strip()

# Fill empty cells with "Unknown"
df['director'] = df['director'].fillna("Unknown")
df['cast'] = df['cast'].fillna("Unknown")

# Convert from string to datetime.
# Trim string values first: pandas >= 2.0 infers one format from the first
# value and applies it strictly, so an entry padded with whitespace would
# otherwise be silently coerced to NaT. Non-string values (NaN, or timestamps
# when this cell is re-run) are passed through untouched.
date_added_clean = df['date_added'].map(
    lambda value: value.strip() if isinstance(value, str) else value
)
df['date_added'] = pd.to_datetime(date_added_clean, errors='coerce')
# Empty / unparseable dates are left as NaT


# Fill empty cells with 'UNKNOWN', remove blank space, convert to upper case
df['rating'] = df['rating'].fillna('UNKNOWN').str.strip().str.upper()

# The rating column has some values with a very small count;
# every category seen fewer than 10 times is replaced with 'Other'

df_rating_count = df['rating'].value_counts()
rare_rating_count = df_rating_count[df_rating_count < 10]
rare_rating_categories = rare_rating_count.index
df['rating'] = df['rating'].replace(rare_rating_categories, 'Other')
df['rating'].value_counts()
# since there were only 4 'UNKNOWN' values they were replaced with 'Other' too



In [None]:
# clean remaining columns - duration, listed_in, description
# duration column has 'x min' for movies and 'x seasons' for TV shows
# create two new columns separating them: 'duration_time' and 'duration_type'

# column-wise approach: pass each 'duration' value to the helper functions

def extract_time(x):
    """Return the leading number of a duration string as an int.

    Example: ``"90 min"`` -> ``90``. Missing values, non-string inputs and
    strings whose first token is not an integer all yield ``np.nan``.
    """
    try:
        if not pd.isna(x):
            tokens = x.strip().split()
            return int(tokens[0])
    except Exception:
        # Best-effort parse: any failure is treated as "no value".
        pass
    return np.nan
    
def extract_type(x):
    """Return the unit word (second token) of a duration string.

    Example: ``"90 min"`` -> ``"min"``. Missing values, non-string inputs and
    strings with fewer than two tokens all yield ``np.nan``.
    """
    try:
        return np.nan if pd.isna(x) else x.strip().split()[1]
    except Exception:
        # Best-effort parse: any failure is treated as "no value".
        return np.nan
    
# Derive both columns from the raw 'duration' strings, one value at a time
raw_duration = df['duration']
df['duration_time'] = raw_duration.apply(extract_time)
df['duration_type'] = raw_duration.apply(extract_type)
df.head(n=2)


In [None]:
# row-wise approach: pass each whole row to the helper functions
# creating columns 'duration_time' and 'duration_type'
def extract_time(row):
    """Return the leading number of ``row['duration']`` as an int.

    Example: a row whose duration is ``"90 min"`` gives ``90``. A missing
    duration, or any failure while reading or parsing it, gives ``np.nan``.
    """
    try:
        raw_duration = row['duration']
        if pd.isna(raw_duration):
            return np.nan
        first_token = raw_duration.strip().split()[0]
        return int(first_token)
    except Exception:
        # Best-effort parse: any failure is treated as "no value".
        return np.nan
    
def extract_duration(row):
    """Return the unit word (second token) of ``row['duration']``.

    Example: a row whose duration is ``"2 Seasons"`` gives ``"Seasons"``. A
    missing duration, or any failure while reading or parsing it, gives
    ``np.nan``.
    """
    try:
        raw_duration = row['duration']
        if pd.isna(raw_duration):
            return np.nan
        return raw_duration.strip().split()[1]
    except Exception:
        # Best-effort parse: any failure is treated as "no value".
        return np.nan

# Build both derived columns by handing every row to the helpers
df['duration_time'] = df.apply(extract_time, axis='columns')
df['duration_type'] = df.apply(extract_duration, axis='columns')

# Standardize duration units:
# collapse plural / lower-case spellings into a single label per unit
unit_aliases = {'mins': 'min', 'Seasons': 'Season', 'seasons': 'Season'}
df['duration_type'] = df['duration_type'].replace(unit_aliases)


In [None]:
# Trim surrounding whitespace from the remaining free-text columns
for text_column in ('listed_in', 'description'):
    df[text_column] = df[text_column].str.strip()


In [None]:
# Remove fully duplicated rows in place; ignore_index=True renumbers the
# remaining rows 0..n-1 and discards the old index
df.drop_duplicates(inplace=True, ignore_index=True)


### Save cleaned data into new csv file 'netflix_cleaned.csv'

In [None]:
# Write the cleaned frame to disk.
# NOTE(review): the row index is written as an unnamed first column, so a plain
# pd.read_csv() of this file yields an extra 'Unnamed: 0' column — confirm
# whether downstream readers rely on it before switching to index=False.
df.to_csv("netflix_cleaned.csv", index=True)