In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Notebook-wide plotting defaults: seaborn "whitegrid" theme first,
# then a 10x6 inch default figure size for every matplotlib figure.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Load the titles table; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="netflix_titles.csv")

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns), shown as the cell output.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### **Insights from the data:**
#### The `director` column has the most **missing values**,
#### followed by `cast` and `country`.
#### `date_added`, `rating` and `duration` have only a few **missing values**.
#### The most common data type is **`object`**.

In [None]:
# Filling missing values:
# replace NaN in the descriptive text columns with an explicit placeholder
# label, so the gaps stay visible and can be filtered out by name.
placeholder_by_column = {
    'director': "Not given",
    'cast': "Not given",
    'country': "Unknown",
}

for column_name, placeholder in placeholder_by_column.items():
    df[column_name] = df[column_name].fillna(placeholder)

In [None]:
# Parse `date_added` into datetime64 values.
#
# errors="coerce" turns every string that cannot be parsed into NaT without
# raising or warning (missing values become NaT regardless of `errors`).
# A date padded with stray whitespace may fail to match the format pandas
# infers from the other rows and would then be dropped silently, so text
# values are stripped before parsing.
# NOTE(review): confirm whether the CSV actually contains padded dates.
date_added = df["date_added"]
if pd.api.types.is_string_dtype(date_added.dtype):
    # Only raw text has a `.str` accessor; on a re-run the column is already
    # datetime64 and is passed through to_datetime unchanged.
    date_added = date_added.str.strip()

df["date_added"] = pd.to_datetime(date_added, errors="coerce")

## **EDA (Exploratory Data Analysis)**

In [None]:
# 1. Count of Movies vs TV Shows
# One bar per distinct value of `type`; `hue` repeats the x variable so that
# a palette can be applied to the bars.
type_ax = sns.countplot(x='type', hue='type', palette='Blues', data=df)
type_ax.set_title("Count of TV shows and Movies")
plt.show()

In [None]:
from collections import Counter

In [None]:
# 2. Top 10 Genres (listed under 'listed_in')
# Each `listed_in` entry holds one or more comma-separated genre labels.
# Every entry is split on its own and each label is trimmed, so the last genre
# of one title can never be fused with the first genre of the next title
# (which is what joining all rows with "," and splitting on ", " produced).
genre = [
    label.strip()
    for entry in df['listed_in'].dropna()
    for label in entry.split(",")
    if label.strip()  # skip empty fragments left by stray commas
]
genre_counts = Counter(genre)

# Dummy constant 'Category' column: it is passed as `hue` so that `palette`
# can be used without a seaborn deprecation warning; its legend is hidden.
top_genres_df = pd.DataFrame(genre_counts.most_common(10), columns=["Genre", "Count"])
top_genres_df["Category"] = "Top 10"

sns.barplot(data=top_genres_df, x="Count", y="Genre", hue="Category", palette="Reds", legend=False)
plt.title("Top 10 Most Common Genres")
plt.xlabel("Number of Titles")
plt.ylabel("Genre")
plt.show()

In [None]:
# 3. Content added over the years

# Calendar parts of the parsed `date_added`; rows whose date is NaT get NaN here.
df["year_added"] = df["date_added"].dt.year
df["month_added"] = df["date_added"].dt.month

# Years are whole numbers, so draw one unit-width bar centred on each year
# (discrete=True). A fixed bin count puts the bin edges at fractional years,
# which misaligns the bars with the year ticks and can leave empty gaps.
sns.histplot(data=df, x="year_added", hue="type", multiple="stack", discrete=True)
plt.title("Content Added to Netflix by Year")
plt.xlabel("Year")
plt.ylabel("Count")
plt.show()

In [None]:
# 4. Countries with most content
# Rows carrying the "Unknown" placeholder are excluded before counting.
# NOTE(review): every distinct `country` string is one category, so a title
# listing several countries in one string is counted under that combination —
# confirm this is the intended granularity.
known_countries = df.loc[df["country"] != "Unknown", "country"]
top_countries = known_countries.value_counts().head(10)

# `hue` duplicates the y variable only so that `palette` can be used without a
# warning; the legend would just repeat the axis labels, so it is switched off.
sns.barplot(
    y=top_countries.index,
    x=top_countries.values,
    hue=top_countries.index,
    palette="magma",
    legend=False,
)
plt.title("Top 10 Countries by Number of Titles")
plt.xlabel("Number of Titles")
plt.ylabel("Country")
plt.show()

In [None]:
# 5. Most frequent directors

# "Not given" is the placeholder written for missing directors, not a real
# name, so it is removed from the ranking. errors="ignore" keeps the cell from
# raising KeyError when the placeholder does not occur in the column at all.
directors = df["director"].value_counts().drop("Not given", errors="ignore").head(10)

# `hue` duplicates the y variable only so that `palette` can be applied.
sns.barplot(y=directors.index, x=directors.values, hue=directors.index, palette='pastel', legend=False)
plt.title("Top 10 Directors on Netflix")
plt.xlabel("Number of Titles")
plt.ylabel("Director")
plt.show()