## Install and Import

In [None]:
!pip install pandas matplotlib seaborn plotly --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import re
from google.colab import files


## Theme setup

In [None]:
# Shared look for static charts: seaborn's white-grid style first, then the
# default Matplotlib figure size (set afterwards, as in the original order).
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

## Upload and Load Dataset

In [None]:
# Ask for a CSV through Colab's upload dialog and load it into a DataFrame.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was cancelled): fail with a clear
    # message instead of an IndexError on the empty mapping.
    raise ValueError("No file was uploaded; re-run this cell and choose the dataset CSV.")
# Only the first uploaded file is read, even if several were selected.
file_name = next(iter(uploaded))
df = pd.read_csv(file_name)

Saving Netflix Dataset (12).csv to Netflix Dataset (12).csv


In [None]:
# Confirm the load, report the frame's dimensions, and preview the first rows.
row_count, column_count = df.shape
print(" Data Loaded Successfully")
print("Rows:", row_count, "Columns:", column_count)
df.head()

 Data Loaded Successfully
Rows: 7789 Columns: 11


Unnamed: 0,Show_Id,Category,Title,Director,Cast,Country,Release_Date,Rating,Duration,Type,Description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",PG-13,123 min,Dramas,A brilliant group of students become card-coun...


## Data Preprocessing


In [None]:
# Normalise the headers: trim surrounding whitespace and replace spaces with underscores.
df.columns = df.columns.str.strip().str.replace(' ', '_')

# Handle missing/blank values.
# The regex pass turns every empty or whitespace-only cell (any number of
# spaces/tabs, not just '' and ' ') into NaN; the second pass maps the literal
# missing-value tokens. Only string cells can match, so numeric columns are untouched.
df = (
    df.replace(r'^\s*$', np.nan, regex=True)
      .replace(['NA', 'NaN', 'nan'], np.nan)
)


In [None]:
# Extract year
def extract_year(x):
    """Return the calendar year parsed from a date-like value.

    Parameters
    ----------
    x : object
        A single date value, e.g. the string "August 14, 2020".

    Returns
    -------
    int or float
        The year as an int when ``x`` parses as a date; ``np.nan`` when it is
        missing or cannot be parsed.
    """
    try:
        # errors='coerce' turns unparseable input into NaT, whose .year is NaN.
        return pd.to_datetime(x, errors='coerce').year
    except (AttributeError, TypeError, ValueError, OverflowError):
        # AttributeError: pd.to_datetime(None) returns None, which has no .year.
        # The other types cover inputs pandas refuses outright. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return np.nan
# Derive one year per row from Release_Date, element by element.
df['release_year'] = df['Release_Date'].map(extract_year)

In [None]:
# Clean category names: trim whitespace and title-case the label
# (so "TV Show" becomes "Tv Show", the spelling later cells compare against).
# Missing values are remembered first and restored afterwards; otherwise
# astype(str) would turn NaN into the bogus category label "Nan".
category_missing = df['Category'].isna()
df['Category'] = (
    df['Category'].astype(str).str.strip().str.title().mask(category_missing)
)

# Convert movie durations to numeric minutes
def extract_minutes(x):
    """Return the number of minutes found in a duration string such as "93 min".

    Missing values, and text without a "<digits> min" part (matched
    case-insensitively, e.g. "4 Seasons"), yield ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    found = re.search(r'(\d+)\s*min', str(x), re.IGNORECASE)
    if found is None:
        return np.nan
    return int(found.group(1))
# Numeric minutes per row; rows whose Duration has no "min" part become NaN.
df['duration_minutes'] = df['Duration'].map(extract_minutes)

In [None]:
# Fill missing categorical data with explicit placeholder labels so these
# columns contain no NaN in later grouping and counting cells.
categorical_defaults = {
    'Country': 'Unknown',
    'Rating': 'Not Rated',
    'Type': 'Unspecified',
}
df.fillna(value=categorical_defaults, inplace=True)



In [None]:
# Announce that preprocessing has finished, then print the per-column
# non-null counts and dtypes as a quick check of the cleaned frame.
print("\n✅ Preprocessing done.")
df.info()



✅ Preprocessing done.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7789 entries, 0 to 7788
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Show_Id           7789 non-null   object 
 1   Category          7789 non-null   object 
 2   Title             7789 non-null   object 
 3   Director          5401 non-null   object 
 4   Cast              7071 non-null   object 
 5   Country           7789 non-null   object 
 6   Release_Date      7779 non-null   object 
 7   Rating            7789 non-null   object 
 8   Duration          7789 non-null   object 
 9   Type              7789 non-null   object 
 10  Description       7789 non-null   object 
 11  release_year      7779 non-null   float64
 12  duration_minutes  5379 non-null   float64
dtypes: float64(2), object(11)
memory usage: 791.2+ KB


## Interactive Visualizations

In [None]:
# Movies vs TV Shows: one histogram bar per Category value, coloured by category.
category_palette = ['#636EFA', '#EF553B']
fig1 = px.histogram(
    data_frame=df,
    x="Category",
    color="Category",
    color_discrete_sequence=category_palette,
    title="Movies vs TV Shows on Netflix",
)
fig1.show()

In [None]:
# Titles added per year: count rows per release_year (value_counts skips
# missing years) and sort by year so the bars run chronologically.
year_counts = df['release_year'].value_counts().sort_index()
years, titles_per_year = year_counts.index, year_counts.values
fig2 = px.bar(
    x=years,
    y=titles_per_year,
    color=titles_per_year,
    color_continuous_scale='viridis',
    labels={'x': 'Year', 'y': 'Number of Titles'},
    title="Netflix Content Growth Over Years",
)
fig2.show()

In [None]:
# Top 10 Countries
# NOTE(review): Country cells appear to hold comma-separated lists of countries
# (the notebook reports 682 distinct Country values), so counting the raw
# strings ranks combinations rather than countries. Split and explode them,
# as the genre chart does with Type, so each country is credited individually.
country_names = (
    df['Country'].dropna().astype(str).str.split(',').explode().str.strip()
)
# Drop blank tokens and the 'Unknown' placeholder written by the missing-value
# fill; neither is a real country.
country_names = country_names[(country_names != '') & (country_names != 'Unknown')]

top_countries = country_names.value_counts().head(10)
fig3 = px.bar(x=top_countries.values, y=top_countries.index,
              orientation='h', title="Top 10 Content-Producing Countries",
              labels={'x':'Number of Titles', 'y':'Country'},
              color=top_countries.values, color_continuous_scale='plasma')
fig3.show()

In [None]:
# Top 10 Genres
# Type holds a comma-separated genre list per title: split it, give every genre
# its own row, and trim the whitespace left around each name.
df_genres = df.dropna(subset=['Type']).copy()
df_genres['Genre'] = df_genres['Type'].str.split(',')
df_genres = df_genres.explode('Genre')
df_genres['Genre'] = df_genres['Genre'].str.strip()
# The missing-value fill replaces NaN in Type with 'Unspecified', so dropna
# above no longer removes those rows. Filter the placeholder (and blank tokens
# from stray commas) explicitly so they cannot be ranked as genres.
df_genres = df_genres[~df_genres['Genre'].isin(['', 'Unspecified'])]

top_genres = df_genres['Genre'].value_counts().head(10)
fig4 = px.bar(x=top_genres.values, y=top_genres.index,
              orientation='h', title="Top 10 Genres on Netflix",
              # Axis labels match the country chart; without them the axes read "x"/"y".
              labels={'x': 'Number of Titles', 'y': 'Genre'},
              color=top_genres.values, color_continuous_scale='magma')
fig4.show()

In [None]:
# Interactive timeline: number of titles per (release_year, Category) pair,
# drawn as one marked line per category.
trend = (
    df.groupby(['release_year', 'Category'])
      .size()
      .reset_index(name='Count')
)
fig5 = px.line(
    trend,
    x='release_year',
    y='Count',
    color='Category',
    title="Trend of Movies vs TV Shows Over Years",
    markers=True,
)
fig5.show()

## Data Filtering with Interactivity

In [None]:
# Example filter widgets (useful in Colab)
# Strip the input so stray spaces around the name do not prevent a match.
country_input = input("Enter a country to analyze (e.g., India): ").strip().title()

if country_input:
    # regex=False treats the input as literal text, so characters such as '('
    # or '+' cannot raise a regex error or act as wildcards.
    # This is still a case-insensitive substring match, so a name contained in
    # another name (e.g. "Niger" in "Nigeria") also matches.
    country_mask = df['Country'].str.contains(
        country_input, case=False, na=False, regex=False
    )
else:
    # An empty pattern would match every row; select nothing instead.
    country_mask = pd.Series(False, index=df.index)

filtered = df[country_mask]
print(f"\n🎬 Showing first 10 records for {country_input}:")
display(filtered[['Title','Category','release_year','Rating','Type']].head(10))

Enter a country to analyze (e.g., India): India

🎬 Showing first 10 records for India:


Unnamed: 0,Title,Category,release_year,Rating,Type
8,706,Movie,2019.0,TV-14,"Horror Movies, International Movies"
9,1920,Movie,2017.0,TV-MA,"Horror Movies, International Movies, Thrillers"
18,15-Aug,Movie,2019.0,TV-14,"Comedies, Dramas, Independent Movies"
20,​​Kuch Bheege Alfaaz,Movie,2018.0,TV-14,"Dramas, Independent Movies, International Movies"
21,​Goli Soda 2,Movie,2018.0,TV-14,"Action & Adventure, Dramas, International Movies"
22,​Maj Rati ​​Keteki,Movie,2018.0,TV-14,"Dramas, International Movies"
23,​Mayurakshi,Movie,2018.0,TV-14,"Dramas, International Movies"
59,1000 Rupee Note,Movie,2016.0,TV-14,"Dramas, International Movies"
78,2 States,Movie,2018.0,TV-PG,"Comedies, Dramas, International Movies"
86,21 Sarfarosh: Saragarhi 1897,Tv Show,2018.0,TV-14,"International TV Shows, TV Dramas"


In [None]:
# Plot trend for that country: titles per (release_year, Category) for the
# rows selected by the country filter.
if not filtered.empty:
    c_trend = filtered.groupby(['release_year','Category']).size().reset_index(name='Count')
    # markers=True matches the overall trend chart and keeps a category with a
    # single year visible; a one-point line trace draws nothing without markers.
    fig6 = px.line(c_trend, x='release_year', y='Count', color='Category',
                   markers=True,
                   title=f"{country_input} - Movies vs TV Shows Trend")
    fig6.show()
else:
    print(f"No records found for {country_input}")

## Automated Insights Summary

In [None]:
print("AUTOMATED INSIGHTS")

total_titles = len(df)

# Compare categories case-insensitively so the counts do not depend on the
# exact casing produced by the cleaning step ("Tv Show" vs "TV Show").
category_key = df['Category'].astype(str).str.strip().str.lower()
movies = int((category_key == 'movie').sum())
tvshows = int((category_key == 'tv show').sum())

# NOTE(review): Country cells appear to hold comma-separated lists, so count
# individual countries and skip blanks and the 'Unknown' placeholder.
country_names = (
    df['Country'].dropna().astype(str).str.split(',').explode().str.strip()
)
country_names = country_names[(country_names != '') & (country_names != 'Unknown')]
country_counts = country_names.value_counts()
# idxmax() raises on an empty Series, so fall back to a placeholder.
top_country = country_counts.idxmax() if not country_counts.empty else 'N/A'

# df_genres is the exploded per-genre frame built by the genre chart cell.
genre_counts = df_genres['Genre'].value_counts()
top_genre = genre_counts.idxmax() if not genre_counts.empty else 'N/A'

# max() is NaN when no year could be parsed; int(NaN) would raise.
latest_year_value = df['release_year'].max()
latest_year = int(latest_year_value) if pd.notna(latest_year_value) else 'N/A'

# Guard the percentages against an empty frame (division by zero).
movie_pct = movies / total_titles * 100 if total_titles else 0.0
tvshow_pct = tvshows / total_titles * 100 if total_titles else 0.0

print(f"• Total content analyzed: {total_titles} titles.")
print(f"• Movies: {movies} ({movie_pct:.1f}%) | TV Shows: {tvshows} ({tvshow_pct:.1f}%)")
print(f"• Top contributing country: {top_country}")
print(f"• Most common genre: {top_genre}")
print(f"• Most recent data year: {latest_year}")

AUTOMATED INSIGHTS
• Total content analyzed: 7789 titles.
• Movies: 5379 (69.1%) | TV Shows: 2410 (30.9%)
• Top contributing country: United States
• Most common genre: International Movies
• Most recent data year: 2021


In [None]:
# Trend pattern insight: compare the title count of the latest year with the
# year before it.
# NOTE(review): if the dataset ends partway through its latest year, that
# year's count is incomplete and a "decrease" may be an artefact — confirm.
growth = df['release_year'].value_counts().sort_index()
if len(growth) < 2:
    # Fewer than two distinct years: iloc[-2] would raise IndexError.
    print("• Not enough yearly data to assess the content-addition trend.")
elif growth.iloc[-1] > growth.iloc[-2]:
    print("• Content addition increased in the latest year — steady growth trend.")
elif growth.iloc[-1] < growth.iloc[-2]:
    print("• Content addition decreased recently — possible slowdown in new additions.")
else:
    # Equal counts were previously reported as a decrease.
    print("• Content addition held steady in the latest year.")

• Content addition decreased recently — possible slowdown in new additions.


In [None]:
# Country diversity insight
# nunique() on the raw column counts distinct Country *strings*; the 682
# "countries" it reported indicates combined values are being counted.
# NOTE(review): assumes multi-country cells are comma-separated — confirm.
# Split and explode the cells so each country is counted once, and drop
# blanks and the 'Unknown' placeholder written by the missing-value fill.
country_names = (
    df['Country'].dropna().astype(str).str.split(',').explode().str.strip()
)
country_names = country_names[(country_names != '') & (country_names != 'Unknown')]
unique_countries = country_names.nunique()
if unique_countries > 100:
    print(f"• Netflix has highly global content presence ({unique_countries} countries).")
else:
    print(f"• Content mostly comes from a limited set of {unique_countries} countries.")


• Netflix has highly global content presence (682 countries).


In [None]:
# Duration insight: mean of the parsed minute values, ignoring rows without one.
avg_dur = df['duration_minutes'].mean(skipna=True)
# mean() is NaN when no duration could be parsed. NaN is truthy, so the old
# `if avg_dur:` printed "nan minutes", and it skipped a genuine 0.0 average.
if pd.notna(avg_dur):
    print(f"• Average movie duration: {avg_dur:.1f} minutes.")

print("\n✅ Interactive analysis and insights complete!")

• Average movie duration: 99.3 minutes.

✅ Interactive analysis and insights complete!
