# 🎬 Netflix Content Analytics Notebook (Code Sections Only)

This notebook focuses on the code implementation for Netflix content analytics, including data loading, cleaning, analytics, visualization, recommendations, and report export.

## 1. Setup and Import Libraries

Import required Python libraries and set up plotting styles.

In [None]:
# Import required libraries
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# Matplotlib 3.6 renamed the bundled seaborn style sheets with a 'seaborn-v0_8-'
# prefix. Try the new name first and fall back to the legacy one so this cell
# also runs on older installs instead of raising on an unknown style name.
# If neither is available the Matplotlib default style is kept.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# 'talk' scales fonts and line widths up relative to seaborn's default context.
sns.set_context('talk')

ModuleNotFoundError: No module named 'seaborn'

## 2. Find and Load Netflix Dataset

Define file paths, check for dataset existence, and load the CSV file into a pandas DataFrame.

In [None]:
# Candidate dataset locations, tried in order: next to the notebook first,
# then under src/data/.
DATA_PATHS = [
    os.path.join(*segments)
    for segments in (
        ('Netflix Dataset.csv',),
        ('src', 'data', 'Netflix Dataset.csv'),
    )
]

def find_data_path(paths):
    """Return the first entry of `paths` that exists on disk, or None.

    Args:
        paths: Iterable of candidate file paths, checked in order.

    Returns:
        The first path for which ``os.path.exists`` is true, or ``None``
        when no candidate exists (including when `paths` is empty).
    """
    return next((candidate for candidate in paths if os.path.exists(candidate)), None)

# Resolve the dataset location and load it; stop with a clear error when none
# of the candidate paths exists.
data_path = find_data_path(DATA_PATHS)
if data_path is not None:
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError("Netflix Dataset.csv not found in project/ or src/data/.")

## 3. Prepare and Normalize Data

Clean column names, normalize data types, and ensure all expected columns exist.

In [None]:
# Normalise headers to snake_case: trim, lower-case, spaces -> underscores.
normalised_names = []
for raw_name in df.columns:
    normalised_names.append(raw_name.strip().lower().replace(' ', '_'))
df.columns = normalised_names

# Coerce typed columns when present; unparseable values become NaN / NaT.
for typed_col, converter in (
    ('release_year', pd.to_numeric),
    ('date_added', pd.to_datetime),
):
    if typed_col in df.columns:
        df[typed_col] = converter(df[typed_col], errors='coerce')

# Later cells index these columns directly, so create any that are absent
# (filled with NaN) rather than failing with a KeyError further down.
expected_cols = [
    'show_id', 'type', 'title', 'director', 'cast', 'country',
    'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description',
]
missing_cols = [name for name in expected_cols if name not in df.columns]
for col in missing_cols:
    df[col] = np.nan

## 4. Compute KPIs & Executive Summary

Calculate total content, split by type, year range, top countries, top genres, and rating distribution.

In [None]:
# Headline counts: catalogue size and the Movie / TV Show split.
total_content = len(df)
total_movies = df['type'].eq('Movie').sum()
total_tv = df['type'].eq('TV Show').sum()

# Release-year span; both ends are None when no year could be parsed.
years = df['release_year'].dropna().astype(int)
if len(years):
    year_start, year_end = int(years.min()), int(years.max())
else:
    year_start = year_end = None

def _count_labels(values):
    """Count comma-separated labels across `values`.

    Each value is split on ',', every piece is stripped, and empty pieces are
    ignored. Returns a dict mapping label -> number of occurrences, with keys
    in first-seen order.
    """
    tally = {}
    for cell in values:
        for piece in str(cell).split(','):
            label = piece.strip()
            if label:
                tally[label] = tally.get(label, 0) + 1
    return tally


def _by_count(item):
    """Sort key: the count of a (label, count) pair."""
    return item[1]


# Top countries (a title with several countries counts once for each).
country_counts = _count_labels(df['country'].fillna(''))
top_countries = sorted(country_counts.items(), key=_by_count, reverse=True)[:10]

# Top genres (same counting rule, applied to 'listed_in').
genre_counts = _count_labels(df['listed_in'].fillna(''))
top_genres = sorted(genre_counts.items(), key=_by_count, reverse=True)[:10]

# Rating distribution; missing ratings are counted under the empty-string label.
rating_counts = df['rating'].fillna('').value_counts()

# Bundle every KPI computed above into one dict consumed by the report cell.
summary = dict(
    total_content=total_content,
    total_movies=total_movies,
    total_tv_shows=total_tv,
    year_range=(year_start, year_end),
    top_countries=top_countries,
    top_genres=top_genres,
    rating_distribution=rating_counts.to_dict(),
)

## 5. Visualize Trends & Distributions

Plot the number of titles per release year (Movies vs TV Shows), the rating distribution (top 10), the top 10 countries, and the top 10 genres.

In [None]:
# Titles per release year, one line per content type.
by_year = df.dropna(subset=['release_year']).assign(
    release_year=lambda frame: frame['release_year'].astype(int)
)
pivot = by_year.pivot_table(
    index='release_year',
    columns='type',
    values='show_id',
    aggfunc='count',
    fill_value=0,
)

fig, ax = plt.subplots(figsize=(10, 5))
pivot.plot.line(ax=ax)
ax.set(
    title='Content Growth Trend (Movies vs TV Shows)',
    xlabel='Year',
    ylabel='Count',
)
plt.show()

# Ten most frequent ratings as a vertical bar chart.
top_ratings = rating_counts.head(10)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=top_ratings.index, y=top_ratings.values, ax=ax, color='#e50914')
ax.set(
    title='Content Rating Distribution (Top 10)',
    xlabel='Rating',
    ylabel='Count',
)
plt.show()

# Ten most frequent countries as a horizontal bar chart.
tc_df = pd.DataFrame(top_countries, columns=['country', 'count'])
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=tc_df, x='count', y='country', ax=ax, color='#0080ff')
ax.set(
    title='Top 10 Content-Producing Countries',
    xlabel='Count',
    ylabel='Country',
)
plt.show()

# Ten most frequent genres as a horizontal bar chart.
tg_df = pd.DataFrame(top_genres, columns=['genre', 'count'])
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=tg_df, x='count', y='genre', ax=ax, color='#00c853')
ax.set(
    title='Top 10 Genres',
    xlabel='Count',
    ylabel='Genre',
)
plt.show()

## 6. Calculate Diversity Metrics (Yearly)

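Compute a diversity score for each release year (capped at 100) from the number of unique genres, the number of unique countries, and the Movie/TV Show balance among that year's titles.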

In [None]:
# Diversity score parameters. Each component is (count / target) * weight and
# the rounded sum is capped at DIVERSITY_MAX_SCORE.
DIVERSITY_GENRE_TARGET = 15     # genre count at which the genre part equals its weight
DIVERSITY_COUNTRY_TARGET = 20   # country count at which the country part equals its weight
DIVERSITY_GENRE_WEIGHT = 40
DIVERSITY_COUNTRY_WEIGHT = 40
DIVERSITY_TYPE_WEIGHT = 20
DIVERSITY_MAX_SCORE = 100


def _split_labels(value):
    """Return the stripped, non-empty comma-separated labels in `value`.

    Missing values (NaN / None) yield an empty list. Without this guard
    ``str(nan)`` would become a bogus label named 'nan' and inflate the
    per-year genre and country counts.
    """
    if pd.isna(value):
        return []
    return [part.strip() for part in str(value).split(',') if part.strip()]


# Per release year: the distinct genres and countries seen, plus type counts.
yearly = {}
for _, row in df.dropna(subset=['release_year']).iterrows():
    y = int(row['release_year'])
    bucket = yearly.setdefault(y, {
        'genres': set(),
        'countries': set(),
        'movies': 0,
        'tv': 0
    })
    bucket['genres'].update(_split_labels(row['listed_in']))
    bucket['countries'].update(_split_labels(row['country']))
    if row['type'] == 'Movie':
        bucket['movies'] += 1
    elif row['type'] == 'TV Show':
        bucket['tv'] += 1

rows = []
for y, d in yearly.items():
    total = d['movies'] + d['tv']
    # Share of the minority type. It peaks at 0.5 for an even split, so this
    # component contributes at most half of DIVERSITY_TYPE_WEIGHT.
    # NOTE(review): confirm that ceiling is intended.
    type_balance = (min(d['movies'], d['tv']) / total) if total > 0 else 0
    score = round(
        (len(d['genres']) / DIVERSITY_GENRE_TARGET) * DIVERSITY_GENRE_WEIGHT
        + (len(d['countries']) / DIVERSITY_COUNTRY_TARGET) * DIVERSITY_COUNTRY_WEIGHT
        + type_balance * DIVERSITY_TYPE_WEIGHT
    )
    rows.append([y, len(d['genres']), len(d['countries']), min(DIVERSITY_MAX_SCORE, score)])

div_df = pd.DataFrame(rows, columns=['year','genre_count','country_count','diversity_score']).sort_values('year')

## 7. Plot Diversity Score by Year

Visualize the diversity score over time using a line plot.

In [None]:
# Diversity score per release year, one marker per year.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=div_df, x='year', y='diversity_score', ax=ax, marker='o')
ax.set(
    title='Diversity Score by Year',
    xlabel='Year',
    ylabel='Score (0–100)',
)
plt.show()

## 8. Genre Co-occurrence Analysis

Count and tabulate how often pairs of genres co-occur in the same title.

In [None]:
from collections import Counter

# Tally every unordered genre pair that appears on the same title. Genres are
# de-duplicated and sorted per title, so each pair is keyed as (smaller, larger).
pairs = Counter()
for cell in df['listed_in'].fillna(''):
    labels = {piece.strip() for piece in str(cell).split(',')}
    labels.discard('')
    genres = sorted(labels)
    for pos, first in enumerate(genres):
        for second in genres[pos + 1:]:
            pairs[(first, second)] += 1

# One row per pair, then keep the ten most frequent.
cooc_records = [(first, second, n) for (first, second), n in pairs.items()]
cooc_df = pd.DataFrame(cooc_records, columns=['genre1', 'genre2', 'count'])
top_cooc = cooc_df.sort_values('count', ascending=False).head(10)
print(top_cooc)

## 9. Simple Recommendations Function

Define a function to recommend similar titles based on type, genre, country, rating, year, and duration similarity.

In [None]:
def parse_duration_minutes(s):
    """Extract a duration in minutes from a string such as '90 min'.

    Args:
        s: Any value; it is converted with ``str`` before inspection.

    Returns:
        The integer formed by all digit characters in the text when the text
        contains 'min'; ``None`` when it does not contain 'min' or when the
        digits cannot be parsed (for example, there are none).
    """
    text = str(s)
    if 'min' not in text:
        return None
    digits = ''.join(ch for ch in text if ch.isdigit())
    try:
        return int(digits)
    except ValueError:
        # Only parsing failures are expected here; anything else (for example
        # KeyboardInterrupt) should propagate rather than be swallowed.
        return None

def recs_for_title(title, limit=5, data=None):
    """Recommend titles similar to `title` using a simple additive score.

    Scoring per candidate:
      * +3   same type as the base title
      * +2   per shared genre ('listed_in')
      * +1.5 per shared country
      * +1   same rating (only when both ratings are present)
      * +2 / +1 when release years differ by <= 2 / <= 5
      * +1   when both are Movies and durations differ by <= 20 minutes

    Missing genre, country, or rating values never count as a match; two
    titles that both lack a country are not "from the same country".

    Args:
        title: Title to base recommendations on (matched case-insensitively).
        limit: Maximum number of recommendations to return.
        data: Optional DataFrame to search; defaults to the notebook-level `df`.

    Returns:
        A DataFrame with columns title, type, rating, release_year, listed_in,
        country and score, sorted by descending score (ties keep their original
        order). An empty DataFrame is returned when `title` is not found.
    """
    frame = df if data is None else data
    wanted = title.lower()
    lowered_titles = frame['title'].str.lower()

    matches = frame[lowered_titles == wanted].head(1)
    if matches.empty:
        return pd.DataFrame()
    base = matches.iloc[0]

    def labels(value):
        """Set of stripped comma-separated labels; empty for missing values."""
        if pd.isna(value):
            return set()
        return {part.strip() for part in str(value).split(',') if part.strip()}

    # Base-title features are loop-invariant, so compute them once.
    base_genres = labels(base['listed_in'])
    base_countries = labels(base['country'])
    base_rating = base['rating']
    base_minutes = (
        parse_duration_minutes(base['duration']) if base['type'] == 'Movie' else None
    )

    def score(row):
        sc = 0
        if row['type'] == base['type']:
            sc += 3
        sc += 2 * len(base_genres & labels(row['listed_in']))
        sc += 1.5 * len(base_countries & labels(row['country']))
        # Rating: require both sides to be present before comparing.
        if (
            pd.notna(base_rating)
            and pd.notna(row['rating'])
            and str(row['rating']) == str(base_rating)
        ):
            sc += 1
        # Year proximity: a missing or non-numeric year simply earns no points.
        try:
            ydiff = abs(int(row['release_year']) - int(base['release_year']))
        except (TypeError, ValueError):
            ydiff = None
        if ydiff is not None:
            if ydiff <= 2:
                sc += 2
            elif ydiff <= 5:
                sc += 1
        # Duration proximity (movies only).
        if base_minutes is not None and row['type'] == 'Movie':
            row_minutes = parse_duration_minutes(row['duration'])
            if row_minutes is not None and abs(base_minutes - row_minutes) <= 20:
                sc += 1
        return sc

    cand = frame[lowered_titles != wanted].copy()
    # A list comprehension (rather than DataFrame.apply) also works when there
    # are no candidates at all.
    cand['score'] = [score(row) for _, row in cand.iterrows()]
    # Stable sort keeps tied candidates in their original order.
    ranked = cand.sort_values('score', ascending=False, kind='mergesort')
    return ranked.head(limit)[['title','type','rating','release_year','listed_in','country','score']]

# Demo: recommend for the first non-missing title in the dataset.
example_title = df['title'].dropna().iloc[0]
example_recs = recs_for_title(example_title, limit=5)
print(example_recs)

## 10. Export Analytics Report

Generate and export a text report summarizing KPIs, top countries, genres, and ratings.

In [None]:
def generate_report(summary_dict):
    lines = []
    lines.append('NETFLIX CONTENT ANALYTICS REPORT')
    lines.append(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
    lines.append('='*60)
    lines.append('')
    lines.append('EXECUTIVE SUMMARY')
    lines.append('-'*60)
    lines.append(f"Total Content Items: {summary_dict['total_content']}")
    lines.append(f"Movies: {summary_dict['total_movies']} ({(summary_dict['total_movies']/max(1,summary_dict['total_content'])*100):.1f}%)")
    lines.append(f"TV Shows: {summary_dict['total_tv_shows']} ({(summary_dict['total_tv_shows']/max(1,summary_dict['total_content'])*100):.1f}%)")
    lines.append(f"Content Period: {summary_dict['year_range'][0]} - {summary_dict['year_range'][1]}")
    lines.append('')
    lines.append('TOP CONTENT PRODUCERS')
    lines.append('-'*60)
    for i, (c, n) in enumerate(summary_dict['top_countries'], start=1):
        lines.append(f"{i}. {c:<20} {n} titles")
    lines.append('')
    lines.append('TOP GENRES')
    lines.append('-'*60)
    for i, (g, n) in enumerate(summary_dict['top_genres'], start=1):
        lines.append(f"{i}. {g:<20} {n} titles")
    lines.append('')
    lines.append('CONTENT RATINGS (Top 10)')
    lines.append('-'*60)
    rc = pd.Series(summary_dict['rating_distribution']).sort_values(ascending=False).head(10)
    for i, (r, n) in enumerate(rc.items(), start=1):
        lines.append(f"{i}. {r:<10} {n} titles")
    lines.append('')
    return '\n'.join(lines)

# Render the report, write it next to the notebook, then echo the output path
# and the first 12 lines as a quick preview.
report_path = 'netflix-analytics-report.txt'
report_text = generate_report(summary)
with open(report_path, 'w', encoding='utf-8') as handle:
    handle.write(report_text)

preview_lines = report_text.splitlines()[:12]
print(report_path)
print('\n'.join(preview_lines))