<a href="https://colab.research.google.com/github/natashamuthoni2333/Age-Gender-Detect-OpenCv/blob/main/Netflix%20Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Step 1: Import Libraries
# Every third-party package the notebook relies on is imported in this cell;
# nothing is installed here, so the packages must already be present in the runtime.
# NOTE(review): `np`, `px`, `stb` and `LabelEncoder` are not referenced in the
# cells visible in this review -- confirm they are needed before keeping them.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sidetable as stb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import warnings
# Silences every warning category for the rest of the session, which also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')


In [16]:
# Step 2: Load Data
import io

from google.colab import files

# Upload the dataset file (choose netflix_titles_CLEANED.csv when prompted)
uploaded = files.upload()

# A cancelled dialog yields an empty mapping; fail with a clear message
# instead of an opaque error further down.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Parse the bytes handed back by the upload widget instead of a hard-coded
# path. Colab saves the upload under a de-duplicated name when a file with
# the same name already exists (e.g. "... (1).csv" -> "... (1) (2).csv"),
# so a fixed filename can silently load a stale copy from an earlier upload.
filename, payload = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(payload))

# Display basic info
print("Dataset Shape:", df.shape)
df.head()

Saving netflix_titles_CLEANED (1).csv to netflix_titles_CLEANED (1) (2).csv
Dataset Shape: (8807, 12)


Unnamed: 0,show_id,type,title,directors,cast,countries,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# Step 2: Load Data
import io

from google.colab import files

# Upload the dataset (choose netflix_titles_CLEANED (1).csv when prompted)
uploaded = files.upload()

# A cancelled dialog yields an empty mapping; report that explicitly rather
# than failing with an IndexError when looking up the first key.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Take the first uploaded entry: its name (for the status message) and its
# raw bytes (for parsing).
filename, payload = next(iter(uploaded.items()))

# Read straight from the uploaded bytes. Colab may store the file on disk
# under a de-duplicated name, so opening `filename` from the working
# directory could pick up an older copy with the same name.
df = pd.read_csv(io.BytesIO(payload))

# Display basic info
print(f"✅ File '{filename}' uploaded successfully!")
print("Dataset Shape:", df.shape)
df.head()


In [None]:
# 3.1 Pie Chart: Movie vs TV Show
# Count titles per content type once so wedge sizes and wedge labels come
# from the same Series and cannot fall out of step.
type_counts = df['type'].value_counts()

plt.figure(figsize=(6,6))
_, _, pct_texts = plt.pie(
    type_counts,
    labels=type_counts.index.tolist(),  # labels follow the data order, not a fixed list
    autopct='%1.1f%%',
    startangle=90,
    colors=['#E50914', '#000000'],
)

# The default percentage text is black, which is unreadable on the black
# wedge; white is legible on both the red and the black wedge.
for pct_text in pct_texts:
    pct_text.set_color('white')

plt.title('Content Type Distribution')
plt.ylabel('')
plt.show()


In [None]:
# 3.2 Top 10 Countries (Bar Plot)
# A title can list several comma-separated countries: split each entry,
# give every country its own row, trim whitespace, then keep the ten most
# frequent names.
countries_per_title = df['countries'].dropna().str.split(',')
country_names = countries_per_title.explode().str.strip()
country_counts = country_names.value_counts().head(10)

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(
    x=country_counts.values,
    y=country_counts.index,
    palette='Reds_r',
    ax=ax,
)

ax.set_title('Top 10 Countries by Content Volume')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Countries')
fig.tight_layout()
plt.show()


In [None]:
# 3.3 Top 10 Genres (Bar Plot)
# `listed_in` holds a comma-separated genre list per title: split it, give
# every genre its own row, trim whitespace, then keep the ten most frequent.
genres_per_title = df['listed_in'].dropna().str.split(',')
genre_names = genres_per_title.explode().str.strip()
genre_counts = genre_names.value_counts().head(10)

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(
    x=genre_counts.values,
    y=genre_counts.index,
    palette='Reds_r',
    ax=ax,
)

ax.set_title('Top 10 Genres')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Genres')
fig.tight_layout()
plt.show()


In [None]:
# Step 4: Preprocessing for Clustering

# Combine text features into a single column (missing values become empty
# strings so the concatenation never produces NaN).
df['features'] = df['listed_in'].fillna('') + ' ' + df['type'].fillna('')

# Vectorize using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(df['features'])

# Optional: Add numerical features (release year, duration as numeric)
# - The pattern is a raw string: '\d' inside a normal literal is an invalid
#   escape sequence and triggers a SyntaxWarning on recent Python versions.
# - expand=False returns a Series (not a one-column DataFrame), which is the
#   natural shape for a single-column assignment.
df['duration_num'] = df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
# Unparseable years are coerced to NaN and then replaced by 0.
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce').fillna(0)

# Stack TF-IDF + numerical (optional)
# Here we'll use only TF-IDF for simplicity


In [None]:
# Step 5: Clustering with K-Means

k = 5  # number of clusters

# n_init is set explicitly: its default changed between scikit-learn
# releases, so leaving it implicit makes the cluster assignments depend on
# the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

# fit_predict fits the model and returns the label of every row in one call.
df['Cluster'] = kmeans.fit_predict(X)
print("✅ Clustering complete. Number of clusters:", k)


In [None]:
# Step 6: Visualize Clusters (PCA 2D)
# Project the TF-IDF matrix onto two principal components for plotting.
# PCA is given a dense array here, hence the .toarray() conversion.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())

plt.figure(figsize=(8,6))
cluster_points = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                             c=df['Cluster'], cmap='tab10', alpha=0.7)
plt.title('Netflix Content Clusters (PCA Projection)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
# Cluster ids are categories, not a continuous quantity: a discrete legend
# with one entry per cluster replaces the colorbar, which would show
# fractional tick values between cluster ids.
plt.legend(*cluster_points.legend_elements(), title='Cluster')
plt.show()


In [None]:
# Step 7: Analyze Clusters

# Top genres per cluster
print("Top 3 Genres per Cluster\n")

# Short label per cluster, built from its most frequent genres while the
# per-cluster report is printed.
cluster_labels = {}

for cluster in sorted(df['Cluster'].unique()):
    cluster_data = df[df['Cluster'] == cluster]
    # Split the comma-separated genre lists, one genre per row, and count.
    top_genres = cluster_data['listed_in'].dropna().str.split(',').explode().str.strip().value_counts().head(3)
    print(f"Cluster {cluster} top genres:")
    for g, v in top_genres.items():
        print(f"  {g}: {v}")
    print("")
    # Use the two most frequent genres as the label; fall back to a
    # placeholder when the cluster has no genre information at all.
    cluster_labels[cluster] = " & ".join(top_genres.index[:2]) or "(no genre data)"

# Interpretation derived from the data above. Cluster numbering is arbitrary
# and changes with the model settings, so hand-written labels would go stale.
for cluster, label in cluster_labels.items():
    print(f"Cluster {cluster}: {label}")


In [None]:
# Step 8: Cluster Distribution (Pie Chart)

# Number of titles per cluster, ordered by cluster id so wedges, labels and
# colours line up.
cluster_counts = df['Cluster'].value_counts().sort_index()
wedge_labels = [f'Cluster {i}' for i in cluster_counts.index]
wedge_colors = sns.color_palette('Set3', len(cluster_counts))

fig, ax = plt.subplots(figsize=(7,7))
ax.pie(
    cluster_counts,
    labels=wedge_labels,
    colors=wedge_colors,
    autopct='%1.1f%%',
    startangle=90,
)

ax.set_title('Cluster Size Distribution')
ax.set_ylabel('')
plt.show()
