In [7]:
import pandas as pd
from sklearn.cluster import KMeans , AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering


from sklearn.manifold import TSNE

In [8]:
df = pd.read_csv('synthetic_youtube_data_nested_full.csv')

In [9]:
df.head(20)


Unnamed: 0,User_ID,Normalized_Content_Watched,Main_Category_Watched,Sub_Category_Watched
0,1,0.112625,Sports,Basketball
1,2,0.383299,Entertainment,Music Videos
2,3,0.92005,Entertainment,Drama
3,4,0.732725,Entertainment,Comedy
4,5,0.914604,Education,Lectures
5,6,0.013328,Gaming,Role-Playing Games
6,7,0.787867,Sports,Soccer
7,8,0.58318,Entertainment,Music Videos
8,9,0.207058,Entertainment,Music Videos
9,10,0.568909,Gaming,eSports


In [10]:
#1.clustering using K-Means

In [12]:
X = df[['Normalized_Content_Watched']]
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(X)

df['kmeans_cluster'] = kmeans.labels_

In [13]:
# Normalize data using StandardScaler
scaler = StandardScaler()
df[['Normalized_Content_Watched']] = scaler.fit_transform(df[['Normalized_Content_Watched']])

In [14]:
print(df)

      User_ID  Normalized_Content_Watched Main_Category_Watched  \
0           1                   -1.356806                Sports   
1           2                   -0.418846         Entertainment   
2           3                    1.441145         Entertainment   
3           4                    0.792011         Entertainment   
4           5                    1.422271             Education   
...       ...                         ...                   ...   
9995     9996                    0.163595         Entertainment   
9996     9997                    1.343899         Entertainment   
9997     9998                   -1.300834                Gaming   
9998     9999                   -0.907617                Gaming   
9999    10000                    0.080511                  News   

     Sub_Category_Watched  cluster  kmeans_cluster  
0              Basketball        0               0  
1            Music Videos        2               2  
2                   Drama        1  

In [15]:
# Create the StandardScaler object
scaler = StandardScaler()

In [16]:
# 2. Hierarchical Clustering (Agglomerative Clustering)
X_scaled = scaler.fit_transform(df[['Normalized_Content_Watched']])  # Normalize data
ward_cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')  # Ward's method
df['ward_cluster'] = ward_cluster.fit_predict(X_scaled)

In [18]:
df.head(20)

Unnamed: 0,User_ID,Normalized_Content_Watched,Main_Category_Watched,Sub_Category_Watched,cluster,kmeans_cluster,ward_cluster
0,1,-1.356806,Sports,Basketball,0,0,0
1,2,-0.418846,Entertainment,Music Videos,2,2,0
2,3,1.441145,Entertainment,Drama,1,1,2
3,4,0.792011,Entertainment,Comedy,1,1,1
4,5,1.422271,Education,Lectures,1,1,2
5,6,-1.700897,Gaming,Role-Playing Games,0,0,0
6,7,0.983092,Sports,Soccer,1,1,2
7,8,0.273797,Entertainment,Music Videos,2,2,1
8,9,-1.029568,Entertainment,Music Videos,0,0,0
9,10,0.224344,Gaming,eSports,2,2,1


In [19]:
# 3. DBSCAN (already implemented)
dbscan = DBSCAN(eps=0.2, min_samples=2)
df['dbscan_cluster'] = dbscan.fit_predict(X_scaled)


In [20]:
df.head(20)

Unnamed: 0,User_ID,Normalized_Content_Watched,Main_Category_Watched,Sub_Category_Watched,cluster,kmeans_cluster,ward_cluster,dbscan_cluster
0,1,-1.356806,Sports,Basketball,0,0,0,0
1,2,-0.418846,Entertainment,Music Videos,2,2,0,0
2,3,1.441145,Entertainment,Drama,1,1,2,0
3,4,0.792011,Entertainment,Comedy,1,1,1,0
4,5,1.422271,Education,Lectures,1,1,2,0
5,6,-1.700897,Gaming,Role-Playing Games,0,0,0,0
6,7,0.983092,Sports,Soccer,1,1,2,0
7,8,0.273797,Entertainment,Music Videos,2,2,1,0
8,9,-1.029568,Entertainment,Music Videos,0,0,0,0
9,10,0.224344,Gaming,eSports,2,2,1,0


In [21]:
# 4. Spectral Clustering

spectral = SpectralClustering(n_components=3, random_state=42)
df['spectral_cluster'] = spectral.fit_predict(X_scaled)

In [22]:
df.head(20)

Unnamed: 0,User_ID,Normalized_Content_Watched,Main_Category_Watched,Sub_Category_Watched,cluster,kmeans_cluster,ward_cluster,dbscan_cluster,spectral_cluster
0,1,-1.356806,Sports,Basketball,0,0,0,0,7
1,2,-0.418846,Entertainment,Music Videos,2,2,0,0,4
2,3,1.441145,Entertainment,Drama,1,1,2,0,0
3,4,0.792011,Entertainment,Comedy,1,1,1,0,2
4,5,1.422271,Education,Lectures,1,1,2,0,0
5,6,-1.700897,Gaming,Role-Playing Games,0,0,0,0,3
6,7,0.983092,Sports,Soccer,1,1,2,0,5
7,8,0.273797,Entertainment,Music Videos,2,2,1,0,6
8,9,-1.029568,Entertainment,Music Videos,0,0,0,0,7
9,10,0.224344,Gaming,eSports,2,2,1,0,6
