In [1]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Number of KMeans clusters. Defined once so the model and the plotting loop
# below can never disagree about how many clusters exist.
N_CLUSTERS = 8

# TF-IDF Vectorization
# The tokenizer is the identity function, so each entry of 'lemmatized_genres'
# is handed to the vectorizer as-is (presumably an already-tokenized list of
# genre strings; verify against the cell that builds the column).
# token_pattern=None tells scikit-learn the default regex is intentionally
# unused, which silences the "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    lowercase=False,
    token_pattern=None,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df_artists['lemmatized_genres'])

# KMeans Clustering
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
kmeans_labels = kmeans.fit_predict(tfidf_matrix)
df_artists['kmeans_labels'] = kmeans_labels

# DBSCAN Clustering (cosine distance between TF-IDF rows)
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
dbscan_labels = dbscan.fit_predict(tfidf_matrix)
df_artists['dbscan_labels'] = dbscan_labels

# Visualization using Word Cloud for KMeans: one figure per cluster
for i in range(N_CLUSTERS):
    cluster_genres = df_artists[df_artists['kmeans_labels'] == i]['lemmatized_genres']
    all_genres_text = ' '.join([' '.join(genre) for genre in cluster_genres])

    # WordCloud.generate raises ValueError when given no words, so skip a
    # cluster that ended up without any genre text instead of aborting the loop.
    if not all_genres_text.strip():
        continue

    wordcloud = WordCloud(width=800, height=400).generate(all_genres_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Genres Word Cloud for KMeans Cluster {i}')
    plt.show()




NameError: name 'df_artists' is not defined

In [None]:
# Elbow method: fit one KMeans model per candidate k and record its inertia
# (the sum of squared distances from each point to its closest centroid).
K_range = range(1, 15)  # candidate cluster counts: 1 through 14
distortions = []

for k in K_range:
    # fit() returns the fitted estimator, so the model can be built and
    # trained in a single expression.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(tfidf_matrix)
    distortions.append(kmeanModel.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests a reasonable k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, distortions, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Ensure that the artist popularity and followers columns are numeric.
# errors='coerce' turns any value that cannot be parsed into NaN.
numeric_columns = ['artist_popularity', 'followers']
for column in numeric_columns:
    df_artists[column] = pd.to_numeric(df_artists[column], errors='coerce')

# Boolean mask of the rows that have a value in every numeric column.
valid_rows = df_artists[numeric_columns].notna().all(axis=1).to_numpy()

# tfidf_matrix was built row-for-row from df_artists. Dropping rows from the
# frame alone would leave the matrix longer than the frame, and any later
# `df_artists[...] = model.fit_predict(tfidf_matrix)` would fail with a length
# mismatch. Filter the matrix with the same mask while the two are still
# aligned; on a re-run the mask is all True, so this is a no-op.
if tfidf_matrix.shape[0] == len(df_artists):
    tfidf_matrix = tfidf_matrix[valid_rows]

# Drop NaN values from artist_popularity and followers columns
df_artists.dropna(subset=numeric_columns, inplace=True)

# Function to analyze clusters
def analyze_cluster(cluster_column, df=None):
    """Summarise artists per cluster label.

    Parameters
    ----------
    cluster_column : str
        Name of the column holding the cluster labels to group by
        (e.g. 'kmeans_labels' or 'dbscan_labels').
    df : pandas.DataFrame, optional
        Frame to analyse. It must contain ``cluster_column`` plus the columns
        'artist_popularity', 'followers', 'id' and 'cleaned_genres'.
        Defaults to the notebook-level ``df_artists``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster label with the columns ``cluster_column``,
        'Average_Popularity', 'Average_Followers', 'Unique_Artists' and
        'Unique_Genres'.
    """
    from itertools import chain

    if df is None:
        df = df_artists

    def count_unique_genres(genre_lists):
        # Flatten the per-artist genre collections lazily instead of
        # concatenating lists with Series.sum(), which is quadratic and
        # fails on missing entries. Missing entries contribute no genres.
        return len(set(chain.from_iterable(genre_lists.dropna())))

    # String aliases ('mean', 'nunique') select pandas' built-in group
    # reductions directly and avoid the deprecation warning raised when
    # numpy callables such as np.mean are passed to agg().
    cluster_analysis = df.groupby(cluster_column).agg(
        Average_Popularity=pd.NamedAgg(column='artist_popularity', aggfunc='mean'),
        Average_Followers=pd.NamedAgg(column='followers', aggfunc='mean'),
        Unique_Artists=pd.NamedAgg(column='id', aggfunc='nunique'),
        Unique_Genres=pd.NamedAgg(column='cleaned_genres', aggfunc=count_unique_genres),
    ).reset_index()

    return cluster_analysis

# Per-cluster summary table for the KMeans labelling
kmeans_analysis = analyze_cluster(cluster_column='kmeans_labels')
print("KMeans Cluster Analysis:\n", kmeans_analysis)

# Same summary, grouped by the DBSCAN labelling instead
dbscan_analysis = analyze_cluster(cluster_column='dbscan_labels')
print("\nDBSCAN Cluster Analysis:\n", dbscan_analysis)


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

# Finding the best epsilon based on silhouette score and number of clusters
best_epsilon = 0
best_score = -1
best_dbscan = None   # fitted model for the best epsilon found so far
best_labels = None   # labels produced by that model
epsilons = np.linspace(0.1, 1, 10)  # 10 different epsilon values between 0.1 and 1

for eps in epsilons:
    dbscan = DBSCAN(eps=eps, min_samples=5, metric='cosine')
    dbscan_labels = dbscan.fit_predict(tfidf_matrix)

    # DBSCAN marks noise points with the label -1. Exclude them so that noise
    # is neither counted as a cluster nor scored as one.
    clustered = dbscan_labels != -1
    num_clusters = len(set(dbscan_labels[clustered]))

    # silhouette_score needs at least 2 clusters and fewer clusters than samples.
    if num_clusters > 1 and num_clusters < clustered.sum():
        # Score with the same cosine distance DBSCAN used to form the clusters.
        score = silhouette_score(
            tfidf_matrix[clustered], dbscan_labels[clustered], metric='cosine'
        )
        if score > best_score:
            best_score = score
            best_epsilon = eps
            best_dbscan = dbscan
            best_labels = dbscan_labels

# If no epsilon produced at least two clusters, best_epsilon is still 0 and
# DBSCAN would reject it with an opaque parameter error; fail clearly instead.
if best_labels is None:
    raise ValueError(
        "No epsilon in the tested range produced at least two DBSCAN clusters; "
        "widen the epsilon range or lower min_samples."
    )

# Applying DBSCAN with the best epsilon. The model fitted during the search is
# reused; fitting again on the same data would only repeat the work.
dbscan = best_dbscan
dbscan_labels = best_labels
df_artists['dbscan_labels'] = dbscan_labels

# Visualization using Word Cloud for DBSCAN
max_label = max(dbscan_labels)
for i in range(-1, max_label+1):
    cluster_genres = df_artists[df_artists['dbscan_labels'] == i]['cleaned_genres']
    all_genres_text = ' '.join([' '.join(genre) for genre in cluster_genres])
    
    wordcloud = WordCloud(width=800, height=400).generate(all_genres_text)
    
    plt.figure(figsize=(10, 5))
