In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [3]:
# 1. Load the dataset
# The path is written as a raw string literal: in a normal string the
# backslashes of this Windows path form invalid escape sequences ('\A', '\c'),
# which Python reports with a warning. The path value itself is unchanged.
# NOTE(review): this is an absolute, machine-specific location; point it at
# your own copy of the CSV before running elsewhere.
DATA_PATH = r'E:\AIML Tasks\city_lifestyle_dataset.csv'
df = pd.read_csv(DATA_PATH)

# 2. Feature Selection
# Only the numeric columns listed below are fed to the clustering model; the
# original notes say the categorical columns ('city_name', 'country') are left out.
# The full df is kept so cluster labels can be mapped back onto it later.
features = [
    'population_density', 'avg_income', 'internet_penetration',
    'avg_rent', 'air_quality_index', 'public_transport_score',
    'happiness_score', 'green_space_ratio'
]

X = df[features]

# 3. Preprocessing: Standardization
# K-Means is distance based, so features are standardized to keep columns with
# large raw magnitudes from dominating the distance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Same scaled values as a labelled DataFrame, for easier handling later (optional)
X_scaled_df = pd.DataFrame(X_scaled, columns=features)

print("Data loaded and scaled successfully.")
print(f"Feature shape: {X_scaled.shape}")

Data loaded and scaled successfully.
Feature shape: (300, 8)


  df = pd.read_csv('E:\AIML Tasks\city_lifestyle_dataset.csv')


In [8]:
# Fit K-Means once for every K in 1..10 and record the inertia of each fit
K_range = range(1, 11)
inertia = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Draw the elbow curve on an explicit figure/axes pair and write it to disk
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, inertia, marker='o', linestyle='--')
ax.set_title('Elbow Method to Determine Optimal K')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia (WCSS)')
ax.grid(True)
fig.savefig('elbow_curve.png')
# The figure is closed after saving, so the PNG is the only artifact of this cell
plt.close(fig)

In [5]:
# 1. Train the final model with the chosen number of clusters
optimal_k = 3  # Adjust based on the Elbow plot results
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
# The labels assigned during fitting are the per-row cluster ids
cluster_labels = kmeans.fit(X_scaled).labels_

# 2. Attach the cluster id of every row to the original (unscaled) data
df['Cluster'] = cluster_labels

# 3. Silhouette evaluation on the scaled features
# The score lies between -1 and 1; larger values are better.
sil_score = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score for K={optimal_k}: {sil_score:.3f}")

Silhouette Score for K=3: 0.356


In [9]:
# 1. Project the scaled features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# 2. Collect the 2-D coordinates and the cluster ids in one DataFrame
pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2']).assign(Cluster=cluster_labels)

# 3. Scatter the projected points, coloured by cluster, and save the figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='Cluster',
    palette='viridis',
    s=100,
    alpha=0.8,
    ax=ax,
)
ax.set_title('City Lifestyle Clusters (Visualized via PCA)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster')
fig.savefig('pca_clusters.png')
plt.close(fig)

In [10]:
# Per-cluster mean of every clustering feature (original, unscaled units)
cluster_summary = df.groupby('Cluster')[features].mean()

# Show the profile table under its heading
print("\nCluster Profiling (Average Values per Group):", cluster_summary, sep="\n")

# Heatmap of the profile: features as rows, clusters as columns
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(cluster_summary.T, annot=True, cmap='coolwarm', fmt='.1f', ax=ax)
ax.set_title('Feature Distribution by Cluster')
fig.savefig('cluster_heatmap.png')
plt.close(fig)


Cluster Profiling (Average Values per Group):
         population_density   avg_income  internet_penetration     avg_rent  \
Cluster                                                                       
0               2991.329412  1434.000000             55.301176   495.411765   
1               2367.335664  3804.055944             86.260140  1361.538462   
2               8203.625000  2531.805556             72.997222   889.166667   

         air_quality_index  public_transport_score  happiness_score  \
Cluster                                                               
0                78.517647               40.991765         4.965882   
1                53.762238               60.756643         8.153846   
2                97.388889               63.093056         5.626389   

         green_space_ratio  
Cluster                     
0                36.027059  
1                37.082517  
2                25.456944  
