In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage

In [2]:


POPULATION_FILE = 'data/cities_population.csv'
ECONOMICS_FILE = 'data/cities_economics.csv'
CLIMATE_FILE = 'data/cities_climate.csv'
COST_SAFETY_FILE = 'data/cities_cost_safety.csv'
ADDITIONAL_FILE = 'data/cities_additional_features.csv'

# Desired number of clusters for K-Means
DESIRED_CLUSTERS = 5

# Columns to use for clustering 
FEATURES = [
    'population_density', 'gdp_per_capita', 'unemployment_rate',
    'cost_of_living_index', 'safety_index', 'avg_temperature',
    'annual_precipitation', 'public_transit_score', 'cultural_index',
    'healthcare_index', 'education_index', 'pollution_index', 'green_space_percent'
]
#  Load and Merge Data
print("Loading data...")
population_df = pd.read_csv(POPULATION_FILE)
economics_df = pd.read_csv(ECONOMICS_FILE)
climate_df = pd.read_csv(CLIMATE_FILE)
cost_safety_df = pd.read_csv(COST_SAFETY_FILE)
additional_df = pd.read_csv(ADDITIONAL_FILE)

# Merge all into one DataFrame
df = population_df.merge(economics_df, on='city', how='left')
df = df.merge(climate_df, on='city', how='left')
df = df.merge(cost_safety_df, on='city', how='left')
df = df.merge(additional_df, on='city', how='left')

print(f"Data loaded. Total cities: {len(df)}")


Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'data/cities_economics.csv'

In [None]:
# Handle Missing Data
print("Handling missing data via KNN imputation...")
numeric_cols = df.select_dtypes(include=[np.number]).columns
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])


# Outlier Handling (Winsorization)

print("Winsorizing outliers at 5th and 95th percentiles...")
for col in numeric_cols:
    lower_bound = np.percentile(df[col], 5)
    upper_bound = np.percentile(df[col], 95)
    df[col] = np.clip(df[col], lower_bound, upper_bound)

In [3]:
# Check for Missing Features

missing_features = [f for f in FEATURES if f not in df.columns]
if missing_features:
    raise ValueError(f"Missing required features for clustering: {missing_features}")

X = df[FEATURES].copy()


# Standardize Features

print("Standardizing features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

NameError: name 'df' is not defined

In [None]:
#Determine Optimal k with Elbow Method

print("Determining optimal number of clusters using the elbow method...")
inertias = []
K_range = range(2, 11)
for k in K_range:
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_temp.fit(X_scaled)
    inertias.append(kmeans_temp.inertia_)

plt.figure(figsize=(8,5))
plt.plot(K_range, inertias, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.savefig('elbow_plot.png')
plt.close()
print("Elbow plot saved as elbow_plot.png")

# If you want to adjust based on the elbow plot, change DESIRED_CLUSTERS above.

In [None]:

#  K-Means Clustering

print(f"Running K-Means with k={DESIRED_CLUSTERS}...")
kmeans = KMeans(n_clusters=DESIRED_CLUSTERS, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

cluster_centers = kmeans.cluster_centers_
cluster_centers_inv = scaler.inverse_transform(cluster_centers)
centroids_df = pd.DataFrame(cluster_centers_inv, columns=FEATURES)
centroids_df['cluster'] = range(DESIRED_CLUSTERS)

print("Cluster centroids:")
print(centroids_df)

In [None]:

# Hierarchical Clustering

print("Performing hierarchical clustering for comparison...")
Z = linkage(X_scaled, method='ward')  # Compute the linkage matrix for dendrogram
plt.figure(figsize=(10, 5))
dendrogram(Z, truncate_mode='level', p=5)
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('Sample index or (cluster size)')
plt.ylabel('Distance')
plt.savefig('hierarchical_dendrogram.png')
plt.close()
print("Hierarchical dendrogram saved as hierarchical_dendrogram.png")

In [None]:
# DBSCAN Clustering

print("Performing DBSCAN for comparison...")
dbscan = DBSCAN(eps=2.0, min_samples=5)  # Adjust eps as needed
dbscan_labels = dbscan.fit_predict(X_scaled)
df['dbscan_cluster'] = dbscan_labels
unique_dbscan = np.unique(dbscan_labels)
print(f"DBSCAN resulted in clusters: {unique_dbscan}")

# Note: Many points may be labeled as -1 (noise)

In [None]:

# PCA for Visualization

print("Performing PCA for visualization...")
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
df['pca1'] = X_pca[:, 0]
df['pca2'] = X_pca[:, 1]

plt.figure(figsize=(10,7))
sns.scatterplot(data=df, x='pca1', y='pca2', hue='cluster', palette='Set2')
plt.title('PCA Visualization of K-Means Clusters')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('clusters_pca_plot.png')
plt.close()
print("PCA cluster plot saved as clusters_pca_plot.png")

In [None]:

# Save Results

output_file = 'city_clusters_results.csv'
df.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

print("All analysis steps completed successfully!")
