In [12]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

TARGET_COLUMN = 'Life_expectancy'

df = pd.read_csv('../data/data.csv')

# Snapshots taken from the raw frame: the untouched Region labels and the
# target as a plain NumPy array.
region_column = df['Region']
y_original = df[TARGET_COLUMN].to_numpy()

# Country is high-cardinality (193 unique values), so it is mapped to integer
# codes rather than expanded into one indicator column per country.
label_encoder = LabelEncoder()
country_codes = label_encoder.fit_transform(df['Country'])
df['Country'] = country_codes

# Region is low-cardinality (6 unique values), so it is one-hot encoded;
# drop_first=True drops the first level's indicator column.
df_encoded = pd.get_dummies(df, columns=['Region'], drop_first=True)

# Feature matrix = everything except the target column.
y = df_encoded[TARGET_COLUMN]
X = df_encoded.drop(columns=TARGET_COLUMN)

# 80/20 split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Install PyQt5 into the kernel's environment (%pip targets the running kernel).
# NOTE(review): presumably required by the `%matplotlib qt` backend selected in
# the next cell — confirm, and consider pinning a version for reproducibility.
%pip install pyqt5

In [1]:
import matplotlib.pyplot as plt
# Switch Matplotlib to the Qt GUI backend: figures open in a separate window
# instead of inline. NOTE(review): presumably chosen so the 3D scatter plots
# below can be rotated interactively — confirm.
%matplotlib qt

In [22]:
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import matplotlib.cm as cm

# DBSCAN clustering of the standardized features, visualized in 3D via PCA.
# Relies on `plt` (matplotlib.pyplot) imported in an earlier cell.

# Load the data
data = pd.read_csv('../data/data.csv')

# Select features relevant to socioeconomic and health conditions
features = [
    'Infant_deaths', 'Under_five_deaths', 'Adult_mortality', 'Alcohol_consumption',
    'Hepatitis_B', 'Measles', 'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV',
    'GDP_per_capita', 'Population_mln', 'Thinness_ten_nineteen_years', 'Thinness_five_nine_years',
    'Schooling',
]
X = data[features]

# Standardize the features so every column contributes on the same scale to
# the Euclidean distances DBSCAN uses.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply DBSCAN with initial parameters
dbscan = DBSCAN(eps=2, min_samples=10)
clusters = dbscan.fit_predict(X_scaled)

# DBSCAN labels noise points -1; every other label is a real cluster.
cluster_labels = np.unique(clusters)  # sorted, so legend order is deterministic
n_outliers = int(np.count_nonzero(clusters == -1))
n_clusters = len(cluster_labels) - (1 if n_outliers else 0)

# Use PCA to reduce the data to 3 dimensions for visualization only; the
# clustering itself was done on the full standardized feature space.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Plot the clusters in 3D
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# 13 colors sampled from 'tab20'. plt.get_cmap replaces matplotlib.cm.get_cmap,
# which is deprecated and removed in newer Matplotlib releases.
# Cluster ids beyond 12 wrap around and reuse colors (see `% 13` below).
colormap = plt.get_cmap('tab20', 13)
colors = [colormap(i) for i in range(13)]

for cluster in cluster_labels:
    mask = clusters == cluster
    if cluster == -1:
        # Plot outliers in red with a black edge so they stand out
        ax.scatter(X_pca[mask, 0], X_pca[mask, 1], X_pca[mask, 2],
                   color='red', label='Outliers', s=20, edgecolor='k')
    else:
        # Plot each cluster with its own color from the colormap
        ax.scatter(X_pca[mask, 0], X_pca[mask, 1], X_pca[mask, 2],
                   color=colors[cluster % 13], label=f'Cluster {cluster}', s=20)

# Set plot title and legend
ax.set_title(f'DBSCAN Clustering\nClusters: {n_clusters}, Outliers: {n_outliers}')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_zlabel('PCA Component 3')
ax.legend()
plt.show()


  colormap = cm.get_cmap('tab20', 13)  # Get a colormap with 13 distinct colors


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Per-cluster feature summary for the DBSCAN result.
# Requires `data`, `features` and `clusters` from the DBSCAN cell.

# Attach the DBSCAN labels to the original rows (-1 marks outliers).
data['Cluster'] = clusters

# Min-max scaler; it is only applied to the outlier rows further down.
scaler = MinMaxScaler()

# Rows that belong to a real cluster (outliers excluded).
# NOTE: despite the historical name, no per-cluster min-max scaling is applied
# (that step was disabled) — these are the raw feature values. Taking an
# explicit copy via .loc avoids pandas' SettingWithCopyWarning and replaces the
# previous concat-in-a-loop; rows keep their original order.
normalized_data = data.loc[data['Cluster'] != -1, features + ['Cluster']].copy()

# Mean of each feature within each cluster ...
cluster_summary_scaled = normalized_data.groupby('Cluster')[features].mean()

# ... expressed relative to the largest cluster mean of that feature, so the
# cluster with the highest mean gets 1.0 in each column.
cluster_summary_scaled = cluster_summary_scaled / cluster_summary_scaled.max()

# print amount per cluster (in order of first appearance, outliers included)
cluster_sizes = data['Cluster'].value_counts()
for cluster in data['Cluster'].unique():
    print(f"Cluster {cluster}: {cluster_sizes[cluster]}")

# Mean of each min-max scaled feature over the outlier rows, for comparison.
# Unlike the cluster table above, this scaling is relative to the outliers' own
# min/max, so the two summaries are not on the same scale.
outliers_data = data.loc[data['Cluster'] == -1, features]
if outliers_data.empty:
    # MinMaxScaler cannot be fit on zero rows; fall back to an empty frame so
    # the summary below is all-NaN instead of raising.
    outliers_data_scaled = pd.DataFrame(columns=features, dtype=float)
else:
    outliers_data_scaled = pd.DataFrame(scaler.fit_transform(outliers_data), columns=features)
outliers_summary_scaled = outliers_data_scaled.mean()

# Display the normalized cluster summary
print("Cluster Feature Scales (0-1):")
print(cluster_summary_scaled)


Cluster 0: 2600
Cluster 1: 13
Cluster -1: 167
Cluster 2: 15
Cluster 4: 16
Cluster 3: 22
Cluster 7: 10
Cluster 6: 10
Cluster 5: 11
Cluster Feature Scales (0-1):
         Infant_deaths  Under_five_deaths  Adult_mortality  \
Cluster                                                      
0             0.379709           0.374674         0.326320   
1             0.738928           0.695668         0.369327   
2             0.828007           0.945958         0.986887   
3             0.974706           0.913115         0.407186   
4             0.233729           0.205671         0.176612   
5             1.000000           1.000000         1.000000   
6             0.406887           0.355002         0.335091   
7             0.352755           0.321374         0.378023   

         Alcohol_consumption  Hepatitis_B   Measles       BMI     Polio  \
Cluster                                                                   
0                   0.782339     0.936529  0.825888  0.965240  0.9410

In [19]:
# Show the per-feature means of the min-max scaled outlier rows, preceded by a
# blank line and a heading.
print("\nOutlier Feature Scales (0-1):", outliers_summary_scaled, sep="\n")


Outlier Feature Scales (0-1):
Infant_deaths                  0.437404
Under_five_deaths              0.423660
Adult_mortality                0.377162
Alcohol_consumption            0.307967
Hepatitis_B                    0.603420
Measles                        0.533728
BMI                            0.281038
Polio                          0.619793
Diphtheria                     0.541302
Incidents_HIV                  0.162012
GDP_per_capita                 0.071300
Population_mln                 0.056010
Thinness_ten_nineteen_years    0.233691
Thinness_five_nine_years       0.248394
Schooling                      0.425209
dtype: float64


In [11]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the per-cluster feature summary: one row per cluster, one column
# per feature, each cell annotated with its value to two decimals.
plt.figure(figsize=(20, 10))
heatmap_ax = sns.heatmap(
    cluster_summary_scaled,
    annot=True,
    fmt=".2f",
    cmap="PuRd",
)
heatmap_ax.set_title("Mean Feature Values by Cluster")
heatmap_ax.set_xlabel("Features")
heatmap_ax.set_ylabel("Clusters")
plt.show()


In [37]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# K-Means (k=2) on a 3-component PCA projection of the standardized features,
# shown as a 3D scatter colored by cluster label.

# Reload the dataset from disk
data = pd.read_csv('../data/data.csv')

# Socioeconomic and health indicators used for clustering
features = [
    # mortality
    'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
    # lifestyle / immunization / disease
    'Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI', 'Polio',
    'Diphtheria', 'Incidents_HIV',
    # economy / demographics
    'GDP_per_capita', 'Population_mln',
    # nutrition / education
    'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling',
]
X = data.loc[:, features]

# Zero-mean / unit-variance scaling before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto the first three principal components
pca = PCA(n_components=3)
pca_result = pca.fit_transform(X_scaled)

# Cluster in the reduced space; the fitted model keeps the labels
kmeans = KMeans(n_clusters=2, random_state=42).fit(pca_result)
labels = kmeans.labels_

# One 3D axes on a 12x10 figure
fig, ax = plt.subplots(figsize=(12, 10), subplot_kw={'projection': '3d'})

# One point per row, colored by its cluster label
pc1, pc2, pc3 = pca_result.T
sc = ax.scatter(pc1, pc2, pc3, c=labels, cmap='viridis', s=50)

# Title and axis labels in a single call
ax.set(
    title='3D K-Means Clustering (PCA Projection)',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    zlabel='Principal Component 3',
)

# Color bar mapping colors back to cluster labels
plt.colorbar(sc, label='Cluster Label')

plt.show()


In [27]:
# Elbow method for choosing k: fit K-Means for a range of k and look for the
# point where the within-cluster sum of squares (inertia) stops dropping sharply.
#
# The sweep runs on `pca_result` — the same standardized, PCA-projected data the
# K-Means cell above clusters — rather than on `df_encoded`, which is unscaled
# and also contains the Life_expectancy target and the label-encoded Country,
# so its inertia curve would not describe the clustering being tuned.
sse = []  # Measures how tightly the clusters fit around their centers
k_range = range(2, 31)

for k in k_range:
    # Separate name so the fitted k=2 `kmeans` model from the cell above
    # is not overwritten by the last model of this sweep.
    elbow_kmeans = KMeans(n_clusters=k, random_state=42)
    elbow_kmeans.fit(pca_result)
    sse.append(elbow_kmeans.inertia_)  # Inertia is the within-cluster sum of squares

# Plot SSE against k
plt.figure(figsize=(8, 6))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('SSE (Sum of Squared Errors)')
plt.title('Elbow Method for Optimal k')
plt.show()


In [38]:
from sklearn.metrics import silhouette_score
# Mean silhouette coefficient of the current `labels`, measured in the
# standardized feature space (`X_scaled`). Any label vector aligned with
# X_scaled (K-Means or DBSCAN) can be passed here.
silhouette_avg = silhouette_score(
    X=X_scaled,
    labels=labels,
)
print(f"Silhouette Score: {silhouette_avg}")


Silhouette Score: 0.39140837027877745
