In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np

In [13]:
# Step 0 (workaround): Patch threadpoolctl to prevent the OpenBLAS error
import threadpoolctl



In [14]:

class _NoOpThreadpoolLimits:
    """Context manager that does nothing on entry or exit."""

    def __enter__(self):
        # Hand back the manager itself so `with ... as ctx:` binds a usable
        # object instead of None.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # A falsy return means exceptions raised inside the `with` body
        # propagate to the caller; nothing is swallowed here.
        return False


def safe_threadpool_limits(*args, **kwargs):
    """Return a no-op context manager, ignoring every argument.

    Intended as a drop-in stand-in for ``threadpoolctl.threadpool_limits``:
    it accepts any positional/keyword arguments so existing call sites keep
    working, but it never inspects or limits any thread pool.

    Returns:
        A fresh ``_NoOpThreadpoolLimits`` instance usable in a ``with``
        statement.
    """
    return _NoOpThreadpoolLimits()

# Install the no-op replacement on the threadpoolctl module
setattr(threadpoolctl, "threadpool_limits", safe_threadpool_limits)

In [15]:
# Step 1: Read the cleaned dataset from its CSV file
cleaned_csv_path = "Cleaned_Exoplanet_Data.csv"
df = pd.read_csv(cleaned_csv_path)

In [16]:
# Step 2: Pick the columns that feed the clustering model
features = [
    'mass_multiplier',
    'radius_multiplier',
    'orbital_radius',
    'distance',
    'eccentricity',
]
X = df.loc[:, features]


In [17]:
# Step 3: Fit a StandardScaler on the selected features, then scale them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)




In [18]:
# Step 4: Cluster the scaled features with KMeans
# (three clusters, intended as the habitability classes)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_scaled)
labels = kmeans.labels_

In [19]:
# Step 4 (repeated): Re-fit KMeans (3 clusters for habitability classes).
# NOTE(review): this cell duplicates the previous one and overwrites both
# `kmeans` and `labels`; consider deleting one of the two cells.
# n_init is pinned to 10, matching the previous cell, so the labels that get
# saved do not depend on the scikit-learn version's default for n_init.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)




In [20]:
# Step 5: Store each row's cluster label as a new column on the dataset
cluster_column = 'habitability_cluster'
df[cluster_column] = labels

In [21]:
# Step 6: Write the labelled dataset to disk, leaving out the row index
labelled_csv_path = "Exoplanet_With_Labels_1.csv"
df.to_csv(labelled_csv_path, index=False)

In [22]:
# Optional: Count how many rows landed in each cluster and print the tally
cluster_counts = df['habitability_cluster'].value_counts()
print("Cluster distribution:\n", cluster_counts)

Cluster distribution:
 habitability_cluster
0    3285
1    1960
2       5
Name: count, dtype: int64
