In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import shap
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score


In [22]:
# Read the prepared exoplanet table from the working directory
df = pd.read_csv(filepath_or_buffer="EXPOPLANET_TEST_DATA.csv")

In [23]:
# Predictor columns fed to the model, and the label column to predict
features = [
    'mass_multiplier',
    'radius_multiplier',
    'orbital_radius',
    'distance',
    'eccentricity',
]
X = df.loc[:, features]
y = df.loc[:, 'habitability_cluster']

In [24]:
# Partition the rows: class 0 on one side, every other class on the other
is_class_0 = df['habitability_cluster'].eq(0)
class_0 = df[is_class_0]
rest = df[~is_class_0]


In [25]:
# Features/labels of the non-zero classes, standardized and split 80/20.
# NOTE(review): the scaler is fitted on all of `rest` before the split, so the
# held-out rows contribute to the scaling statistics.
X_rest = rest[features]
y_rest = rest['habitability_cluster']

rest_scaler = StandardScaler()
X_rest_scaled = rest_scaler.fit(X_rest).transform(X_rest)

rest_split = train_test_split(X_rest_scaled, y_rest, test_size=0.2, random_state=42)
X_train_r, X_test_r, y_train_r, y_test_r = rest_split


In [26]:

# Scale class 0 with the same statistics that were used for the other classes.
# A scaler fitted on class 0 alone would centre those rows on their own mean
# (a single row would collapse to all zeros) and put them in a different
# feature space from X_rest_scaled, which they are stacked with afterwards.
# StandardScaler fitting is deterministic, so refitting on `rest` reproduces
# the parameters behind X_rest_scaled.
scaler = StandardScaler().fit(rest[features])
# Select the rows by label from class_0 itself; `.iloc` with index labels is
# only correct when the frame happens to have a default RangeIndex.
X_0_scaled = scaler.transform(class_0[features])

In [27]:
# Add class 0 back to training set
X_train = np.vstack([X_train_r, X_0_scaled])
y_train = pd.concat([y_train_r, class_0['habitability_cluster']], ignore_index=True)

In [28]:
# Add class 0 to test set as well
X_test = np.vstack([X_test_r, X_0_scaled])
y_test = pd.concat([y_test_r, class_0['habitability_cluster']], ignore_index=True)

In [29]:
# Fit a random forest on the training split, then label the test split
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_preds = rf.predict(X_test)

In [30]:
# Text summary of the test-set performance: raw confusion matrix, then the
# per-class precision/recall/F1 table
print("\n===== Random Forest Results =====")
rf_confusion = confusion_matrix(y_test, rf_preds)
print(rf_confusion)
rf_report = classification_report(y_test, rf_preds, zero_division=0)
print(rf_report)


===== Random Forest Results =====
[[3286    0]
 [   0  393]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3286
           1       1.00      1.00      1.00       393

    accuracy                           1.00      3679
   macro avg       1.00      1.00      1.00      3679
weighted avg       1.00      1.00      1.00      3679



In [31]:
# Create the output directory; the name must match the "plots/" prefix used by
# every savefig call in this notebook
os.makedirs("plots", exist_ok=True)

In [32]:
def save_conf_matrix(y_true, y_pred, title, filename):
    """Render a confusion matrix and save it as an image under ``plots/``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    title : str
        Title drawn above the matrix.
    filename : str
        File name (relative to ``plots/``) the figure is written to.
    """
    output_path = os.path.join("plots", filename)
    # Make sure the target directory exists so savefig cannot fail on a
    # missing folder, regardless of which cells ran before this one.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap="Blues")
    # Work on the display's own axes/figure instead of pyplot's implicit
    # "current figure", so unrelated open figures are never touched.
    disp.ax_.set_title(title)
    disp.figure_.savefig(output_path, bbox_inches='tight')
    plt.close(disp.figure_)


In [33]:
# Write the random forest's test-set confusion matrix to an image file
save_conf_matrix(
    y_true=y_test,
    y_pred=rf_preds,
    title="Random Forest Confusion Matrix",
    filename="rf_confusion_matrix.png",
)


In [34]:


# --- Save Feature Importance Plots ---

def save_feature_importance(model, features, title, filename):
    """Save a horizontal bar chart of a model's feature importances under ``plots/``.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing a ``feature_importances_`` attribute.
    features : sequence of str
        Feature names, in the same order as ``model.feature_importances_``.
    title : str
        Title drawn above the chart.
    filename : str
        File name (relative to ``plots/``) the figure is written to.
    """
    output_path = os.path.join("plots", filename)
    # Make sure the target directory exists so savefig cannot fail on a
    # missing folder, regardless of which cells ran before this one.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    importances = np.asarray(model.feature_importances_)
    # Ascending sort: barh draws bottom-up, so the most important feature
    # ends up at the top of the chart.
    sorted_idx = np.argsort(importances)
    positions = np.arange(len(sorted_idx))

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(positions, importances[sorted_idx], align='center')
    ax.set_yticks(positions)
    ax.set_yticklabels([features[i] for i in sorted_idx])
    ax.set_title(title)
    ax.set_xlabel("Importance Score")
    fig.tight_layout()
    fig.savefig(output_path, bbox_inches='tight')
    plt.close(fig)



In [35]:
# Wrap the test matrix in a DataFrame so each column carries its feature name
X_test_df = pd.DataFrame(data=X_test, columns=features)


In [36]:
# Build a SHAP explainer for the fitted random forest `rf`. The training
# matrix X_train is passed as the second positional argument (presumably used
# by shap as background data — confirm against the installed shap version).
explainer = shap.Explainer(rf, X_train)
# Evaluate the explainer on the named-column test frame; the result is what
# the following cells unpack and plot.
shap_values = explainer(X_test_df)




In [37]:
# Reduce the SHAP output to a 2-D (n_samples, n_features) array for the first
# output class, whatever container the installed shap version returned.
if isinstance(shap_values, list):
    # Legacy API: one entry per class; entries may be plain arrays, which
    # have no `.values` attribute.
    first_class = shap_values[0]
    shap_vals = np.asarray(getattr(first_class, 'values', first_class))
else:
    shap_vals = np.asarray(getattr(shap_values, 'values', shap_values))

# A multi-output explanation carries a trailing per-class axis; keep class 0
# so summary_plot receives one value per (sample, feature).
if shap_vals.ndim == 3:
    shap_vals = shap_vals[:, :, 0]

In [38]:
# Draw the SHAP summary without displaying it, title it, write it to disk and
# release the figure
shap_plot_path = "plots/shap_summary_rf.png"
shap.summary_plot(shap_vals, X_test_df, show=False)
plt.title("SHAP Summary Plot - Random Forest")
plt.savefig(shap_plot_path, bbox_inches='tight')
plt.close()

In [39]:

# Rebuild the full-dataset feature matrix and label vector for cross-validation
features = ['mass_multiplier', 'radius_multiplier', 'orbital_radius', 'distance', 'eccentricity']
X, y = df[features], df['habitability_cluster']

In [40]:
# Standardize every feature column of the full dataset
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [41]:
# Scorers reported by cross-validation: plain accuracy plus macro-averaged
# precision, recall and F1 (undefined ratios count as 0)
scoring = {'accuracy': make_scorer(accuracy_score)}
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    scoring[f'{metric_name}_macro'] = make_scorer(metric_fn, average='macro', zero_division=0)


In [42]:
# Shuffled, seeded, stratified splitter with five folds
cv_settings = dict(n_splits=5, shuffle=True, random_state=42)
cv = StratifiedKFold(**cv_settings)

In [43]:
# Cross-validate a fresh random forest and print the mean of each test metric
print("\n===== Random Forest Cross-Validation =====")
rf_cv_scores = cross_validate(
    RandomForestClassifier(random_state=42),
    X_scaled,
    y,
    cv=cv,
    scoring=scoring,
)
test_metrics = {name: values for name, values in rf_cv_scores.items() if 'test' in name}
for metric, scores in test_metrics.items():
    print(f"{metric}: {np.mean(scores):.4f}")


===== Random Forest Cross-Validation =====
test_accuracy: 0.9907
test_precision_macro: 0.9939
test_recall_macro: 0.9928
test_f1_macro: 0.9933


In [91]:
# Import required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd


In [92]:
# Column names used as model inputs, and the column holding the class label
features = "mass_multiplier radius_multiplier orbital_radius eccentricity distance".split()
X = df[features]
y = df['habitability_cluster']

In [93]:
# Train the model on the full dataset.
# The seed is fixed so predictions and probabilities are reproducible across
# kernel restarts, matching the seeded forests used earlier in the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)


In [94]:
# --- Fixed example input (optional) ---
# One hand-picked value per model feature
mass, radius, orbital_radius = 0.5, 0.3, 0.2
ecc, distance = 0.05, 32

In [95]:
# One-row frame holding the fixed example, keyed by feature name
fixed_record = {
    'mass_multiplier': mass,
    'radius_multiplier': radius,
    'orbital_radius': orbital_radius,
    'eccentricity': ecc,
    'distance': distance,
}
input_data = pd.DataFrame([fixed_record])

In [96]:
# Build the single-row input frame from the fixed example values
# (column-oriented construction: one length-1 list per feature)
input_data = pd.DataFrame({
    'mass_multiplier': [mass],
    'radius_multiplier': [radius],
    'orbital_radius': [orbital_radius],
    'eccentricity': [ecc],
    'distance': [distance],
})

In [97]:
# Classify the example row; the frame has one row, so keep the first label
predicted_classes = model.predict(input_data)
prediction = predicted_classes[0]

In [98]:
# Display names for class ids 0, 1 and 2 (position in the list = class id)
habitability_mapping = dict(enumerate([
    "Not Habitable",
    "Possibly Habitable",
    "Habitable",
]))

In [99]:
# Show the display name of the predicted class
predicted_name = habitability_mapping[prediction]
print("\n🌍 Predicted Habitability:", predicted_name)



🌍 Predicted Habitability: Possibly Habitable


In [101]:
# --- User Input Section ---
# Print the expected unit and range of each parameter, then prompt for one
# value per entry of `features`, in that order.
print("\n🌍 Enter exoplanet parameters (with suggested units and ranges):")
print(" - mass_multiplier (relative to Earth's mass, M⊕) [Range: 0.1 – 10.0, Earth = 1.0]")
print(" - radius_multiplier (relative to Earth's radius, R⊕) [Range: 0.1 – 2.5, Earth = 1.0]")
print(" - orbital_radius (in Astronomical Units, AU) [Range: 0.1 – 5.0, Earth = 1.0]")
# A circular orbit has eccentricity 0; 0.02 is approximately Earth's value.
print(" - eccentricity (dimensionless) [Range: 0.0 – 1.0, circular = 0.0, Earth ≈ 0.02]")
print(" - distance (in light-years, ly) [Range: 1 – 5000]")

user_input = [float(input(f"Enter {feat}: ")) for feat in features]




🌍 Enter exoplanet parameters (with suggested units and ranges):
 - mass_multiplier (relative to Earth's mass, M⊕) [Range: 0.1 – 10.0, Earth = 1.0]
 - radius_multiplier (relative to Earth's radius, R⊕) [Range: 0.1 – 2.5, Earth = 1.0]
 - orbital_radius (in Astronomical Units, AU) [Range: 0.1 – 5.0, Earth = 1.0]
 - eccentricity (dimensionless) [Range: 0.0 – 1.0, circular = 0.02]
 - distance (in light-years, ly) [Range: 1 – 5000]
Enter mass_multiplier: 0.2
Enter radius_multiplier: 0.8
Enter orbital_radius: 0.55
Enter eccentricity: 0.02
Enter distance: 500


In [102]:
# Wrap the entered values in a one-row frame whose columns match `features`
user_df = pd.DataFrame(data=[user_input], columns=features)

In [103]:
# --- Class label and per-class probabilities for the entered planet ---
# The model was trained on unscaled features, so the raw values go in directly
predicted_label = model.predict(user_df)[0]
class_fractions = model.predict_proba(user_df)[0]
predicted_probs = class_fractions * 100  # fractions -> percentages

In [104]:
# Map label
label_map = {
    0: "Non-Habitable",
    1: "Possibly Habitable",
    2: "Habitable"
}
label_name = label_map[predicted_label]
# predict_proba orders its columns like model.classes_, so look up the column
# of the predicted label instead of assuming label value == column position
# (that only holds when the classes are exactly 0..k-1).
class_position = list(model.classes_).index(predicted_label)
confidence = predicted_probs[class_position]

In [105]:
# --- Final Output ---
# Report the predicted class name and the model's probability for that class
print(f"\n🌍 Prediction: {label_name}")
print(f"📊 Probability prediction: {confidence:.2f}%")


🌍 Prediction: Non-Habitable
📊 Probability prediction: 97.00%


In [106]:
# Import required libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [107]:
# `df` must already be loaded by an earlier cell.
# Predictor columns and target column for the batch-prediction model
features = ['mass_multiplier', 'radius_multiplier', 'orbital_radius', 'eccentricity', 'distance']
target_column = 'habitability_cluster'
X, y = df[features], df[target_column]

In [108]:
# Train the model on the full dataset.
# The seed is fixed so the saved predictions and probabilities are
# reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

In [109]:
# Display names for the class ids 0, 1 and 2
label_map = dict(zip(
    (0, 1, 2),
    ("Non-Habitable", "Possibly Habitable", "Habitable"),
))

In [111]:
# Ask user to enter 9 exoplanet records; each one is classified immediately
# and collected in `results` as one dict per planet.
print("🌍 Please enter values for 9 exoplanets:")
print("Units & Ranges:")
print(" - mass_multiplier (0.1–10.0)")
print(" - radius_multiplier (0.1–2.5)")
print(" - orbital_radius (AU, 0.1–5.0)")
print(" - eccentricity (0.0–1.0)")
print(" - distance (light-years, 1–5000)")

results = []

# predict_proba orders its columns like model.classes_; resolve the column of
# a label through this lookup rather than assuming label value == position.
class_position = {cls: pos for pos, cls in enumerate(model.classes_)}

for i in range(1, 10):  # 1 to 9
    print(f"\n🔢 Enter data for Exoplanet #{i}:")
    user_input = [float(input(f"Enter {feat}: ")) for feat in features]

    input_df = pd.DataFrame([user_input], columns=features)
    label = model.predict(input_df)[0]
    probs = model.predict_proba(input_df)[0] * 100  # fractions -> percentages

    # Key the record by feature name so it stays aligned with `features`
    # instead of relying on hard-coded list positions.
    result = dict(zip(features, user_input))
    result["Predicted_Label"] = label_map[label]
    result["Probability (%)"] = round(probs[class_position[label]], 2)
    results.append(result)


🌍 Please enter values for 9 exoplanets:
Units & Ranges:
 - mass_multiplier (0.1–10.0)
 - radius_multiplier (0.1–2.5)
 - orbital_radius (AU, 0.1–5.0)
 - eccentricity (0.0–1.0)
 - distance (light-years, 1–5000)

🔢 Enter data for Exoplanet #1:
Enter mass_multiplier: 19.4
Enter radius_multiplier: 1.08
Enter orbital_radius: 1.29
Enter eccentricity: 0.23
Enter distance: 304

🔢 Enter data for Exoplanet #2:
Enter mass_multiplier: 14.74
Enter radius_multiplier: 1.09
Enter orbital_radius: 1.53
Enter eccentricity: 0.08
Enter distance: 409

🔢 Enter data for Exoplanet #3:
Enter mass_multiplier: 4.8
Enter radius_multiplier: 1.15
Enter orbital_radius: 0.83
Enter eccentricity: 0
Enter distance: 246

🔢 Enter data for Exoplanet #4:
Enter mass_multiplier: 8.13881
Enter radius_multiplier: 1.12
Enter orbital_radius: 2.773069
Enter eccentricity: 0.37
Enter distance: 58

🔢 Enter data for Exoplanet #5:
Enter mass_multiplier: 6.3
Enter radius_multiplier: 1.11
Enter orbital_radius: 7506
Enter eccentricity: 0
En

In [112]:
# Turn the collected prediction records into a table and persist it as CSV
# (without the row index)
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="user_habitability_predictions.csv", index=False)

print("\n✅ All predictions saved to 'user_habitability_predictions.csv'")


✅ All predictions saved to 'user_habitability_predictions.csv'
