# Step 0: Import Libraries

In [23]:
import pandas as pd
import numpy as np

# Step 1: Load Raw Dataset

In [24]:
# Read the raw CSV into a DataFrame (relative path, resolved against the current working directory).
df = pd.read_csv(filepath_or_buffer="../data/exoplanets.csv")

In [25]:
# Confirm the load, then summarise the frame's dimensions and column names.
print("✅ Raw dataset loaded!")
print("Shape:", df.shape)
print("Columns:", list(df.columns))
# Last expression of the cell: rich display of the first five rows.
df.head(n=5)

✅ Raw dataset loaded!
Shape: (4855, 12)
Columns: ['Name', 'Mass (MJ)', 'Radius (RJ)', 'Period (days)', 'Semi-major axis (AU)', 'Temp. (K)', 'Discovery method', 'Disc. Year', 'Distance (ly)', 'Host star mass (M☉)', 'Host star temp. (K)', 'Remarks']


Unnamed: 0,Name,Mass (MJ),Radius (RJ),Period (days),Semi-major axis (AU),Temp. (K),Discovery method,Disc. Year,Distance (ly),Host star mass (M☉),Host star temp. (K),Remarks
0,16 Cygni Bb,2.38,,799.5,1.66,,radial vel.,1996.0,68.99,1.04,5750,
1,23 Librae b,1.61,,258.18,0.81,,radial vel.,1999.0,85.46,1.07,5736,
2,47 Ursae Majoris b,2.53,,1078.0,2.1,,radial vel.,1996.0,45.02,1.08,5892,Proper name Taphao Thong
3,51 Pegasi b,0.46,,4.230785,0.0527,,radial vel.,1995.0,50.45,1.12,5793,Proper name Dimidium; previously informally na...
4,55 Cancri b,0.8306,,14.65152,0.115227,700.0,radial vel.,1996.0,41.06,0.905,5196,Proper name Galileo


# Step 2: Rename Columns for Consistency

In [26]:
# Map the raw CSV headers to snake_case identifiers.
# Headers that are not present in the frame are left untouched by rename().
df = df.rename(
    mapper={
        # planet properties
        "Name": "name",
        "Mass (MJ)": "mass_mj",
        "Radius (RJ)": "radius_rj",
        "Period (days)": "period_days",
        "Semi-major axis (AU)": "semi_major_axis_au",
        "Temp. (K)": "temp_k",
        "Distance (ly)": "distance_ly",
        # host star properties (the ☉ symbol is dropped from the new name)
        "Host star mass (M☉)": "star_mass_solar",
        "Host star temp. (K)": "star_temp_k",
        # discovery metadata
        "Discovery method": "discovery_method",
        "Disc. Year": "disc_year",
        "Remarks": "remarks",
    },
    axis="columns",
)

In [27]:
# Status message only; it is printed unconditionally and does not verify the rename.
print("✅ Columns renamed!")

✅ Columns renamed!


# Step 3: Unit Conversion

In [28]:
# Derive Earth-relative columns from the Jupiter-relative ones.
# Entries that cannot be parsed as numbers become NaN (errors="coerce").
# Scale factors used: 317.8 for mass (MJ -> Earth masses), 11.21 for radius (RJ -> Earth radii).
df["mass_earth"] = pd.to_numeric(df["mass_mj"], errors="coerce").mul(317.8)
df["radius_earth"] = pd.to_numeric(df["radius_rj"], errors="coerce").mul(11.21)


In [29]:
# Status message only; printed unconditionally when the cell runs.
print("✅ Units converted to Earth Mass & Radius")

✅ Units converted to Earth Mass & Radius


# Step 4: Handle Missing Values

In [30]:
# Columns that are coerced to numeric and median-filled in the next step.
num_cols = [
    # planet
    "mass_earth",
    "radius_earth",
    "period_days",
    "semi_major_axis_au",
    "temp_k",
    "distance_ly",
    # host star
    "star_mass_solar",
    "star_temp_k",
]

In [31]:
# Convert numeric, drop rows with no key measurement, then fill missing.
#
# 1) Coerce every listed column first so unparseable entries become NaN.
for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# 2) Drop planets that have none of mass / radius / semi-major axis.
#    This must happen BEFORE imputation: once medians are filled in, no row is
#    all-NaN in these columns any more, so a dropna(how="all") that runs after
#    the fill cannot remove the rows it is meant to remove.
df = df.dropna(
    subset=["mass_earth", "radius_earth", "semi_major_axis_au"], how="all"
).copy()  # independent frame, so the assignments below never target a view

# 3) Impute the remaining gaps with each column's median.
#    (If a column is entirely NaN its median is NaN and the fill is a no-op.)
for col in num_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

In [32]:
# Categorical fill: replace missing categories with the column's most frequent value.
cat_cols = ["discovery_method"]
for col in cat_cols:
    if col in df.columns:
        modes = df[col].mode()
        # mode() returns an empty Series when the column has no non-null
        # values; indexing its first element would raise, so skip the fill
        # and leave such a column unchanged.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

In [33]:
# Remove rows in which mass, radius and semi-major axis are all NaN.
# NOTE(review): by this point these columns have been median-filled in the
# numeric-fill cell (In [31]), so this filter can only match when all three
# columns were entirely NaN. The printed row count (4855) equals the raw shape.
df = df.dropna(
    how="all",
    subset=["mass_earth", "radius_earth", "semi_major_axis_au"],
)

In [34]:
# Status message only; printed unconditionally when the cell runs.
print("✅ Missing values handled!")

✅ Missing values handled!


# Step 5: Habitability Scoring Function

In [35]:
def habitability_score(row):
    """Score one planet row from 0 to 100 by its closeness to reference values.

    Five sub-scores are computed, each as
    ``max(0, 1 - |value - ideal| / tolerance)``: 1 at the ideal value, falling
    linearly to 0 once the value is a full tolerance away. A missing
    (NaN/None) value contributes a sub-score of 0. The result is the plain
    mean of the five sub-scores, multiplied by 100.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide ``temp_k``, ``semi_major_axis_au``, ``radius_earth``,
        ``mass_earth`` and ``star_mass_solar``.

    Returns
    -------
    float
        Habitability percentage in the range [0, 100].

    Raises
    ------
    KeyError
        If ``row`` lacks one of the required keys.
    """
    # (key, ideal value, tolerance) — evaluated in this order.
    criteria = (
        ("temp_k", 288, 288),          # temperature, ideal 288 K
        ("semi_major_axis_au", 1, 1),  # orbital distance, ideal 1 AU
        ("radius_earth", 1, 5),        # radius, ideal 1 Earth radius
        ("mass_earth", 1, 10),         # mass, ideal 1 Earth mass
        ("star_mass_solar", 1, 1),     # host star mass, ideal 1 solar mass
    )

    partials = []
    for key, ideal, tolerance in criteria:
        value = row[key]
        if pd.isna(value):
            # Missing measurement: contributes nothing to the average.
            partials.append(0)
            continue
        partials.append(max(0, 1 - abs(value - ideal) / tolerance))

    # Average of the sub-scores, expressed as a percentage.
    return np.mean(partials) * 100

In [36]:
# Score every planet: axis="columns" hands each row to the function as a Series.
df["habitability_score"] = df.apply(habitability_score, axis="columns")

In [37]:
# Binary label: 1 when the score is at least 50, otherwise 0.
df["habitability_label"] = df["habitability_score"].ge(50).astype(int)

In [38]:
# Status message only; printed unconditionally when the cell runs.
print("✅ Habitability score calculated!")

✅ Habitability score calculated!


In [39]:
# Additional status message for the scoring step; performs no computation.
print("✅ Continuous habitability scoring complete!")

✅ Continuous habitability scoring complete!


# Step 6: Save Processed Dataset

In [41]:
# Write the processed frame to CSV; index=False keeps the row index out of the file.
output_path = "../data/exoplanets_processed.csv"
df.to_csv(path_or_buf=output_path, index=False)

In [42]:
# Final summary: output location, row count, and number of rows labelled 1.
print("✅ Preprocessing complete!")
print("Saved at:", output_path)
print("Total planets:", df.shape[0])
# The label column holds 0/1 values, so its sum is the count of 1-labels.
print("Potentially habitable planets:", df["habitability_label"].sum())

✅ Preprocessing complete!
Saved at: ../data/exoplanets_processed.csv
Total planets: 4855
Potentially habitable planets: 106


In [43]:
# Preview the first 10 rows (in frame order, not ranked by score) with score and label.
df.loc[:, ["name", "habitability_score", "habitability_label"]].head(10)

Unnamed: 0,name,habitability_score,habitability_label
0,16 Cygni Bb,40.71812,0
1,23 Librae b,49.51812,0
2,47 Ursae Majoris b,33.11812,0
3,51 Pegasi b,33.37212,0
4,55 Cancri b,35.12266,0
5,70 Virginis b,42.53812,0
6,109 Piscium b,32.51812,0
7,Gamma Cephei Ab,26.71812,0
8,Gliese 86 b,33.51812,0
9,Gliese 876 b,25.28446,0
