In [4]:
!pip install pandas matplotlib seaborn scikit-learn lightgbm plotly -U nbformat



In [None]:
# =========================
# Install necessary packages
# =========================
!pip install -q lightgbm plotly -U
!pip install -q scikit-learn nbformat

import pandas as pd
from sklearn.feature_selection import mutual_info_regression, RFE
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import plotly.express as px

# =========================
# 2Load dataset
df = pd.read_csv("C:/Users/pasu_ba/Downloads/exoplanet_data.csv")

print("Data shape:", df.shape)

# =========================
# Separate target & drop metadata/text columns
# =========================
target_col = 'pl_pnum'  # <-- CHANGE to your target
meta_cols = ['rowid', 'toi', 'toipfx', 'tid', 'ctoi_alias', 'toi_created', 'rowupdate']
text_cols = ['rastr', 'decstr']  # string columns

X = df.drop(columns=[target_col] + meta_cols + text_cols, errors='ignore')
y = df[target_col]

# =========================
# 4️⃣ Handle missing values
# =========================
# Numeric columns → fill NaN with 0
num_cols = X.select_dtypes(include=['int64','float64']).columns
X[num_cols] = X[num_cols].fillna(0)

# Categorical columns → fill NaN with 'missing'
cat_cols = X.select_dtypes(include=['object']).columns
X[cat_cols] = X[cat_cols].fillna('missing')

# Convert categorical columns to numeric
X = pd.get_dummies(X, drop_first=True)

# Final check
if X.isna().sum().sum() > 0:
    print("Still missing values in X. Dropping remaining rows...")
    X = X.dropna()
    y = y[X.index]

print("Preprocessed X shape:", X.shape)

# =========================
# 5️⃣ Feature ranking
# =========================
# Mutual Info Regression
mi = mutual_info_regression(X, y, random_state=42)
mi_rank = pd.Series(mi, index=X.columns).rank(ascending=False)

# RFE with RandomForestRegressor
estimator = RandomForestRegressor(n_estimators=100, random_state=42)
rfe = RFE(estimator, n_features_to_select=1, step=1)
rfe.fit(X, y)
rfe_rank = pd.Series(rfe.ranking_, index=X.columns).rank(ascending=True)

# LightGBM feature importance
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X, y)
lgb_rank = pd.Series(lgb_model.feature_importances_, index=X.columns).rank(ascending=False)

# Combine rankings
rank_df = pd.DataFrame({
    "MI": mi_rank,
    "RFE": rfe_rank,
    "LGBM": lgb_rank
})
rank_df["mean_rank"] = rank_df.mean(axis=1)
rank_df = rank_df.sort_values("mean_rank")

print("Top 10 features:")
print(rank_df.head(10))

# =========================
# 6️⃣ Plot top features with Plotly
# =========================
top_n = 10
top_features = rank_df.head(top_n).reset_index()
top_features.rename(columns={'index':'Feature'}, inplace=True)

# Melt for grouped bar chart
df_melted = top_features.melt(
    id_vars='Feature',
    value_vars=['MI','RFE','LGBM'],
    var_name='Method',
    value_name='Rank'
)

fig = px.bar(
    df_melted,
    x='Feature',
    y='Rank',
    color='Method',
    barmode='group',
    text='Rank',
    title=f'Top {top_n} Features by MI, RFE, and LGBM Rankings'
)
fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

# Overlay mean rank as a line
fig.add_scatter(
    x=top_features['Feature'],
    y=top_features['mean_rank'],
    mode='lines+markers',
    name='Mean Rank',
    line=dict(color='black', width=3, dash='dash'),
    marker=dict(symbol='circle', size=10)
)

fig.update_layout(yaxis=dict(autorange="reversed"))  # smaller rank = better
fig.show()


Data shape: (7703, 87)
Preprocessed X shape: (7703, 81)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9712
[LightGBM] [Info] Number of data points in the train set: 7703, number of used features: 47
[LightGBM] [Info] Start training from score 1.049331
Top 10 features:
               MI   RFE  LGBM  mean_rank
pl_rade       1.0   1.0   3.0   1.666667
pl_trandep    2.0   9.0   1.0   4.000000
pl_insol      9.0   7.0  11.0   9.000000
st_pmra      24.0   2.0   2.0   9.333333
st_logg       3.0  15.0  10.0   9.333333
st_dist       6.0  12.0  16.0  11.333333
pl_orbper    28.0   3.0   4.0  11.666667
pl_tranmid   30.0   6.0   7.0  14.333333
pl_radeerr1  12.0  24.0   8.0  14.666667
pl_eqt        8.0  21.0  18.0  15.666667
