# 📚 Menopause Age Prediction Model Training

In [None]:
# -- Imports --
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# -- Load or Generate Dataset --
# This notebook expects a DataFrame named `extended_df` to already exist
# (it is built in the previous notebook).
# If it is not in memory, load it from file first, e.g.:
#   extended_df = pd.read_csv('path_to_file.csv')

In [None]:
# -- Prepare Features and Target --
# Target column to predict.
y = extended_df["Menopause_Age"]
# Features: every remaining column except "Name" and the target itself.
X = extended_df.drop(columns=["Name", "Menopause_Age"])

# Encode categorical features if any: each object-dtype column is cast to
# str and replaced by the integer codes a freshly fitted LabelEncoder
# assigns (one independent encoder per column).
text_columns = X.select_dtypes(include="object").columns
for column in text_columns:
    as_text = X[column].astype(str)
    X[column] = LabelEncoder().fit_transform(as_text)

# -- Split Data --
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# -- Define Models --
# Display name -> unfitted regressor. Estimators that take a seed all use
# the same one so repeated runs give the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

results = {}      # model name -> {"R2": ..., "MSE": ...} measured on the test split
importances = {}  # model name -> pd.Series of per-feature scores, indexed by X.columns

# -- Train and Evaluate Models --
for name, model in models.items():
    # fit() returns the fitted estimator, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
    results[name] = {"R2": r2, "MSE": mse}

    # Feature importance: tree-based models expose `feature_importances_`;
    # linear models expose `coef_`, whose absolute value is used instead.
    # A model with neither attribute gets no entry.
    if hasattr(model, "feature_importances_"):
        importance = pd.Series(model.feature_importances_, index=X.columns)
    elif hasattr(model, "coef_"):
        importance = pd.Series(model.coef_, index=X.columns).abs()
    else:
        continue
    importances[name] = importance

# -- Compare Model Performance --
# One row per model, one column per metric, highest R² first.
metrics_by_model = pd.DataFrame(results).transpose()
score_df = metrics_by_model.sort_values(by="R2", ascending=False)

In [None]:
# -- Model Comparison Plot --
# Bar chart of each model's R² on the test split, read from `score_df`
# (index = model name, column "R2").
plt.figure(figsize=(10, 6))
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the suggested replacement (hue=<x variable>, legend=False)
# needs that newer version -- confirm the installed seaborn before changing.
sns.barplot(x=score_df.index, y=score_df["R2"], palette="coolwarm")
plt.ylabel("R² Score (Accuracy)")
plt.title("Model Comparison Based on Accuracy (R²)")
# R² has no lower bound: a model that does worse than always predicting the
# mean scores below zero. A fixed (0, 1) range would clip such a bar out of
# view, so extend the lower limit to the smallest score when it is negative.
# When every score is >= 0 the limits are (0, 1).
lowest_r2 = float(score_df["R2"].min())
plt.ylim(min(0.0, lowest_r2), 1)
plt.xticks(rotation=15)
plt.grid(True)
plt.tight_layout()
plt.show()

# -- Feature Importance of Best Model --
# The first index label of `score_df` is taken as the best model; its
# importance scores are drawn from largest to smallest.
best_model_name = score_df.index[0]
ranked_importance = importances[best_model_name].sort_values(ascending=False)

# Same chart as the pyplot state-machine calls would draw, built through
# explicit Figure/Axes handles.
fig, ax = plt.subplots(figsize=(10, 6))
ranked_importance.plot(kind="bar", color="teal", ax=ax)
ax.set_title(f"Feature Importance - Best Model: {best_model_name}")
ax.set_ylabel("Importance Score")
ax.grid(True)
fig.tight_layout()
plt.show()