In [6]:

import numpy as np
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn import set_config
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import shap
import joblib


# Load dataset
np.random.seed(42)
n_samples = 30000

ages = np.random.randint(18, 60, size=n_samples)
genders = np.random.choice(['Male', 'Female', 'Other'], size=n_samples)
goals = np.random.choice(['Weight Loss', 'Muscle Gain', 'Flexibility', 'General Fitness', 'Stress Relief'], size=n_samples)
experiences = np.random.choice(['Beginner', 'Intermediate', 'Advanced'], size=n_samples)
hours = np.random.randint(1, 15, size=n_samples)
workout_types = np.random.choice(['Zumba', 'Yoga', 'HIIT', 'Mix'], size=n_samples)
timings = np.random.choice(['Morning', 'Evening', 'Flexible'], size=n_samples)
budgets = np.random.choice(['Low', 'Medium', 'High'], size=n_samples)

# Logic to assign Recommended Plan
plans = []
for i in range(n_samples):
    if experiences[i] == 'Beginner' and hours[i] <= 4:
        plans.append('Monthly')
    elif budgets[i] == 'High' and experiences[i] == 'Advanced' and hours[i] >= 8:
        plans.append('Yearly')
    elif budgets[i] == 'Medium' and 4 < hours[i] < 8:
        plans.append('Quarterly')
    else:
        plans.append(np.random.choice(['Monthly', 'Quarterly', 'Yearly']))

data = pd.DataFrame({
    'Age': ages,
    'Gender': genders,
    'Fitness Goal': goals,
    'Workout Experience': experiences,
    'Hours/Week': hours,
    'Workout Type': workout_types,
    'Timing': timings,
    'Budget': budgets,
    'Recommended Plan': plans
})





In [7]:
# Features and target
X = data.drop('Recommended Plan', axis=1)
y = data['Recommended Plan']

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

numeric_features = ['Age', 'Hours/Week']
categorical_ordinal_features = ['Workout Experience', 'Budget']
categorical_nominal_features = ['Gender', 'Fitness Goal', 'Workout Type', 'Timing']

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('ordinal', OrdinalEncoder(categories=[['Beginner', 'Intermediate', 'Advanced'], ['Low', 'Medium', 'High']]))
])

nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_features),
    ('ord', ordinal_pipeline, categorical_ordinal_features),
    ('nom', nominal_pipeline, categorical_nominal_features)
])


In [9]:
data['AgeGroup'] = pd.cut(data['Age'], bins=[17, 25, 35, 45, 60], labels=['18-25', '26-35', '36-45', '46-60'])


In [18]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_classif, k=13)),
    ('classifier', RandomForestClassifier(random_state=42))
])
# Train
pipe.fit(X_train, y_train)



NameError: name 'X_train' is not defined

In [None]:
param_dist = {
    'feature_selection__k': [10, 13, 15],
    'classifier__n_estimators': randint(100, 300),
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': randint(2, 10),
    'classifier__min_samples_leaf': randint(1, 5)
}


In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

random_search.fit(X_train, y_train)
# Use best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluation
print("Best Parameters Found by RandomizedSearchCV:")
print(random_search.best_params_)

print("\nAccuracy on Test Set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



In [None]:
# STEP 4: SHAP Explanation
import shap

# Use the *best trained* classifier from the fitted pipeline
# The fitted model is random_search.best_estimator_
best_classifier = random_search.best_estimator_.named_steps['classifier']

# Transform the X_test data using the preprocessor from the best fitted pipeline
X_test_transformed = random_search.best_estimator_.named_steps['preprocessor'].transform(X_test)


# Initialize SHAP explainer for tree-based model
explainer = shap.TreeExplainer(best_classifier)
# shap_values = explainer.shap_values(X_test_transformed) # This will return a list of arrays for multi-class classification
shap_values = explainer.shap_values(X_test_transformed)


# STEP 5: SHAP Plots
# For multi-class classification, shap_values is a list.
# We can plot the SHAP values for a specific class, e.g., class 0
# Or plot summary for all classes (will average across classes or require specifying class)
# Let's plot the summary plot for all classes which is common
shap.summary_plot(shap_values, X_test_transformed, plot_type="bar")
# For the dot plot, it often looks better if we plot for a specific class
# Let's choose class 0 for demonstration, but you might want to iterate or choose based on context
shap.summary_plot(shap_values, X_test_transformed)

In [None]:

joblib.dump(pipe, 'best_model.pkl')



In [None]:
def predict_plan(input_dict):
    """
    Predicts the recommended fitness plan based on input features.
    
    Parameters:
    - input_dict (dict): Dictionary containing input values for the following keys:
        'Age', 'Gender', 'Fitness Goal', 'Workout Experience', 'Hours/Week',
        'Workout Type', 'Timing', 'Budget'
        
    Returns:
    - str: Predicted plan (e.g., 'Monthly', 'Quarterly', 'Yearly')
    """
    input_df = pd.DataFrame([input_dict])
    return best_model.predict(input_df)[0]


In [None]:
sample_input = {
    'Age':45 ,
    'Gender': 'Male',
    'Fitness Goal': 'General Fitness',
    'Workout Experience': 'Beginner',
    'Hours/Week': 15,
    'Workout Type': 'HIIT',
    'Timing': 'Flexible',
    'Budget': 'High'
}

print(predict_plan(sample_input))
