In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
import joblib
import os
import warnings
warnings.filterwarnings('ignore')


In [2]:
#Loading the processed data

train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')
symptom_cols = [i for i in train_df.columns if i !="prognosis"]
train_df.shape, test_df.shape

((4920, 48), (42, 48))

In [3]:
#Separating target variable

print(f"Symptom Columns ({len(symptom_cols)}):", symptom_cols)
X = train_df[symptom_cols]
y = train_df["prognosis"]
X_test = test_df[symptom_cols]
y_test = test_df["prognosis"]

Symptom Columns (47): ['itching', 'nodal_skin_eruptions', 'shivering', 'ulcers_on_tongue', 'burning_micturition', 'spotting_ urination', 'weight_gain', 'anxiety', 'mood_swings', 'weight_loss', 'patches_in_throat', 'irregular_sugar_level', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'dark_urine', 'nausea', 'mild_fever', 'yellow_urine', 'acute_liver_failure', 'swelling_of_stomach', 'swelled_lymph_nodes', 'weakness_in_limbs', 'pain_during_bowel_movements', 'swollen_legs', 'extra_marital_contacts', 'knee_pain', 'movement_stiffness', 'spinning_movements', 'foul_smell_of urine', 'passage_of_gases', 'muscle_pain', 'altered_sensorium', 'red_spots_over_body', 'belly_pain', 'dischromic _patches', 'watering_from_eyes', 'family_history', 'mucoid_sputum', 'rusty_sputum', 'lack_of_concentration', 'visual_disturbances', 'blackheads', 'scurring', 'skin_peeling', 'red_sore_around_nose']


In [4]:
#Splitting training data to check for validation and tuning, test data will be used for final testing

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)
print("Training Split Shape:", X_train.shape)
print("Validation Split Shape:", X_val.shape)

Training Split Shape: (3690, 47)
Validation Split Shape: (1230, 47)


In [5]:
#Defining models
from sklearn.metrics import accuracy_score, precision_score, recall_score
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
}

results = {}
for name, model in models.items():
    print(f"\nTraining {name} model...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    results[name] = classification_report(y_val, y_pred, output_dict=True)
    print("Results:")
    print("Accuracy Score:",accuracy_score(y_val, y_pred))
    print("Recall Score:", recall_score(y_val, y_pred, average='weighted'))
    print("Precision Score:", precision_score(y_val, y_pred, average='weighted'))
    print("_"*50)




Training LogisticRegression model...
Results:
Accuracy Score: 0.975609756097561
Recall Score: 0.975609756097561
Precision Score: 0.9832242669554937
__________________________________________________

Training RandomForest model...
Results:
Accuracy Score: 0.9747967479674797
Recall Score: 0.9747967479674797
Precision Score: 0.9824771536682256
__________________________________________________

Training GradientBoosting model...
Results:
Accuracy Score: 0.9747967479674797
Recall Score: 0.9747967479674797
Precision Score: 0.9824771536682256
__________________________________________________

Training XGBoost model...
Results:
Accuracy Score: 0.9747967479674797
Recall Score: 0.9747967479674797
Precision Score: 0.9824771536682256
__________________________________________________


## Models Used
### Logistic Regression
- Baseline model for a classification problem
### Random Forest 
- Robust and tree based model to handle dataset with large number of correlated features
### Gradient Boosting
- Sequential model to correct errors sequentially and potential for highly accurate predictions
### XG Boost
- Efficient and an advanced model to test for higher accuracy


In [None]:
#Hyper Parameter tuning 

models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'param_grid': {'C': [0.01, 0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']}
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5]
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss'),
        'param_grid': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    }
}


In [7]:
results = {}
best_models = {}
best_scores = {}

for name, config in models.items():
    print(f"\nTuning {name}...")
    grid_search = GridSearchCV(
        config['model'], config['param_grid'], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    best_scores[name] = grid_search.best_score_
    print(f"Best {name} Parameters:", grid_search.best_params_)
    print(f"Best {name} CV Accuracy:", grid_search.best_score_)
    
    
    # Evaluate on validation set
    y_pred = best_models[name].predict(X_val)
    results[name] = classification_report(y_val, y_pred, output_dict=True)
    print(f"\nValidation Results for {name}:")
    print("Accuracy Score:",accuracy_score(y_val, y_pred))
    print("Recall Score:", recall_score(y_val, y_pred, average='macro'))
    print("Precision Score:", precision_score(y_val, y_pred, average='macro'))
    print("_"*100)



Tuning LogisticRegression...
Best LogisticRegression Parameters: {'C': 1, 'solver': 'lbfgs'}
Best LogisticRegression CV Accuracy: 0.9745257452574526

Validation Results for LogisticRegression:
Accuracy Score: 0.975609756097561
Recall Score: 0.9757738714078027
Precision Score: 0.981443209359826
____________________________________________________________________________________________________

Tuning RandomForest...
Best RandomForest Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best RandomForest CV Accuracy: 0.9742547425474255

Validation Results for RandomForest:
Accuracy Score: 0.9747967479674797
Recall Score: 0.9743097611742709
Precision Score: 0.9814096518009858
____________________________________________________________________________________________________

Tuning XGBoost...
Best XGBoost Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best XGBoost CV Accuracy: 0.9739837398373984

Validation Results for XGBoost:
Accuracy 

- Out of all the trained models, the logistic regression model has given highest accuracy in both testing and cross validation.
- Gradient boosting is taking too much time to tune and doesn't have any improvement over other so in the final code we will remove this model

In [8]:
#Selecting best model
best_model_name = max(best_scores,key=best_scores.get)
best_model = best_models[best_model_name]
print(f"\nSelected '{best_model_name}' with Test Accuracy: {best_scores[best_model_name]:.4f}")


Selected 'LogisticRegression' with Test Accuracy: 0.9745


In [9]:
#Fitting the best model and now testing on test data
best_model.fit(X,y) #fitting the model on the train data provided
y_test_pred = best_model.predict(X_test)

In [10]:
#Accuracy Scores for test result

print(f"\nTest Set Results for {best_model_name}:")
print("Accuracy Score:", accuracy_score(y_test, y_test_pred))
print("Recall Score:", recall_score(y_test, y_test_pred, average="macro"))
print("Precision Score:", precision_score(y_test, y_test_pred, average="macro"))
print(classification_report(y_test, y_test_pred))


Test Set Results for LogisticRegression:
Accuracy Score: 0.9761904761904762
Recall Score: 0.9878048780487805
Precision Score: 0.9878048780487805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00

In [11]:
#Saving the predictions

test_predictions = pd.DataFrame({
    'prognosis_encoded': y_test_pred,
    'prognosis': joblib.load('../models/label_encoder.pkl').inverse_transform(y_test_pred)
})
test_predictions.to_csv('../predictions.csv', index=False)
print("Test predictions saved to ../predictions.csv")

Test predictions saved to ../predictions.csv


In [12]:
#Saving the best model

os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/trained_model.pkl')
print(f"Best model ({best_model_name}) saved to ../models/trained_model.pkl")

Best model (LogisticRegression) saved to ../models/trained_model.pkl
