Taking a look at the dataset and checking imbalances


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# Read the training set into a DataFrame
data = pd.read_csv('Train_Data.csv')

# Preview the first rows to get a feel for the columns
preview = data.head()
print(preview)


# Count the rows in each target class to reveal any imbalance
print("\n\nCheck Class imbalance")
target = data['Healthy']
class_distribution = target.value_counts()
print(class_distribution)

     ID1  Specific ailments    ID2 Food preference  Age        BMI Smoker?  \
0   2408                 44   2668             DX6   49  20.500470      NO   
1  25063                 39  10363        DX3 DX4    20  26.076580      NO   
2  26798                 29    132             DX6    1  21.420866      NO   
3  31907                 27  10499            DX1    30  25.203247      NO   
4  26412                  9   7963             DX6   40  19.355846     YES   

  Living in? Any heriditary condition?  Follow Diet  Physical activity  \
0      RURAL                    Stable          1.0                0.0   
1      URBAN                    Stable          0.0                0.0   
2      URBAN                    Stable          1.0                0.0   
3      RURAL                    Stable          1.0                0.0   
4      RURAL                    Stable          1.0                0.0   

   Regular sleeping hours  Alcohol consumption  Social interaction  \
0               

Checking the data types to see which columns must be encoded as integers


In [2]:
# Display the dtype of every column; the object-typed ones are the string columns.
data.dtypes

ID1                            int64
Specific ailments              int64
ID2                            int64
Food preference               object
Age                            int64
BMI                          float64
Smoker?                       object
Living in?                    object
Any heriditary condition?     object
Follow Diet                  float64
Physical activity            float64
Regular sleeping hours       float64
Alcohol consumption          float64
Social interaction           float64
Taking supplements           float64
Mental health management     float64
Illness count last year      float64
Healthy                        int64
dtype: object

In [3]:
# Convert the listed columns to integer codes, one column at a time.
# NOTE(review): per the dtypes listing, 'Specific ailments' is already int64 and
# 'Follow Diet' / 'Mental health management' are float64, yet they are encoded too;
# any NaN in them is presumably turned into its own label here, before imputation
# runs -- confirm this is intended.
categorical_cols = [
    'Specific ailments',
    'Food preference',
    'Smoker?',
    'Living in?',
    'Any heriditary condition?',
    'Follow Diet',
    'Mental health management',
]
label_encoder = LabelEncoder()
for col in categorical_cols:
    # fit_transform re-fits the shared encoder, so afterwards it only remembers the last column
    encoded_column = label_encoder.fit_transform(data[col])
    data[col] = encoded_column

After testing various imputation methods, the best result was obtained with mean imputation.


In [4]:
# Replace every remaining NaN with the mean of its own column
column_means = data.mean()
data = data.fillna(column_means)

In [5]:
# Separate the target vector from the feature matrix
y = data['Healthy']
X = data.drop(columns='Healthy')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Candidate classifiers, all with library-default settings.
# Each estimator appears once; AdaBoostClassifier used to be listed twice.
models = [
    RandomForestClassifier(),
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    MLPClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    XGBClassifier(),
]

# Parallel lists: f1_scores[i] is the held-out F1 score of the model named f1_scores_model[i]
f1_scores = []
f1_scores_model = []

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(model)
    print(classification_report(y_test, y_pred))

    # Record the score so the models can be ranked after the loop
    f1_scores.append(f1_score(y_test, y_pred))
    f1_scores_model.append(type(model).__name__)

# Summarize the comparison, best F1 score first
print("\nModels ranked by F1 score")
ranking = sorted(zip(f1_scores_model, f1_scores), key=lambda pair: pair[1], reverse=True)
for model_name, score in ranking:
    print(f"{model_name}: {score:.4f}")

RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.77      0.75      0.76      1903
           1       0.86      0.87      0.86      3281

    accuracy                           0.83      5184
   macro avg       0.81      0.81      0.81      5184
weighted avg       0.82      0.83      0.82      5184

LogisticRegression()
              precision    recall  f1-score   support

           0       0.55      0.08      0.14      1903
           1       0.64      0.96      0.77      3281

    accuracy                           0.64      5184
   macro avg       0.60      0.52      0.45      5184
weighted avg       0.61      0.64      0.54      5184



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC()
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1903
           1       0.63      1.00      0.78      3281

    accuracy                           0.63      5184
   macro avg       0.32      0.50      0.39      5184
weighted avg       0.40      0.63      0.49      5184

DecisionTreeClassifier()
              precision    recall  f1-score   support

           0       0.68      0.67      0.68      1903
           1       0.81      0.82      0.82      3281

    accuracy                           0.77      5184
   macro avg       0.75      0.75      0.75      5184
weighted avg       0.76      0.77      0.76      5184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.38      0.28      0.33      1903
           1       0.64      0.74      0.68      3281

    accuracy                           0.57      5184
   macro avg       0.51      0.51      0.51      5184
weighted avg       0.55      0.57      0.55      5184

GaussianNB()
              precision    recall  f1-score   support

           0       0.66      0.51      0.58      1903
           1       0.75      0.85      0.80      3281

    accuracy                           0.73      5184
   macro avg       0.71      0.68      0.69      5184
weighted avg       0.72      0.73      0.72      5184

MLPClassifier()
              precision    recall  f1-score   support

           0       0.38      0.93      0.54      1903
           1       0.75      0.13      0.22      3281

    accuracy                           0.42      5184
   macro avg       0.57      0.53      0.38      5184
weighted avg       0.6

After training multiple models, they were compared by F1-score and accuracy, in decreasing order.
The XGBoost classifier consistently performed better and hence was finalized.

After deciding on the model, its hyperparameters were tuned and tested to maximize accuracy.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned XGBoost hyperparameter
# (3 * 3 * 3 * 2 * 2 = 108 combinations, each fitted on 5 folds)
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],   # step size shrinkage
    'n_estimators': [100, 200, 300],      # number of boosting rounds
    'max_depth': [3, 4, 5],               # depth limit of each tree
    'subsample': [0.8, 1.0],              # fraction of rows sampled per tree
    'colsample_bytree': [0.8, 1.0],       # fraction of columns sampled per tree
}

# Exhaustive 5-fold cross-validated search on the training split, scored by accuracy
xgb_classifier = XGBClassifier()
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated accuracy
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: ", grid_search.best_score_)

Best Hyperparameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.8}
Best Accuracy Score:  0.8341048320880728


In [7]:
from sklearn.metrics import accuracy_score

# Hyperparameters of the final model.
# NOTE(review): these differ slightly from the grid-search output (colsample_bytree,
# learning_rate, added gamma) -- presumably hand-tuned afterwards; confirm.
final_params = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.12,
    'max_depth': 4,
    'n_estimators': 200,
    'subsample': 0.8,
    'gamma': 0.1,
}
model = XGBClassifier(**final_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the configuration, then accuracy, F1 and the per-class breakdown on the hold-out split
print(model)
for metric in (accuracy_score, f1_score):
    print(metric(y_test, y_pred))
print(classification_report(y_test, y_pred))


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.12, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=200, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
0.847608024691358
0.8809164908049443
              precision    recall  f1-score   support

           0       0.80      0.77      0.79      1903
           1       0.87      0.89      0.88      3281

    accuracy                           0.85      518

Read and predict for the test data


In [8]:
# Load the competition test set and keep only the model's feature columns, in training order
feature_columns = X_train.columns
test_data = pd.read_csv('Test_Data.csv')[feature_columns]

After testing various imputation methods, it was found that mean imputation gave the best results

In [9]:
categorical_cols = ['Specific ailments', 'Food preference', 'Smoker?', 'Living in?', 'Any heriditary condition?',
               'Follow Diet', 'Mental health management']

# Fit each encoder on the raw TRAINING column, not on the test column. An encoder fitted
# on the test file assigns different integer codes whenever the two files do not contain
# exactly the same set of values, and the model would then misread the test features.
train_categories = pd.read_csv('Train_Data.csv')[categorical_cols]

for col in categorical_cols:
    label_encoder = LabelEncoder().fit(train_categories[col])

    # Values never seen in training have no code; they become NaN and are imputed below
    known = test_data[col].isin(label_encoder.classes_)
    if known.all():
        test_data[col] = label_encoder.transform(test_data[col])
    else:
        encoded = pd.Series(np.nan, index=test_data.index)
        encoded[known] = label_encoder.transform(test_data.loc[known, col])
        test_data[col] = encoded

#Deal with nan values
# Impute with the training-set column means so the test set gets the same fill values
# as the training data (mean-filling `data` left its column means unchanged).
test_data = test_data.fillna(data[test_data.columns].mean())

Writing predictions to csv file

In [10]:
# Predict the target for every row of the prepared test set
y_test_pred = model.predict(test_data)

# Build a one-column integer frame and write it out without the index
submission = pd.DataFrame({'predictions': y_test_pred}).astype(int)
submission.to_csv('submission.csv', index=False)

After multiple iterations of the code (there was randomness involved due to the under-sampler), a maximum score of 34.9683 was obtained.

#Thank you for your time and consideration
#Peace