In [159]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [160]:
# Step 2: Load the training and testing datasets
train_data = pd.read_csv('Train_Data (2).csv')
test_data = pd.read_csv('Test_data (1).csv')

In [161]:
train_data.head()

Unnamed: 0,ID1,Specific ailments,ID2,Food preference,Age,BMI,Smoker?,Living in?,Any heriditary condition?,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,Taking supplements,Mental health management,Illness count last year,Healthy
0,2408,44,2668,DX6,49,20.50047,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,25063,39,10363,DX3 DX4,20,26.07658,NO,URBAN,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
2,26798,29,132,DX6,1,21.420866,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,31907,27,10499,DX1,30,25.203247,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,26412,9,7963,DX6,40,19.355846,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [162]:
train_data.shape

(25920, 18)

In [163]:
features = train_data.drop('Healthy', axis=1)
target = train_data['Healthy']

In [164]:
features.isnull().sum()

ID1                            0
Specific ailments              0
ID2                            0
Food preference               10
Age                            0
BMI                            0
Smoker?                        0
Living in?                     0
Any heriditary condition?      0
Follow Diet                  973
Physical activity            973
Regular sleeping hours       973
Alcohol consumption          973
Social interaction           973
Taking supplements           973
Mental health management     973
Illness count last year      973
dtype: int64

In [165]:
target.isnull().sum()

0

In [166]:
from sklearn.impute import SimpleImputer
# Columns with missing values
columns_with_missing = ['Follow Diet', 'Physical activity', 'Regular sleeping hours',
                        'Alcohol consumption', 'Social interaction', 'Taking supplements',
                        'Mental health management', 'Illness count last year']

# Impute missing values
imputer = SimpleImputer(strategy='mean')
features[columns_with_missing] = imputer.fit_transform(features[columns_with_missing])

# Verify that there are no more missing values
print(features.isnull().sum())
features.shape

ID1                           0
Specific ailments             0
ID2                           0
Food preference              10
Age                           0
BMI                           0
Smoker?                       0
Living in?                    0
Any heriditary condition?     0
Follow Diet                   0
Physical activity             0
Regular sleeping hours        0
Alcohol consumption           0
Social interaction            0
Taking supplements            0
Mental health management      0
Illness count last year       0
dtype: int64


(25920, 17)

In [167]:
# For categorical columns (e.g., 'Food preference'), replace missing values with the most frequent value
features['Food preference'].fillna(features['Food preference'].mode().iloc[0], inplace=True)

# Verify that there are no more missing values
print(features.isnull().sum())

ID1                          0
Specific ailments            0
ID2                          0
Food preference              0
Age                          0
BMI                          0
Smoker?                      0
Living in?                   0
Any heriditary condition?    0
Follow Diet                  0
Physical activity            0
Regular sleeping hours       0
Alcohol consumption          0
Social interaction           0
Taking supplements           0
Mental health management     0
Illness count last year      0
dtype: int64


In [168]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [169]:
categorical_columns = ['Food preference', 'Smoker?', 'Living in?', 'Any heriditary condition?']

# One-hot encoding
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features = one_hot_encoder.fit_transform(features[categorical_columns])

# Create column names for the one-hot encoded features
encoded_column_names = []
for i, column in enumerate(categorical_columns):
    unique_categories = features[column].unique()
    encoded_column_names.extend([f"{column}_{category}" for category in unique_categories])

# Convert the encoded features to a DataFrame
encoded_features_df = pd.DataFrame(encoded_features, columns=encoded_column_names)

# Drop the original categorical columns from the features DataFrame
features = features.drop(categorical_columns, axis=1)

# Concatenate the encoded features with the remaining features
features_encoded = pd.concat([features, encoded_features_df], axis=1)

# Verify the encoded features
print(features_encoded.head())

     ID1  Specific ailments    ID2  Age        BMI  Follow Diet  \
0   2408                 44   2668   49  20.500470          1.0   
1  25063                 39  10363   20  26.076580          0.0   
2  26798                 29    132    1  21.420866          1.0   
3  31907                 27  10499   30  25.203247          1.0   
4  26412                  9   7963   40  19.355846          1.0   

   Physical activity  Regular sleeping hours  Alcohol consumption  \
0                0.0                     0.0                  0.0   
1                0.0                     0.0                  0.0   
2                0.0                     0.0                  0.0   
3                0.0                     0.0                  0.0   
4                0.0                     0.0                  0.0   

   Social interaction  ...  Food preference_DX1 DX2 DX4 DX5   \
0                 0.0  ...                               0.0   
1                 1.0  ...                            



In [170]:
features_encoded.shape
features_encoded.head()


Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX1 DX2 DX4 DX5,Food preference_DX1 DX2 DX3 DX5,Food preference_DX1 DX2 DX3 DX4 DX5,Food preference_DX1 DX2 DX3 DX4,Smoker?_NO,Smoker?_YES,Smoker?_Cannot say,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable
0,2408,44,2668,49,20.50047,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,25063,39,10363,20,26.07658,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
2,26798,29,132,1,21.420866,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
3,31907,27,10499,30,25.203247,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,26412,9,7963,40,19.355846,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0


In [171]:
# Scale the encoded features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_encoded)

# Verify the scaled features
scaled_features_df = pd.DataFrame(scaled_features, columns=features_encoded.columns)
print(scaled_features_df.head())

        ID1  Specific ailments       ID2       Age       BMI  Follow Diet  \
0 -1.480257           1.162127 -0.993617  0.807115 -0.781165     0.347759   
1  0.898873           0.830039  1.137280 -0.677645  0.689686    -2.987707   
2  1.081075           0.165864 -1.695885 -1.650418 -0.538385     0.347759   
3  1.617600           0.033029  1.174941 -0.165659  0.459321     0.347759   
4  1.040539          -1.162486  0.472673  0.346327 -1.083090     0.347759   

   Physical activity  Regular sleeping hours  Alcohol consumption  \
0          -0.407934                -0.48466            -0.309112   
1          -0.407934                -0.48466            -0.309112   
2          -0.407934                -0.48466            -0.309112   
3          -0.407934                -0.48466            -0.309112   
4          -0.407934                -0.48466            -0.309112   

   Social interaction  ...  Food preference_DX1 DX2 DX4 DX5   \
0           -0.826870  ...                         -0.2968

In [172]:
print(test_data.isnull().sum())

ID1                            0
Specific ailments              0
ID2                            0
Food preference                3
Age                            0
BMI                            0
Smoker?                        0
Living in?                     0
Any heriditary condition?      0
Follow Diet                  262
Physical activity            262
Regular sleeping hours       262
Alcohol consumption          262
Social interaction           262
Taking supplements           262
Mental health management     262
Illness count last year      262
dtype: int64


In [173]:
test_data.shape

(6480, 17)

In [174]:
from sklearn.impute import SimpleImputer
# Columns with missing values
columns_with_missing = ['Follow Diet', 'Physical activity', 'Regular sleeping hours',
                        'Alcohol consumption', 'Social interaction', 'Taking supplements',
                        'Mental health management', 'Illness count last year']

# Impute missing values
imputer = SimpleImputer(strategy='median')
test_data[columns_with_missing] = imputer.fit_transform(test_data[columns_with_missing])
test_data.shape
test_data.head()

Unnamed: 0,ID1,Specific ailments,ID2,Food preference,Age,BMI,Smoker?,Living in?,Any heriditary condition?,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,Taking supplements,Mental health management,Illness count last year
0,28534,2,3306,DX6,38,18.879331,YES,URBAN,Stable,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,7970,5,5573,DX1,46,21.231991,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,22039,37,9305,DX4,11,17.867876,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,12332,44,8274,DX5,10,26.886096,NO,RURAL,Stable,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
4,4116,37,4558,DX3 DX4,3,23.362746,NO,RURAL,Stable,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0


In [175]:
print(test_data.isnull().sum())

ID1                          0
Specific ailments            0
ID2                          0
Food preference              3
Age                          0
BMI                          0
Smoker?                      0
Living in?                   0
Any heriditary condition?    0
Follow Diet                  0
Physical activity            0
Regular sleeping hours       0
Alcohol consumption          0
Social interaction           0
Taking supplements           0
Mental health management     0
Illness count last year      0
dtype: int64


In [176]:
# For categorical columns (e.g., 'Food preference'), replace missing values with the most frequent value
test_data['Food preference'].fillna(test_data['Food preference'].mode().iloc[0], inplace=True)

# Verify that there are no more missing values
print(test_data.isnull().sum())

ID1                          0
Specific ailments            0
ID2                          0
Food preference              0
Age                          0
BMI                          0
Smoker?                      0
Living in?                   0
Any heriditary condition?    0
Follow Diet                  0
Physical activity            0
Regular sleeping hours       0
Alcohol consumption          0
Social interaction           0
Taking supplements           0
Mental health management     0
Illness count last year      0
dtype: int64


In [177]:

categorical_columns_test = ['Food preference', 'Smoker?', 'Living in?', 'Any heriditary condition?']

# One-hot encoding
one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_features_test = one_hot_encoder.fit_transform(test_data[categorical_columns_test])

# Create column names for the one-hot encoded features
encoded_column_names = []
for i, column in enumerate(categorical_columns_test):
    unique_categories = test_data[column].unique()
    encoded_column_names.extend([f"{column}_{category}" for category in unique_categories])

# Convert the encoded features to a DataFrame
encoded_features_df_test = pd.DataFrame(encoded_features_test, columns=encoded_column_names)

# Drop the original categorical columns from the features DataFrame
test_data = test_data.drop(categorical_columns_test, axis=1)

# Concatenate the encoded features with the remaining features
features_encoded_test = pd.concat([test_data, encoded_features_df_test], axis=1)

# Verify the encoded features
print(features_encoded_test.head())
features_encoded_test.shape

     ID1  Specific ailments   ID2  Age        BMI  Follow Diet  \
0  28534                  2  3306   38  18.879331          0.0   
1   7970                  5  5573   46  21.231991          1.0   
2  22039                 37  9305   11  17.867876          1.0   
3  12332                 44  8274   10  26.886096          1.0   
4   4116                 37  4558    3  23.362746          0.0   

   Physical activity  Regular sleeping hours  Alcohol consumption  \
0                0.0                     0.0                  0.0   
1                0.0                     0.0                  0.0   
2                0.0                     0.0                  0.0   
3                0.0                     0.0                  1.0   
4                1.0                     0.0                  1.0   

   Social interaction  ...  Food preference_DX1 DX3 DX4 DX5   \
0                 0.0  ...                               0.0   
1                 0.0  ...                               0.0



(6480, 51)

In [178]:
# Scale the encoded features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_encoded_test)

# Verify the scaled features
scaled_features_df_test = pd.DataFrame(scaled_features, columns=features_encoded_test.columns)
print(scaled_features_df_test.head())

        ID1  Specific ailments       ID2       Age       BMI  Follow Diet  \
0  1.257970          -1.626472 -0.814968  0.247857 -1.181364    -3.063432   
1 -0.890667          -1.426812 -0.183502  0.657777 -0.565163     0.326431   
2  0.579337           0.702899  0.856036 -1.135621 -1.446280     0.326431   
3 -0.434902           1.168774  0.568853 -1.186861  0.915740     0.326431   
4 -1.293354           0.702899 -0.466227 -1.545540 -0.007083    -3.063432   

   Physical activity  Regular sleeping hours  Alcohol consumption  \
0          -0.391446                 -0.4645            -0.301207   
1          -0.391446                 -0.4645            -0.301207   
2          -0.391446                 -0.4645            -0.301207   
3          -0.391446                 -0.4645             3.319979   
4           2.554630                 -0.4645             3.319979   

   Social interaction  ...  Food preference_DX1 DX3 DX4 DX5   \
0           -0.770013  ...                         -0.2822

In [179]:
from sklearn.model_selection import train_test_split

# Step 4: Split the training data
X_train, X_val, y_train, y_val = train_test_split(scaled_features_df, target, test_size=0.2, random_state=42)

In [180]:
print(X_train.shape,X_val.shape)

(20736, 51) (5184, 51)


In [181]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score, accuracy_score

# # Step 5: Train a machine learning model
# model = LogisticRegression()
# model.fit(X_train, y_train)

# # Step 6: Evaluate the model
# val_predictions = model.predict(X_val)

# # Calculate F1 score
# f1 = f1_score(y_val, val_predictions)

# # Calculate accuracy
# accuracy = accuracy_score(y_val, val_predictions)

# print("F1 score:", f1)
# print("Accuracy:", accuracy)


In [182]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import f1_score, accuracy_score

# # Step 5: Train a machine learning model (Random Forest)
# model = RandomForestClassifier()
# model.fit(X_train, y_train)

# # Step 6: Evaluate the model
# val_predictions = model.predict(X_val)

# # Calculate F1 score
# f1 = f1_score(y_val, val_predictions)

# # Calculate accuracy
# accuracy = accuracy_score(y_val, val_predictions)

# print("F1 score:", f1)
# print("Accuracy:", accuracy)


In [183]:
# from sklearn.neural_network import MLPClassifier
# from sklearn.metrics import f1_score, accuracy_score

# # Step 5: Train a machine learning model (Neural Network)
# model = MLPClassifier(hidden_layer_sizes=(100, 100), random_state=42)
# model.fit(X_train, y_train)

# # Step 6: Evaluate the model
# val_predictions = model.predict(X_val)

# # Calculate F1 score
# f1 = f1_score(y_val, val_predictions)

# # Calculate accuracy
# accuracy = accuracy_score(y_val, val_predictions)

# print("F1 score:", f1)
# print("Accuracy:", accuracy)


In [184]:
# from xgboost import XGBClassifier
# from sklearn.metrics import f1_score, accuracy_score

# # Step 5: Train a machine learning model (XGBoost)
# model = XGBClassifier()
# model.fit(X_train, y_train)

# # Step 6: Evaluate the model
# val_predictions = model.predict(X_val)

# # Calculate F1 score
# f1 = f1_score(y_val, val_predictions)

# # Calculate accuracy
# accuracy = accuracy_score(y_val, val_predictions)

# print("F1 score:", f1)
# print("Accuracy:", accuracy)


In [185]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score


# Step 5: Define the hyperparameter grid for grid search
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300]
}

# Step 6: Create an XGBoost classifier
model = XGBClassifier()

# Step 7: Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, scoring=make_scorer(f1_score), cv=3)
grid_search.fit(X_train, y_train)

# Step 8: Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Step 9: Evaluate the best model
val_predictions = best_model.predict(X_val)
f1 = f1_score(y_val, val_predictions)
accuracy = accuracy_score(y_val, val_predictions)

print("Best Hyperparameters:", best_params)
print("F1 score:", f1)
print("Accuracy:", accuracy)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
F1 score: 0.8765674573198369
Accuracy: 0.8423996913580247


In [186]:
scaled_features_df_test.head()

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX1 DX3 DX4 DX5,Food preference_DX2 DX3 DX4 DX5,Food preference_DX1 DX2 DX3 DX4 DX5,Food preference_DX1 DX2 DX3 DX5,Smoker?_YES,Smoker?_NO,Smoker?_Cannot say,Living in?_URBAN,Living in?_RURAL,Any heriditary condition?_Stable
0,1.25797,-1.626472,-0.814968,0.247857,-1.181364,-3.063432,-0.391446,-0.4645,-0.301207,-0.770013,...,-0.282206,-0.137943,-0.296612,1.290145,-0.021522,-1.132472,1.133538,-1.538825,1.538825,0.0
1,-0.890667,-1.426812,-0.183502,0.657777,-0.565163,0.326431,-0.391446,-0.4645,-0.301207,-0.770013,...,-0.282206,-0.137943,-0.296612,-0.775107,-0.021522,0.883024,-0.882194,-1.538825,1.538825,0.0
2,0.579337,0.702899,0.856036,-1.135621,-1.44628,0.326431,-0.391446,-0.4645,-0.301207,-0.770013,...,3.543513,-0.137943,-0.296612,-0.775107,-0.021522,0.883024,-0.882194,0.649846,-0.649846,0.0
3,-0.434902,1.168774,0.568853,-1.186861,0.91574,0.326431,-0.391446,-0.4645,3.319979,-0.770013,...,-0.282206,-0.137943,3.371411,-0.775107,-0.021522,0.883024,-0.882194,0.649846,-0.649846,0.0
4,-1.293354,0.702899,-0.466227,-1.54554,-0.007083,-3.063432,2.55463,-0.4645,3.319979,-0.770013,...,-0.282206,-0.137943,-0.296612,-0.775107,-0.021522,0.883024,-0.882194,0.649846,-0.649846,0.0


In [187]:
test_data_reordered = scaled_features_df_test[X_train.columns]


In [188]:
test_data_reordered.columns==X_train.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])

In [189]:
test_data_reordered.head()

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX1 DX2 DX4 DX5,Food preference_DX1 DX2 DX3 DX5,Food preference_DX1 DX2 DX3 DX4 DX5,Food preference_DX1 DX2 DX3 DX4,Smoker?_NO,Smoker?_YES,Smoker?_Cannot say,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable
0,1.25797,-1.626472,-0.814968,0.247857,-1.181364,-3.063432,-0.391446,-0.4645,-0.301207,-0.770013,...,-0.054228,1.290145,-0.296612,-0.140248,-1.132472,-0.021522,1.133538,1.538825,-1.538825,0.0
1,-0.890667,-1.426812,-0.183502,0.657777,-0.565163,0.326431,-0.391446,-0.4645,-0.301207,-0.770013,...,-0.054228,-0.775107,-0.296612,-0.140248,0.883024,-0.021522,-0.882194,1.538825,-1.538825,0.0
2,0.579337,0.702899,0.856036,-1.135621,-1.44628,0.326431,-0.391446,-0.4645,-0.301207,-0.770013,...,-0.054228,-0.775107,-0.296612,-0.140248,0.883024,-0.021522,-0.882194,-0.649846,0.649846,0.0
3,-0.434902,1.168774,0.568853,-1.186861,0.91574,0.326431,-0.391446,-0.4645,3.319979,-0.770013,...,-0.054228,-0.775107,3.371411,-0.140248,0.883024,-0.021522,-0.882194,-0.649846,0.649846,0.0
4,-1.293354,0.702899,-0.466227,-1.54554,-0.007083,-3.063432,2.55463,-0.4645,3.319979,-0.770013,...,-0.054228,-0.775107,-0.296612,-0.140248,0.883024,-0.021522,-0.882194,-0.649846,0.649846,0.0


In [192]:
test_predictions=best_model.predict(test_data_reordered)

In [193]:
# Create a submission DataFrame
submission_df = pd.DataFrame({'predictions': test_predictions})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_h2.csv', index=False)

In [194]:
submission_df.shape 

(6480, 1)