### Obesity prediction using  machine learning models

In [68]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier,plot_importance
from sklearn.ensemble import (RandomForestClassifier,
                             AdaBoostClassifier,
                             GradientBoostingClassifier)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import sklearn.preprocessing as preprocessing

In [69]:
df= pd.read_csv("obesitydataset.csv")


In [70]:

le=preprocessing.LabelEncoder()

In [71]:
df['gender']=le.fit_transform(df['Gender'].astype(str))

In [72]:
df['famhistory']=le.fit_transform(df['family_history_with_overweight'].astype(str))
df['favc']=le.fit_transform(df['FAVC'].astype(str))
df['caec']=le.fit_transform(df['CAEC'].astype(str))
df['smoke']=le.fit_transform(df['SMOKE'].astype(str))

In [73]:
df['scc']=le.fit_transform(df['SCC'].astype(str))
df['calc']=le.fit_transform(df['CALC'].astype(str))
df['mtrans']=le.fit_transform(df['MTRANS'].astype(str))
df['nobeyes']=le.fit_transform(df['NObeyesdad'].astype(str))

In [74]:
df=df.drop(['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS','NObeyesdad'],axis=1)

In [75]:
df.to_csv('obesitydatasetpreprocessed.csv')

In [76]:
y=df.nobeyes
X=df.drop(['nobeyes'],axis='columns')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape, X_test.shape

((1688, 16), (423, 16))

### Category Boosting (CatBoost).

In [77]:
clf = CatBoostClassifier(
    iterations=1000,
  learning_rate=0.4,
  depth=5,
  colsample_bylevel=0.8,
  random_seed = 2020,
  bagging_temperature = 0.2,
  metric_period = None,
  custom_loss=['AUC', 'Accuracy']
)

In [78]:
cat_features = list(range(0, X.shape[1]))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [79]:
clf.fit(X_train, y_train)

0:	learn: 1.3808692	total: 39.4ms	remaining: 39.4s
1:	learn: 1.0797037	total: 48.3ms	remaining: 24.1s
2:	learn: 0.8543911	total: 64.1ms	remaining: 21.3s
3:	learn: 0.7422935	total: 106ms	remaining: 26.3s
4:	learn: 0.6586774	total: 145ms	remaining: 28.8s
5:	learn: 0.5961040	total: 154ms	remaining: 25.4s
6:	learn: 0.5316042	total: 162ms	remaining: 23s
7:	learn: 0.4894393	total: 173ms	remaining: 21.4s
8:	learn: 0.4495417	total: 196ms	remaining: 21.6s
9:	learn: 0.4359268	total: 221ms	remaining: 21.8s
10:	learn: 0.3988574	total: 231ms	remaining: 20.8s
11:	learn: 0.3700589	total: 241ms	remaining: 19.8s
12:	learn: 0.3486471	total: 264ms	remaining: 20s
13:	learn: 0.3258914	total: 296ms	remaining: 20.9s
14:	learn: 0.3078422	total: 338ms	remaining: 22.2s
15:	learn: 0.2984022	total: 365ms	remaining: 22.4s
16:	learn: 0.2853406	total: 394ms	remaining: 22.8s
17:	learn: 0.2680428	total: 422ms	remaining: 23s
18:	learn: 0.2582855	total: 429ms	remaining: 22.2s
19:	learn: 0.2401771	total: 437ms	remaining:

<catboost.core.CatBoostClassifier at 0x7fda5a69bdd0>

In [80]:
clf.is_fitted()

True

In [81]:
clf.get_params()

{'iterations': 1000,
 'learning_rate': 0.4,
 'depth': 5,
 'random_seed': 2020,
 'custom_loss': ['AUC', 'Accuracy'],
 'bagging_temperature': 0.2,
 'colsample_bylevel': 0.8}

In [82]:
p=clf.predict(X_test)

In [83]:
accuracy_score(y_test, p)

0.9787234042553191

In [84]:
precision_recall_fscore_support(y_test, p, average='weighted')

(0.9792741367626641, 0.9787234042553191, 0.9788266637705172, None)