# BMI Prediction

In [None]:
#import library
import pickle
import pandas_profiling
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
import xgboost
import math
from sklearn import metrics

# Data Extraction

In [None]:
# Load the BMI dataset from the relative CSV path into the working DataFrame `df`.
df = pd.read_csv('../input/bmidataset/bmi.csv')

In [None]:
# Show the (rows, columns) dimensions of the loaded data.
df.shape

In [None]:
# Preview the first rows to check the column names and value formats.
df.head()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
#correlation of each variable
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() no longer
# silently drops non-numeric columns and raises on string columns such as 'Gender'.
df.select_dtypes(include = 'number').corr()

# Exploratory Data Analysis (EDA)

In [None]:
# Number of rows per 'Index' class (class balance of the target).
df['Index'].value_counts()

In [None]:
# Number of rows per 'Gender' value.
df['Gender'].value_counts()

In [None]:
#transform index column
# Map the numeric codes to readable names in a single column assignment.
# The previous chained form `df.Index[mask] = ...` triggers SettingWithCopyWarning,
# writes strings into an integer column one mask at a time, and does not modify
# `df` at all under pandas copy-on-write.
index_names = {
    0: 'Extremely Weak',
    1: 'Weak',
    2: 'Normal',
    3: 'Overweight',
    4: 'Obesity',
    5: 'Extremely Obesity',
}
# `replace` leaves any value outside 0-5 untouched, as the masked assignments did.
df['Index'] = df['Index'].replace(index_names)
df.head()

Variable of index :
- 0 -> Extremely Weak
- 1 -> Weak
- 2 -> Normal
- 3 -> Overweight
- 4 -> Obesity
- 5 -> Extremely Obesity

In [None]:
#analyze of index based on gender, height and weight
# Mean Height and Weight per (Index, Gender) group, largest means first.
# The string 'mean' replaces the np.mean callable, which recent pandas versions
# flag as deprecated when passed as an aggregator; the result is the same.
pd.pivot_table(df, index = ['Index', 'Gender'], values = ['Height', 'Weight'],
               aggfunc = 'mean').sort_values(['Height', 'Weight'], ascending = False)

In [None]:
#analyze the factor of BMI
# Box plots of the two numeric inputs side by side.
chart = df.boxplot(column = ['Height', 'Weight'])
chart.set_xticklabels(chart.get_xticklabels(), rotation = 0)
plt.title('The Factor of Body Mass Index', fontsize = 15, color = 'maroon', pad = 12)
plt.xlabel('Factors')
# A box plot's y-axis shows the measured values, not frequencies, so it is
# labelled 'Value' rather than 'Count'.
plt.ylabel('Value')
plt.show()

In [None]:
#visualize average of height based on index and gender
# One bar group per Index category, with one bar per Gender inside each group.
mean_height = df.groupby(['Index', 'Gender'])['Height'].mean().unstack()
chart = mean_height.plot(kind = 'bar')
chart.set_xticklabels(chart.get_xticklabels(), rotation = 45)
chart.set_title('Average of Height based on Index and Gender', fontsize = 15, pad = 12, color = 'maroon')
chart.set_xlabel('Index')
chart.set_ylabel('Average of Height')
# Anchor the legend just outside the right edge so it does not cover the bars.
chart.legend(bbox_to_anchor = (1.02, 1), loc = 2, borderaxespad = 0.)
plt.show()

In [None]:
#visualize average of weight based on index and gender
# One bar group per Index category, with one bar per Gender inside each group.
mean_weight = df.groupby(['Index', 'Gender'])['Weight'].mean().unstack()
chart = mean_weight.plot(kind = 'bar')
chart.set_xticklabels(chart.get_xticklabels(), rotation = 45)
chart.set_title('Average of Weight based on Index and Gender', fontsize = 15, pad = 12, color = 'maroon')
chart.set_xlabel('Index')
chart.set_ylabel('Average of Weight')
# Anchor the legend just outside the right edge so it does not cover the bars.
chart.legend(bbox_to_anchor = (1.02, 1), loc = 2, borderaxespad = 0.)
plt.show()

In [None]:
#distribution of each variable
# DataFrame.hist draws one histogram per numeric column; 'Index' holds strings
# at this point if the relabelling cell above has run, so it would be skipped.
df.hist()
plt.show()

In [None]:
#profile report of each variable
# As the last expression of the cell, the report object is rendered inline by the notebook.
# NOTE(review): the pandas_profiling package has been renamed upstream - confirm
# which package name the target environment provides.
pandas_profiling.ProfileReport(df)

# Classification Model

In [None]:
#handling categorical data
# Use one encoder per column. Refitting a single LabelEncoder on 'Index' would
# discard the Gender mapping, making it impossible to encode or decode Gender
# consistently later (e.g. at prediction time).
gender_le = LabelEncoder()
df['Gender'] = gender_le.fit_transform(df['Gender'])
# `le` remains the encoder fitted on 'Index'. LabelEncoder assigns codes in
# sorted (alphabetical) order of the class names, available as `le.classes_`.
le = LabelEncoder()
df['Index'] = le.fit_transform(df['Index'])

In [None]:
#split data
# Target is the encoded 'Index' column; every other column is a feature.
y = df['Index']
X = df.drop(columns = 'Index')
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [None]:
#prepare models
# Candidate estimators, compared on held-out accuracy in the next cell.
# Unless shown otherwise, each one uses its library default hyper-parameters.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    LinearSVC(),
    DecisionTreeClassifier(criterion='gini', max_depth=3),
    RandomForestClassifier(random_state=1, max_features='sqrt', n_jobs=1, verbose=1),
    XGBClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
#report models
# Fit every candidate on the training split, print its accuracy on the test
# split, and collect the scores in `log` (columns: Classifier, Accuracy in %).
log_cols = ["Classifier", "Accuracy"]
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    print("=" * 30)
    print(name)
    print('****Results****')

    # Predictions are made on the held-out test split.
    test_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))
    print("\n")

    log_rows.append([name, acc * 100])

print("=" * 30)

# Build the frame once from the collected rows: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic anyway.
log = pd.DataFrame(log_rows, columns = log_cols)

In [None]:
#visualize accuracy models
# Horizontal bars: one per classifier, with length equal to its accuracy in percent.
plt.figure(figsize = (10,6))
sns.set_color_codes("muted")
accuracy_ax = sns.barplot(x = 'Accuracy', y = 'Classifier', data = log, color = "lime")
accuracy_ax.set_xlabel('Accuracy %')
accuracy_ax.set_title('Accuracy Score of Classification Model')
plt.show()

Quadratic Discriminant Analysis is a very suitable model for predicting the index from the BMI (Body Mass Index) data because it achieves a good accuracy score of about 89%.

In [None]:
#fitting model
# Train a fresh QDA model on the training split only.
qda = QuadraticDiscriminantAnalysis()
# `%time` is an IPython magic (notebook-only syntax) that reports how long the statement takes.
%time qda.fit(X_train, y_train)

In [None]:
#prediction
# Predicted encoded 'Index' labels for the held-out test rows.
y_pred = qda.predict(X_test)
print(y_pred)

In [None]:
#check of MSE & RMSE
# The encoded labels follow the alphabetical order of the class names, so their
# numeric distance says nothing about how far apart two BMI categories are.
# Translate both label vectors to severity ranks (0 = lightest ... 5 = heaviest)
# first, so the squared error measures "how many categories off" a prediction is.
severity_order = ['Extremely Weak', 'Weak', 'Normal', 'Overweight', 'Obesity', 'Extremely Obesity']
severity_rank = {label: rank for rank, label in enumerate(severity_order)}
y_test_rank = [severity_rank[label] for label in le.inverse_transform(y_test)]
y_pred_rank = [severity_rank[label] for label in le.inverse_transform(y_pred)]

mse = metrics.mean_squared_error(y_test_rank, y_pred_rank)
print('Mean Squared Error : '+ str(mse))
# Reuse the value computed above instead of recomputing the metric.
rmse = math.sqrt(mse)
print('Root Mean Squared Error : '+ str(rmse))

In [None]:
#confusion matrix
# confusion_matrix orders its rows/columns by encoded label, and LabelEncoder
# encodes class names alphabetically. Naming the axes in severity order without
# reordering the matrix would attach the wrong class name to every row/column,
# so the severity order is translated to encoded labels and passed explicitly.
severity_order = ['Extremely Weak', 'Weak', 'Normal', 'Overweight', 'Obesity', 'Extremely Obesity']
class_names = [label for label in severity_order if label in le.classes_]
class_codes = le.transform(class_names)
matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred, labels = class_codes),
                      index = class_names,
                      columns = class_names)
print(matrix)

#visualize confusion matrix
plt.figure(figsize = (8,6))
heatmap = sns.heatmap(matrix, annot = True, annot_kws = {'size': 14}, fmt = 'd', cmap = 'YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation = 0, ha = 'right', fontsize = 14)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation = 45, ha = 'right', fontsize = 14)

plt.title('Confusion Matrix\n(Quadratic Discriminant Analysis)', fontsize = 18, color = 'darkblue')
plt.ylabel('True label', fontsize = 14)
plt.xlabel('Predicted label', fontsize = 14)
plt.show()

In [None]:
#classification report
# Per-class precision / recall / F1 for the QDA predictions on the test split.
# Rows are keyed by the encoded label values.
print('Classification Report of Quadratic Discriminant Analysis : \n')
report = metrics.classification_report(y_test, y_pred)
print(report)

# Deployment

In [None]:
#deploy the model
# Refit on the full dataset (train + test) before exporting the final model.
qda.fit(X, y)
# Use a context manager so the file is flushed and closed even if pickling fails;
# the previous bare open() left the handle to the garbage collector.
# NOTE: only load this pickle from a trusted location - unpickling runs arbitrary code.
with open('quadratic_discriminant_analysis.pkl', 'wb') as model_file:
    pickle.dump(qda, model_file)