In [100]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Dataset preprocessing

## Read dataset and first analysis

In [101]:
pd.set_option('display.max_columns', None)

df = pd.read_csv('../resources/dataset/smoking_train_dataset.csv')
df

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,97,239,153,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,88,211,128,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,80,193,120,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,249,210,366,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,100,179,200,47,92,14.9,1,1.2,26,28,15,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38979,40,165,60,80.0,0.4,0.6,1,1,107,60,93,144,53,61,72,12.3,1,0.5,18,18,21,1,0
38980,45,155,55,75.0,1.5,1.2,1,1,126,72,91,227,100,76,131,12.5,2,0.6,23,11,12,0,0
38981,40,170,105,124.0,0.6,0.5,1,1,141,85,115,225,196,48,138,17.1,1,0.8,24,23,35,1,1
38982,40,160,55,75.0,1.5,1.5,1,1,95,69,102,206,48,79,116,12.0,1,0.6,24,20,17,0,1


In [102]:
df.describe()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
count,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0,38984.0
mean,44.127591,164.689488,65.938718,82.062115,1.014955,1.008768,1.025369,1.02619,121.475631,75.994408,99.342269,196.883491,126.749461,57.293146,115.081495,14.624264,1.086523,0.88603,26.198235,27.145188,39.905038,0.214421,0.367279
std,12.063564,9.187507,12.896581,9.326798,0.498527,0.493813,0.157246,0.159703,13.643521,9.658734,20.642741,36.353945,71.803143,14.617822,42.883163,1.566528,0.402107,0.220621,19.175595,31.309945,49.693843,0.410426,0.48207
min,20.0,130.0,30.0,51.0,0.1,0.1,1.0,1.0,71.0,40.0,46.0,55.0,8.0,4.0,1.0,4.9,1.0,0.1,6.0,1.0,2.0,0.0,0.0
25%,40.0,160.0,55.0,76.0,0.8,0.8,1.0,1.0,112.0,70.0,89.0,172.0,74.0,47.0,91.0,13.6,1.0,0.8,19.0,15.0,17.0,0.0,0.0
50%,40.0,165.0,65.0,82.0,1.0,1.0,1.0,1.0,120.0,76.0,96.0,195.0,108.0,55.0,113.0,14.8,1.0,0.9,23.0,21.0,26.0,0.0,0.0
75%,55.0,170.0,75.0,88.0,1.2,1.2,1.0,1.0,130.0,82.0,104.0,219.0,160.0,66.0,136.0,15.8,1.0,1.0,29.0,31.0,44.0,0.0,1.0
max,85.0,190.0,135.0,129.0,9.9,9.9,2.0,2.0,233.0,146.0,423.0,445.0,999.0,359.0,1860.0,21.1,6.0,11.6,1090.0,2914.0,999.0,1.0,1.0


In [103]:
df.isna().sum()

age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
dtype: int64

## Data preprocessing

### One-Hot encoding

In [104]:
for col in df.columns:
    print(col, '\t',df[col].dtype)

age 	 int64
height(cm) 	 int64
weight(kg) 	 int64
waist(cm) 	 float64
eyesight(left) 	 float64
eyesight(right) 	 float64
hearing(left) 	 int64
hearing(right) 	 int64
systolic 	 int64
relaxation 	 int64
fasting blood sugar 	 int64
Cholesterol 	 int64
triglyceride 	 int64
HDL 	 int64
LDL 	 int64
hemoglobin 	 float64
Urine protein 	 int64
serum creatinine 	 float64
AST 	 int64
ALT 	 int64
Gtp 	 int64
dental caries 	 int64
smoking 	 int64


In [105]:
columns_to_onehot = []
for col in df.columns:
    if df[col].dtype == "int64":
        if df[col].unique().__len__() < 7:
            columns_to_onehot.append(col)
columns_to_onehot

['hearing(left)',
 'hearing(right)',
 'Urine protein',
 'dental caries',
 'smoking']

In [106]:
df_one_hot = pd.get_dummies(df[columns_to_onehot])
df = df.drop(columns_to_onehot, axis=1)
df = df.join(df_one_hot)

### Normalize data

In [107]:
columns_to_normalize = []
for col in df.columns:
    if df[col].dtype == "float64":
        columns_to_normalize.append(col)

In [108]:
scaler = StandardScaler()
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

## Features importance

In [109]:
X = df.drop(columns=["smoking"]).values
y = df["smoking"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42)

In [110]:
pred = RandomForestClassifier(100, random_state=42)
pred.fit(X_train, y_train)

y_pred_test = pred.predict(X_test)
accuracy_score(y_test, y_pred_test)

0.8008413708187975

In [111]:
confusion_matrix(y_test, y_pred_test)

array([[5248,  934],
       [1007, 2557]])

In [112]:
feature_names = df.drop(columns=["smoking"]).columns
sorted(zip(feature_names, pred.feature_importances_), key=lambda x: x[1], reverse=True)

[('hemoglobin', 0.10956568649287891),
 ('height(cm)', 0.10035774030905763),
 ('Gtp', 0.09601156404974914),
 ('triglyceride', 0.06684433637483003),
 ('HDL', 0.05179890972575346),
 ('LDL', 0.05032959146010147),
 ('Cholesterol', 0.048772835306881514),
 ('waist(cm)', 0.048692847440483876),
 ('fasting blood sugar', 0.04832203196461611),
 ('ALT', 0.0478490545743216),
 ('weight(kg)', 0.047251216422707715),
 ('systolic', 0.04455082725583897),
 ('serum creatinine', 0.04361683463251126),
 ('AST', 0.043072922086198624),
 ('relaxation', 0.04202544158060978),
 ('age', 0.03945631417321746),
 ('eyesight(right)', 0.027770074652094256),
 ('eyesight(left)', 0.027407207301650936),
 ('dental caries', 0.008273638532754398),
 ('Urine protein', 0.004549712864585533),
 ('hearing(right)', 0.0018083387209030365),
 ('hearing(left)', 0.00167287407825426)]