# Predicting smoker status using Machine Learning

* We're going to use the dataset from kaggle competition.
* First, we will clean the data that is we will fill the null values.
* Then, we will convert the non integer values from object or String to integers.
* We will call the different model such `RandomForest` and `LogisticRegression`.
* We will split the data into training and validation set.
* Fit the model and predict.
* Plot the ROC or AUC curve and check it.
* Hypertune the model with `GridSearch`.
* Submit the model to kaggle website.

In [1]:
# Importing the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model  import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
np.random.seed(42)

In [2]:
df = pd.read_csv("train/train.csv")

In [3]:
df.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [4]:
len(df)

159256

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  int64  
 1   age                  159256 non-null  int64  
 2   height(cm)           159256 non-null  int64  
 3   weight(kg)           159256 non-null  int64  
 4   waist(cm)            159256 non-null  float64
 5   eyesight(left)       159256 non-null  float64
 6   eyesight(right)      159256 non-null  float64
 7   hearing(left)        159256 non-null  int64  
 8   hearing(right)       159256 non-null  int64  
 9   systolic             159256 non-null  int64  
 10  relaxation           159256 non-null  int64  
 11  fasting blood sugar  159256 non-null  int64  
 12  Cholesterol          159256 non-null  int64  
 13  triglyceride         159256 non-null  int64  
 14  HDL                  159256 non-null  int64  
 15  LDL              

In [6]:
df_tmp = df.copy()

In [7]:
df_tmp.isna().sum()

id                     0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
dtype: int64

In [8]:
df_tmp["bmi"] = df_tmp["weight(kg)"]/((df_tmp["height(cm)"]/100)**2)

In [9]:
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  int64  
 1   age                  159256 non-null  int64  
 2   height(cm)           159256 non-null  int64  
 3   weight(kg)           159256 non-null  int64  
 4   waist(cm)            159256 non-null  float64
 5   eyesight(left)       159256 non-null  float64
 6   eyesight(right)      159256 non-null  float64
 7   hearing(left)        159256 non-null  int64  
 8   hearing(right)       159256 non-null  int64  
 9   systolic             159256 non-null  int64  
 10  relaxation           159256 non-null  int64  
 11  fasting blood sugar  159256 non-null  int64  
 12  Cholesterol          159256 non-null  int64  
 13  triglyceride         159256 non-null  int64  
 14  HDL                  159256 non-null  int64  
 15  LDL              

In [10]:
df_tmp.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking,bmi
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,75,16.5,1,1.0,22,25,27,0,1,22.038567
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,126,16.2,1,1.1,27,23,37,1,0,23.875115
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,93,17.4,1,0.8,27,31,53,0,1,25.951557
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,102,15.9,1,1.0,20,27,30,1,0,29.320988
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,93,15.4,1,0.8,19,13,17,0,1,22.038567


In [11]:
# Creating and X and y
X = df_tmp.drop("smoking", axis=1)
y = df_tmp["smoking"]

In [12]:
X, y

(            id  age  height(cm)  weight(kg)  waist(cm)  eyesight(left)  \
 0            0   55         165          60       81.0             0.5   
 1            1   70         165          65       89.0             0.6   
 2            2   20         170          75       81.0             0.4   
 3            3   35         180          95      105.0             1.5   
 4            4   30         165          60       80.5             1.5   
 ...        ...  ...         ...         ...        ...             ...   
 159251  159251   40         155          45       69.0             1.5   
 159252  159252   50         155          75       82.0             1.0   
 159253  159253   40         160          50       66.0             1.5   
 159254  159254   50         165          75       92.0             1.2   
 159255  159255   40         145          45       76.4             1.0   
 
         eyesight(right)  hearing(left)  hearing(right)  systolic  ...  HDL  \
 0                 

In [13]:
# Creating train and validation data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [14]:
len(X_train), len(X_val), len(y_train), len(y_val)

(127404, 31852, 127404, 31852)

In [15]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_train, y_train)

0.9999921509528743

In [16]:
model.score(X_val, y_val)

0.7705324626397086

In [17]:
train_preds = model.predict(X_train)

In [18]:
val_preds = model.predict(X_val)

In [19]:
from sklearn.metrics import recall_score, accuracy_score, precision_score
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    scores = {"Train Accuracy": accuracy_score(y_train, train_preds),
              "Valid Accuracy": accuracy_score(y_val, val_preds),
              "Training Recall": recall_score(y_train, train_preds),
              "Valid Recall": recall_score(y_val, val_preds),
              "Training Precision": precision_score(y_train, train_preds),
              "Valid Precision": precision_score(y_val, val_preds)}
    return scores

In [20]:
# Checking the scores
show_scores(model)

{'Train Accuracy': 0.9999921509528743,
 'Valid Accuracy': 0.7705324626397086,
 'Training Recall': 0.9999820092112838,
 'Valid Recall': 0.7972137323192835,
 'Training Precision': 1.0,
 'Valid Precision': 0.7156712608473711}

In [21]:
# HyperTuning
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "max_features": [0.5, 1, "sqrt", "log2"],
           "min_samples_leaf": np.arange(1, 20 ,2)}

rs_model = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1,
                                                     random_state=42),
                              param_distributions=rf_grid,
                              n_iter=10,
                              cv=5,
                              verbose=True)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [23]:
show_scores(rs_model)

{'Train Accuracy': 0.835907820790556,
 'Valid Accuracy': 0.7721336179831721,
 'Training Recall': 0.8774647380541163,
 'Valid Recall': 0.8122112445802829,
 'Training Precision': 0.7758000890755233,
 'Valid Precision': 0.7122732655987035}

In [24]:
rs_model.best_params_

{'n_estimators': 60,
 'min_samples_split': 14,
 'min_samples_leaf': 15,
 'max_features': 'log2',
 'max_depth': None}

In [25]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_model.score(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7236272016577188

In [26]:
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

In [27]:
np.random.seed(42)

# Setup random hyperparameter search for LogisticRegression
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparameter search model
rs_log_reg.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
model_score(rs_log_reg)