# Loading Required Library

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier



from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from pprint import pprint


import warnings
warnings.filterwarnings('ignore')


# Loading Training Set and Test Set

In [2]:
train_df =pd.read_csv("2021-01-21_zeta-disease_training-data_dsi-take-home-challenge.csv")
test_df =pd.read_csv("2021-01-21_zeta-disease_prediction-data_dsi-take-home-challenge.csv")

# Looking at Sample Data

In [3]:
train_df.head()

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
0,54,189,27.1,80,0,1.5038,0,10,0
1,23,150,38.5,68,71,0.3868,55,2,0
2,47,186,29.9,90,0,0.2728,0,7,0
3,18,150,30.8,70,1033,0.6598,56,0,0
4,24,160,32.4,74,125,0.7608,59,2,0


# Lets check how is data corelated

In [4]:
corr = train_df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
age,1.0,0.157705,0.0353475,0.193066,-0.0342942,0.0424164,-0.106039,0.372373,0.192925
weight,0.157705,1.0,0.214262,0.122349,0.304295,0.146779,0.0536287,0.100834,0.471155
bmi,0.0353475,0.214262,1.0,0.240513,0.217265,0.116649,0.264861,0.0286833,0.271856
blood_pressure,0.193066,0.122349,0.240513,1.0,0.0794248,0.0457639,0.193221,0.0956134,0.0457385
insulin_test,-0.0342942,0.304295,0.217265,0.0794248,1.0,0.17523,0.417894,-0.00629206,0.126504
liver_stress_test,0.0424164,0.146779,0.116649,0.0457639,0.17523,1.0,0.172048,0.0218174,0.184738
cardio_stress_test,-0.106039,0.0536287,0.264861,0.193221,0.417894,0.172048,1.0,-0.0803578,0.0360902
years_smoking,0.372373,0.100834,0.0286833,0.0956134,-0.00629206,0.0218174,-0.0803578,1.0,0.195261
zeta_disease,0.192925,0.471155,0.271856,0.0457385,0.126504,0.184738,0.0360902,0.195261,1.0


# Analysing Data 

In [5]:
train_df.describe()

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,30.985,172.4075,32.201625,69.565,85.8875,0.544496,43.12125,4.05125,0.34875
std,13.824025,31.942438,8.549155,19.874784,126.333656,0.348711,30.409949,4.176173,0.476873
min,18.0,94.0,0.0,0.0,0.0,0.1408,0.0,0.0,0.0
25%,21.0,149.0,27.3,62.0,0.0,0.3078,0.0,1.0,0.0
50%,26.0,167.0,32.05,72.0,45.0,0.4453,53.0,3.0,0.0
75%,38.0,192.0,36.525,80.0,130.0,0.6998,62.0,6.0,1.0
max,109.0,308.0,86.1,157.0,1077.0,3.4813,214.0,40.0,1.0


In [6]:
test_df.describe()

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,0.0
mean,34.75,178.8,34.48,78.5,145.05,1.5696,61.95,6.05,
std,11.511436,27.935264,6.628773,14.005638,75.964309,0.230356,9.703363,3.471311,
min,19.0,120.0,25.8,59.0,50.0,1.2498,43.0,2.0,
25%,26.25,153.25,30.25,69.75,76.25,1.4118,55.75,3.0,
50%,34.5,188.5,33.15,72.5,137.0,1.4833,60.0,5.5,
75%,44.25,197.75,37.6,89.25,167.75,1.7378,68.0,7.5,
max,60.0,216.0,50.7,108.0,362.0,2.0508,83.0,13.0,


In [7]:
train_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 9 columns):
age                   800 non-null int64
weight                800 non-null int64
bmi                   800 non-null float64
blood_pressure        800 non-null int64
insulin_test          800 non-null int64
liver_stress_test     800 non-null float64
cardio_stress_test    800 non-null int64
years_smoking         800 non-null int64
zeta_disease          800 non-null int64
dtypes: float64(2), int64(7)
memory usage: 56.4 KB


In [8]:
test_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
age                   20 non-null int64
weight                20 non-null int64
bmi                   20 non-null float64
blood_pressure        20 non-null int64
insulin_test          20 non-null int64
liver_stress_test     20 non-null float64
cardio_stress_test    20 non-null int64
years_smoking         20 non-null int64
zeta_disease          0 non-null float64
dtypes: float64(3), int64(6)
memory usage: 1.5 KB


# Verify If dataset is Balanced or Imbalanced

In [9]:
train_df['zeta_disease'].value_counts()

0    521
1    279
Name: zeta_disease, dtype: int64

## Here we can see that the count varies alot for different classes in training dataset. So dataset is imbalanced.This will affect accuracy.

# Merge both training set and test set for preprocessing

In [10]:
frames = [train_df, test_df]
train_test_df  = pd.concat(frames)

# Since zeta_disease is our dependent variable so keep it seprate as label and keep all indepent varialbe togther to make a dataset

In [11]:

train_zeta_disease = train_test_df['zeta_disease']


In [12]:
train_test_df=train_test_df.drop(['zeta_disease'],axis=1)


In [13]:
train_test_df

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking
0,54,189,27.1,80,0,1.5038,0,10
1,23,150,38.5,68,71,0.3868,55,2
2,47,186,29.9,90,0,0.2728,0,7
3,18,150,30.8,70,1033,0.6598,56,0
4,24,160,32.4,74,125,0.7608,59,2
...,...,...,...,...,...,...,...,...
15,46,213,26.5,70,133,1.4788,55,12
16,29,173,50.7,91,221,1.4878,83,3
17,36,202,42.8,72,273,1.8748,72,13
18,27,197,29.1,72,362,1.4298,69,4


# Since we have preprocessed our dataset lets seprate it as train and test datasets

In [14]:
train_x = train_test_df[0:800]
test_x = train_test_df[800:820]
train_y = train_zeta_disease[0:800]
test_y = train_zeta_disease[800:820]
X_train_full= np.array(train_x)
y_train_full=np.array(train_y)
X_test= np.array(test_x)
y_test=np.array(test_y)

# Split our training data in training set and validation set.80% data for training and 20% data for validation set

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.20, random_state=42)


# Since our dataset contains all numerical variables so we dont need to make dummy variables or one hot encoding. And we can try simplest Logistic regression Model

In [16]:
clf = LogisticRegression(random_state=0,max_iter=100).fit(X_train,y_train)
y_pred=clf.predict(X_valid)
print("Accuracy of Logistic Regression Model : " , accuracy_score(y_valid, y_pred))
confusion_matrix(y_valid, y_pred)

Accuracy of Logistic Regression Model :  0.73125


array([[91, 18],
       [25, 26]])

# Logistic Regression gave us 73.125% accuracy on validation set. It can be by chance as well so lets corss validate and find best parameters

In [17]:
logistic_Reg = LogisticRegression()
C = np.logspace(-5, 5, 50)
penalty = ['l1', 'l2']
max_iter =[25,50,75,100,125,150,175,200,225,250,275,300]
parameters={"C":C, "penalty":["l1","l2"],"max_iter":max_iter}# l1 lasso l2 ridge


clf_GS = GridSearchCV(logistic_Reg, parameters,cv=10)
clf_GS.fit(X_train,y_train)
print("tuned hpyerparameters :(best parameters) ",clf_GS.best_params_)
print("accuracy :",clf_GS.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 0.19306977288832497, 'max_iter': 100, 'penalty': 'l2'}
accuracy : 0.7765625


# We have got best parameters for Logistic Regression. Lets apply it and find out accuracy on validation set.

In [18]:
clf = LogisticRegression(C= 0.19306977288832497, max_iter= 100, penalty= 'l2').fit(X_train,y_train)
y_pred=clf.predict(X_valid)
print("Accuracy of Logistic Regression Model : " , accuracy_score(y_valid, y_pred))
confusion_matrix(y_valid, y_pred)

Accuracy of Logistic Regression Model :  0.7375


array([[91, 18],
       [24, 27]])

# Logistic Regression gave us 73.75% with cross validated hyperparameters. Lets try Random Foreset

In [19]:

clf = RandomForestClassifier(max_depth=20, max_features = 'auto' ,bootstrap =True ,random_state=0)
clf.fit(X_train, y_train)
y_pred= clf.predict(X_valid)
print("Accuracy of Logistic Regression Model : " , accuracy_score(y_valid, y_pred))
confusion_matrix(y_valid, y_pred)

Accuracy of Logistic Regression Model :  0.78125


array([[94, 15],
       [20, 31]])

# Random Forest also got 78.125% accuracy. Lets cross validate and find best parameters

In [20]:
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [1,10,20,30,40,50,60,70,80,90,100]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}



base_estimator = RandomForestClassifier(random_state=0)
sh = RandomizedSearchCV(base_estimator, random_grid, cv=10,verbose=2,
                        n_jobs = -1).fit(X_train, y_train)
print("tuned hpyerparameters :(best parameters) ",sh.best_params_)
print("accuracy :",sh.best_score_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.7s finished


tuned hpyerparameters :(best parameters)  {'n_estimators': 680, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}
accuracy : 0.7796875


# Lets fit Random forest with best hyperparameters

In [21]:
clf = RandomForestClassifier(n_estimators= 680,min_samples_split= 10,min_samples_leaf=2, max_depth=50, max_features = 'sqrt' ,bootstrap =True)
clf.fit(X_train, y_train)
y_pred= clf.predict(X_valid)
print("Accuracy of Random Forest Classifier Model : " , accuracy_score(y_valid, y_pred))
confusion_matrix(y_valid, y_pred)

Accuracy of Random Forest Classifier Model :  0.775


array([[95, 14],
       [22, 29]])

# Random forset gave 77.5% accuracy which is higher than Logistic Regression accuracy which was 73.75% .So i will use Random Forest

# Lets fit whole training data with Random Forest algorithm

In [23]:
clf = RandomForestClassifier(n_estimators= 680,min_samples_split= 10,min_samples_leaf=2, max_depth=50, max_features = 'sqrt' ,bootstrap =True)
clf.fit(X_train_full,y_train_full)
y_pred= clf.predict(X_test)


# Drop zeta_disease column from test set and add predi

In [26]:
test_df =test_df.drop(['zeta_disease'],axis=1)


In [27]:
test_df['zeta_disease']= y_pred.tolist()

In [28]:
test_df

Unnamed: 0,age,weight,bmi,blood_pressure,insulin_test,liver_stress_test,cardio_stress_test,years_smoking,zeta_disease
0,24,151,39.5,69,72,1.3968,56,4,0.0
1,27,179,35.5,89,156,1.6608,43,6,1.0
2,34,147,26.9,76,74,1.6958,53,2,0.0
3,35,206,32.4,73,127,1.4608,61,6,1.0
4,60,193,29.8,62,192,1.7798,65,9,1.0
5,45,120,36.5,108,50,1.2978,54,12,0.0
6,20,139,38.2,61,77,1.5818,68,3,0.0
7,23,137,31.2,70,73,1.4168,59,7,0.0
8,36,195,30.5,59,141,1.4498,59,6,1.0
9,19,193,25.8,84,66,1.7938,50,3,0.0


In [33]:
test_df.to_csv("output.csv",index = False)

Output file is output.csv

# Conclusion

I tried two methods: Logistic Regression and Random Forest. And found better accuracy on Random Forest. And other reasons are
1. Random Forest is usually robust to outliers and can handle them automatically.

2.  Random Forest is comparatively less impacted by noise.

3. Less prone to overfitting as it uses Ensemble Learning technique


And since data is imbalanced accuracy can be increased with more data. And results can be improved with more experiments. Since we have only 800 records for training data , Deep learning methods won't be useful here.
I strongly agree that accuracy can be increased further with more experiment.
