### Logistic regression

#### Importing all the library

In [1]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [2]:
from imblearn.datasets import fetch_datasets

In [3]:
thyroid_sick_df = fetch_datasets()['thyroid_sick']

In [4]:
thyroid_sick_df

{'data': array([[41.,  1.,  0., ...,  1.,  0.,  0.],
        [23.,  1.,  0., ...,  0.,  0.,  0.],
        [46.,  0.,  1., ...,  0.,  0.,  0.],
        ...,
        [74.,  1.,  0., ...,  0.,  0.,  0.],
        [72.,  0.,  1., ...,  0.,  0.,  1.],
        [64.,  1.,  0., ...,  0.,  0.,  0.]]),
 'target': array([-1, -1, -1, ..., -1, -1, -1], dtype=int64),
 'DESCR': 'thyroid_sick'}

#### Split the data into x and y(target feature)

In [5]:
import pandas as pd

In [6]:
X = pd.DataFrame(thyroid_sick_df['data'])

In [7]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,50,51
0,41.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.14,0.0,1.0,109.0,1.0,0.0,0.0,1.0,0.0,0.0
1,23.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.31,1.0,0.0,88.0,1.0,1.0,0.0,0.0,0.0,0.0
2,46.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.91,0.0,1.0,120.0,1.0,1.0,0.0,0.0,0.0,0.0
3,70.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,1.20,1.0,0.0,50.0,1.0,1.0,0.0,0.0,0.0,0.0
4,70.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.87,0.0,1.0,70.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3767,30.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.46,1.0,0.0,245.0,1.0,1.0,0.0,0.0,0.0,0.0
3768,68.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.08,0.0,1.0,114.0,1.0,0.0,0.0,0.0,0.0,1.0
3769,74.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.07,0.0,1.0,105.0,1.0,1.0,0.0,0.0,0.0,0.0
3770,72.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.94,0.0,1.0,87.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
Y = pd.DataFrame(thyroid_sick_df['target'])

In [9]:
Y

Unnamed: 0,0
0,-1
1,-1
2,-1
3,-1
4,-1
...,...
3767,-1
3768,-1
3769,-1
3770,-1


In [10]:
Y.columns

RangeIndex(start=0, stop=1, step=1)

In [11]:
x = thyroid_sick_df['data']

In [12]:
x

array([[41.,  1.,  0., ...,  1.,  0.,  0.],
       [23.,  1.,  0., ...,  0.,  0.,  0.],
       [46.,  0.,  1., ...,  0.,  0.,  0.],
       ...,
       [74.,  1.,  0., ...,  0.,  0.,  0.],
       [72.,  0.,  1., ...,  0.,  0.,  1.],
       [64.,  1.,  0., ...,  0.,  0.,  0.]])

In [13]:
y = thyroid_sick_df['target']

In [14]:
y

array([-1, -1, -1, ..., -1, -1, -1], dtype=int64)

#### Split the data into X_train, X_test, y_train, y_test

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=4)

In [17]:
X_train.shape

(2640, 52)

In [18]:
y_train.shape

(2640,)

#### Check that the data is balanced or not

In [19]:
y_train

array([-1, -1,  1, ..., -1, -1, -1], dtype=int64)

In [20]:
#y_train.columns=['Class']

In [21]:
y_train

array([-1, -1,  1, ..., -1, -1, -1], dtype=int64)

In [23]:
len([c for c in y_train if c==-1])

2487

In [24]:
len([c for c in y_train if c==1])

153

In [None]:
# Data is imbalance

# Model with imbalance data point

# SMOTE (Synthetic Minority Oversampling Technique)

#### Balanace the data using SMOTE(Synthetic Minority Oversampling Technique)

In [25]:
from imblearn.over_sampling import SMOTE

In [26]:
sm = SMOTE()

In [27]:
X_train_smote, y_train_smote = sm.fit_resample(X_train.astype('float'),y_train)

#### Count the row in each class of target feature using class Counter

In [28]:
from collections import Counter

In [29]:
print('Before SMOTE: ', Counter(y_train))
print('After SMOTE: ', Counter(y_train_smote))

Before SMOTE:  Counter({-1: 2487, 1: 153})
After SMOTE:  Counter({-1: 2487, 1: 2487})


# Logistic Regresion with SMOTE

#### import logistic Regresion class from sklearn

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
LR_obj = LogisticRegression()

In [32]:
LR_obj

LogisticRegression()

#### Fit the training data into logistic regression model

In [33]:
model =LR_obj.fit(X_train_smote,y_train_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Now, we have our fit model lets see how well we can predict the test values

In [34]:
y_predict_smote = model.predict(X_test)

#### Now lets evaluate the model---- Check accuracy

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve, f1_score

In [64]:
accuracy_lr = accuracy_score(y_test,y_predict_smote)

In [65]:
accuracy_lr

0.8939929328621908

In [39]:
confusion_matrix(y_test,y_predict_smote)

array([[945, 109],
       [ 11,  67]], dtype=int64)

In [66]:
precision_lr = precision_score(y_test, y_predict_smote)

In [67]:
precision_lr

0.3806818181818182

In [68]:
recall_lr = recall_score(y_test, y_predict_smote)

In [69]:
recall_lr

0.8589743589743589

In [71]:
f1_lr = f1_score(y_test, y_predict_smote)

In [72]:
f1_lr

0.5275590551181102

#  Decision Tree Classification with SMOTE

#### import Decision Tree Classification class from sklearn

In [43]:
from sklearn.tree import DecisionTreeClassifier

In [44]:
model1 = DecisionTreeClassifier().fit(X_train_smote, y_train_smote)

In [45]:
y_predict_smote1 = model1.predict(X_test)

In [75]:
accuracy_dtc = accuracy_score(y_test,y_predict_smote1)

In [76]:
accuracy_dtc

0.9840989399293286

In [47]:
confusion_matrix(y_test,y_predict_smote1)

array([[1048,    6],
       [  12,   66]], dtype=int64)

In [77]:
precision_dtc = precision_score(y_test, y_predict_smote1)

In [78]:
precision_dtc

0.9166666666666666

In [79]:
recall_dtc = recall_score(y_test, y_predict_smote1)

In [80]:
recall_dtc

0.8461538461538461

In [81]:
f1_dtc = f1_score(y_test, y_predict_smote1)

In [82]:
f1_dtc

0.8799999999999999

# Random Over Sampling

#### Import RandomOverSampler class for balancing data points

In [51]:
from imblearn.over_sampling import RandomOverSampler

In [52]:
ros = RandomOverSampler(0.75)



In [53]:
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [54]:
X_train_ros

array([[51.,  1.,  0., ...,  0.,  0.,  0.],
       [50.,  1.,  0., ...,  0.,  0.,  0.],
       [83.,  0.,  1., ...,  0.,  0.,  1.],
       ...,
       [80.,  1.,  0., ...,  0.,  0.,  1.],
       [62.,  0.,  1., ...,  0.,  0.,  1.],
       [70.,  1.,  0., ...,  0.,  0.,  0.]])

#### Count the data points in each class of target feature using class Counter

In [55]:
print('Before Random Over Sampling: ', Counter(y_train))
print('After Random Over Sampling: ', Counter(y_train_ros))

Before Random Over Sampling:  Counter({-1: 2487, 1: 153})
After Random Over Sampling:  Counter({-1: 2487, 1: 1865})


# Logistic Regression with Random Sampling

In [56]:
model_ros = LogisticRegression().fit(X_train_ros,y_train_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
model_ros

LogisticRegression()

In [58]:
y_predict_ros = model_ros.predict(X_test)

#### Now lets evaluate the model---- Check accuracy

In [85]:
accuracy_lr_ros = accuracy_score(y_test,y_predict_ros)

In [86]:
accuracy_lr_ros

0.9081272084805654

In [87]:
precision_lr_ros = precision_score(y_test,y_predict_ros)

In [88]:
precision_lr_ros

0.41975308641975306

In [89]:
recall_lr_ros = recall_score(y_test,y_predict_ros)

In [90]:
recall_lr_ros

0.8717948717948718

In [91]:
f1_lr_ros = f1_score(y_test,y_predict_ros)

In [92]:
f1_lr_ros

0.5666666666666667

In [63]:
confusion_matrix(y_test,y_predict_ros)

array([[960,  94],
       [ 10,  68]], dtype=int64)

# Decision Tree Classification with Random Sampling

In [98]:
model_dtc_ros = DecisionTreeClassifier().fit(X_train_ros,y_train_ros)

In [100]:
y_predict_dtc_ros = model_dtc_ros.predict(X_test)

#### Now lets evaluate the model---- Check accuracy

In [101]:
accuracy_dtc_ros = accuracy_score(y_test,y_predict_dtc_ros)

In [102]:
accuracy_dtc_ros

0.984982332155477

In [103]:
precision_dtc_ros = precision_score(y_test,y_predict_dtc_ros)

In [104]:
precision_dtc_ros

0.9552238805970149

In [105]:
recall_dtc_ros = recall_score(y_test,y_predict_dtc_ros)

In [106]:
recall_dtc_ros

0.8205128205128205

In [107]:
f1_dtc_ros = f1_score(y_test,y_predict_dtc_ros)

In [108]:
f1_dtc_ros

0.8827586206896552

# Compare the model with SMOTE over sampling technique

### Logistic Regression with SMOTE sampling

In [73]:
Logistic_Regression_smote = pd.DataFrame([accuracy_lr,precision_lr,recall_lr,f1_lr],['Accuracy','Precision','Recall','F1'], columns=['Score'] )

In [74]:
Logistic_Regression_smote

Unnamed: 0,Score
Accuracy,0.893993
Precision,0.380682
Recall,0.858974
F1,0.527559


### Decision Tree Classification with SMOTE sampling

In [94]:
Decision_Tree_smote = pd.DataFrame([accuracy_dtc,precision_dtc,recall_dtc,f1_dtc],['Accuracy','Precision','Recall','F1'], columns=['Score'] )

In [95]:
Decision_Tree_smote

Unnamed: 0,Score
Accuracy,0.984099
Precision,0.916667
Recall,0.846154
F1,0.88


# Compare the model with Random over sampling technique

### Logistic Regression with Random over sampling

In [96]:
Logistic_Regression_ros = pd.DataFrame([accuracy_lr_ros,precision_lr_ros,recall_lr_ros,f1_lr_ros],['Accuracy','Precision','Recall','F1'], columns=['Score'] )

In [97]:
Logistic_Regression_ros

Unnamed: 0,Score
Accuracy,0.908127
Precision,0.419753
Recall,0.871795
F1,0.566667


### Decision tree classification with Random over sampling

In [109]:
Decision_Tree_ros = pd.DataFrame([accuracy_dtc_ros,precision_dtc_ros,recall_dtc_ros,f1_dtc_ros],['Accuracy','Precision','Recall','F1'], columns=['Score'] )

In [110]:
Decision_Tree_ros

Unnamed: 0,Score
Accuracy,0.984982
Precision,0.955224
Recall,0.820513
F1,0.882759
