# Imbalanced Data
Using the dataset for the risk of heart attack with class imbalance:

Create a logistic regression model and measure the performance of it.
Overcome the class imbalance by experimenting with different methods and class ratios, and determine the best-performing method and class ratio.

In [1]:
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Read the dataset from the CSV file (relative path: resolved against the working directory).
DATA_FILE = 'cleveland-0_vs_4.csv'
df = pd.read_csv(DATA_FILE)

In [2]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,negative
1,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,negative
2,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,negative
3,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,negative
4,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,64.0,1.0,4.0,145.0,212.0,0.0,2.0,132.0,0.0,2.0,2.0,2.0,6.0,positive
173,38.0,1.0,1.0,120.0,231.0,0.0,0.0,182.0,1.0,3.8,2.0,0.0,7.0,positive
174,61.0,1.0,4.0,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,positive
175,58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,positive


In [3]:
# Summarise the columns: non-null counts and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177 entries, 0 to 176
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       177 non-null    float64
 1   sex       177 non-null    float64
 2   cp        177 non-null    float64
 3   trestbps  177 non-null    float64
 4   chol      177 non-null    float64
 5   fbs       177 non-null    float64
 6   restecg   177 non-null    float64
 7   thalach   177 non-null    float64
 8   exang     177 non-null    float64
 9   oldpeak   177 non-null    float64
 10  slope     177 non-null    float64
 11  ca        177 non-null    object 
 12  thal      177 non-null    object 
 13  num       177 non-null    object 
dtypes: float64(11), object(3)
memory usage: 19.5+ KB


# 1. EDA and Regression Model

In [4]:
# List the distinct raw values of the three columns that were read with dtype object.
object_columns = ["ca", "thal", "num"]
for column_name in object_columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name} Column values:\n {distinct_values}\n")

ca Column values:
 ['0.0' '2.0' '1.0' '3.0' '<null>']

thal Column values:
 ['6.0' '3.0' '7.0' '<null>']

num Column values:
 ['negative' 'positive']



In [5]:
# Encode the target labels as integers: 'negative' -> 0, then 'positive' -> 1.
df['num'] = df['num'].replace('negative', 0).replace('positive', 1)

In [6]:
# Print the rows whose 'ca' entry, then whose 'thal' entry, is the '<null>' placeholder string.
for column_name in ('ca', 'thal'):
    placeholder_mask = df[column_name] == '<null>'
    print(df.loc[placeholder_mask])

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
85   52.0  1.0  3.0     138.0  223.0  0.0      0.0    169.0    0.0      0.0   
142  58.0  1.0  2.0     125.0  220.0  0.0      0.0    144.0    0.0      0.4   
146  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0   

     slope      ca thal  num  
85     1.0  <null>  3.0    0  
142    2.0  <null>  7.0    0  
146    1.0  <null>  3.0    0  
     age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
44  53.0  0.0  3.0     128.0  216.0  0.0      2.0    115.0    0.0      0.0   

    slope   ca    thal  num  
44    1.0  0.0  <null>    0  


In [7]:
# Remove every row that carries the '<null>' placeholder in 'ca' or 'thal'.
# Filtering on the values instead of the hard-coded labels 44, 85, 142 and 146 keeps
# the cell re-runnable (df.drop raises KeyError once those labels are gone) and does
# not depend on the row order of the CSV file.
has_placeholder = (df['ca'] == '<null>') | (df['thal'] == '<null>')
# .copy() gives an independent frame so later column assignments act on df itself.
df = df.loc[~has_placeholder].copy()

In [8]:
# Convert the three columns that were read as object into numeric dtypes.
object_columns = ["ca", "thal", "num"]
df[object_columns] = df[object_columns].apply(pd.to_numeric)

In [9]:
# Re-check non-null counts and dtypes after the numeric conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 0 to 176
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       173 non-null    float64
 1   sex       173 non-null    float64
 2   cp        173 non-null    float64
 3   trestbps  173 non-null    float64
 4   chol      173 non-null    float64
 5   fbs       173 non-null    float64
 6   restecg   173 non-null    float64
 7   thalach   173 non-null    float64
 8   exang     173 non-null    float64
 9   oldpeak   173 non-null    float64
 10  slope     173 non-null    float64
 11  ca        173 non-null    float64
 12  thal      173 non-null    float64
 13  num       173 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 20.3 KB


In [10]:
# Displays a string-cast copy of the frame only: the result is not assigned,
# so df itself keeps the numeric dtypes set above.
df.astype(str)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
2,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
3,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
4,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,64.0,1.0,4.0,145.0,212.0,0.0,2.0,132.0,0.0,2.0,2.0,2.0,6.0,1
173,38.0,1.0,1.0,120.0,231.0,0.0,0.0,182.0,1.0,3.8,2.0,0.0,7.0,1
174,61.0,1.0,4.0,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,1
175,58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,1


In [11]:
# Report the share (in percent) and the absolute count of each target class.
n_rows = len(df.num)
n_positive = sum(df.num)
n_negative = n_rows - n_positive
print("Positive Ratio: %{:.2f}".format(n_positive / n_rows * 100), n_positive)
print("Negative Ratio: %{:.2f}".format(n_negative / n_rows * 100), n_negative)

Positive Ratio: %7.51 13
Negative Ratio: %92.49 160


***----This part will be discussed with the Mentor!-----***

# Features are every column except the last one; the target is the last column ('num').
# The earlier `y = df.iloc[:, 1]` picked the second column ('sex'), which is also part
# of X, so the model was predicting one of its own inputs and trivially scored 100%.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

from sklearn.model_selection import train_test_split
# Stratify on y so the few positive samples are represented in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=111, stratify=y)

# Train the model on the training split.
log_reg.fit(X_train, y_train)

# .score() returns the ratio of correct predictions (accuracy); check it on both splits.
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)

print('One-vs.-Rest', '-'*30, 
      'Accuracy on Train Data : {:.2f}'.format(train_accuracy), 
      'Accuracy on Test Data  : {:.2f}'.format(test_accuracy), sep='\n')

# Same experiment with the multinomial (softmax) formulation.
log_reg_mnm = LogisticRegression(multi_class='multinomial', solver='lbfgs')
log_reg_mnm.fit(X_train, y_train)

train_accuracy = log_reg_mnm.score(X_train, y_train)
test_accuracy = log_reg_mnm.score(X_test, y_test)

print('Multinomial (Softmax)', '-'*20, 
      'Accuracy on Train Data : {:.2f}'.format(train_accuracy), 
      'Accuracy on Test Data  : {:.2f}'.format(test_accuracy), sep='\n')

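**With `y = df.iloc[:, 1]` the results were 100% accuracy on both the train and the test data. This is a leakage artifact, not real performance: column 1 is `sex`, which is also one of the features in `X`, so the model only had to copy an input. The target must be the last column, `num`.**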

***----This part will be discussed with the Mentor!-----***

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
def create_model(X, y, test_size=0.20, random_state=111):
    """Fit a logistic regression on a stratified split and print its evaluation.

    Parameters
    ----------
    X : features, in any form accepted by ``train_test_split`` and
        ``LogisticRegression.fit``.
    y : class labels aligned with ``X``; also used to stratify the split.
    test_size : fraction of the rows held out for testing (default 0.20).
    random_state : seed of the train/test split (default 111).

    Returns
    -------
    None. The test accuracy and the classification reports of the train and
    test splits are printed.

    Notes
    -----
    The split is performed inside this function on whatever ``X``/``y`` are
    passed in, so any resampling applied beforehand also affects the test rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    logreg_model = LogisticRegression()
    logreg_model.fit(X_train, y_train)

    pred_train = logreg_model.predict(X_train)
    pred_test = logreg_model.predict(X_test)

    # score() on the held-out split is the test accuracy.
    print("Accuracy : {}\n".format(logreg_model.score(X_test, y_test)))

    print("Train Dataset")
    print(classification_report(y_train, pred_train))

    print("Test Dataset")
    print(classification_report(y_test, pred_test))

    return None

In [13]:
# Baseline: evaluate the model on the original, imbalanced data.
y = df['num']
X = df.drop(columns='num')
create_model(X, y)

Accuracy : 0.9428571428571428

Train Dataset
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       128
           1       1.00      0.60      0.75        10

    accuracy                           0.97       138
   macro avg       0.98      0.80      0.87       138
weighted avg       0.97      0.97      0.97       138

Test Dataset
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        32
           1       1.00      0.33      0.50         3

    accuracy                           0.94        35
   macro avg       0.97      0.67      0.73        35
weighted avg       0.95      0.94      0.93        35



### Train Dataset
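* Class 0 (negative): 97% of the samples predicted as negative are actually negative (precision), and 100% of the actual negatives were predicted as negative (recall). F1-score: 98%.
* Class 1 (positive): 100% of the samples predicted as positive are actually positive (precision), but only 60% of the actual positives were predicted as positive (recall). F1-score: 75%.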

### Test Dataset
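* Class 0 (negative): 94% of the samples predicted as negative are actually negative (precision), and 100% of the actual negatives were predicted as negative (recall). F1-score: 97%.
* Class 1 (positive): 100% of the samples predicted as positive are actually positive (precision), but only 33% of the actual positives were predicted as positive (recall). F1-score: 50%.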

# 2. Resampling Dataset

### 2.1. Up-Sampling

In [14]:
from sklearn.utils import resample

In [15]:
# Split the rows by class.
positive = df[df['num'] == 1]
negative = df[df['num'] == 0]

# Draw positive rows with replacement until they match the number of negative rows.
n_negative = len(negative)
positive_upsampled = resample(positive, replace=True, n_samples=n_negative, random_state=111)

# Stack both classes and show the resulting class counts.
upsampled_df = pd.concat([negative, positive_upsampled])
upsampled_df['num'].value_counts()

1    160
0    160
Name: num, dtype: int64

In [16]:
# Evaluate the model on the up-sampled data.
y = upsampled_df['num']
X = upsampled_df.drop(columns='num')
create_model(X, y)

Accuracy : 0.953125

Train Dataset
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       128
           1       0.95      1.00      0.97       128

    accuracy                           0.97       256
   macro avg       0.97      0.97      0.97       256
weighted avg       0.97      0.97      0.97       256

Test Dataset
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        32
           1       0.91      1.00      0.96        32

    accuracy                           0.95        64
   macro avg       0.96      0.95      0.95        64
weighted avg       0.96      0.95      0.95        64



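* After up-sampling, most of the test-set metrics improve (e.g. recall for class 1 rises from 0.33 to 1.00).
* However, the test scores are slightly lower than the train scores (e.g. class 0 recall is 0.91 on test vs. 0.95 on train), which hints at mild overfitting. Note also that the up-sampling was done before the train/test split, so copies of the same positive rows can appear in both sets and make the test scores look better than they really are.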

### 2.2. Down-Sampling

In [17]:
negative = df[df.num == 0]
positive = df[df.num == 1]

# Down-sample the majority (negative) class to the size of the minority class.
# replace=False draws distinct rows; with replace=True the same negative row could be
# drawn several times, shrinking the already tiny sample even further.
negative_downsampled = resample(negative,
                                replace = False,
                                n_samples = len(positive),
                                random_state = 111)

# Stack both classes and show the resulting class counts.
downsampled_df = pd.concat([positive, negative_downsampled])
downsampled_df.num.value_counts()

1    13
0    13
Name: num, dtype: int64

In [18]:
# Evaluate the model on the down-sampled data.
y = downsampled_df['num']
X = downsampled_df.drop(columns='num')

create_model(X, y)

Accuracy : 1.0

Train Dataset
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Test Dataset
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



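With only 13 samples per class (20 training and 6 test rows in total), the perfect scores are not reliable: the model is biased by the tiny sample!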

## 3. Producing Synthetic Samples

### 3.1. SMOTE

In [20]:
from imblearn.over_sampling import SMOTE

# Separate features and target from the cleaned frame.
X = df.drop(columns='num')
y = df['num']

# Over-sample with SMOTE (sampling_strategy=1.0, fixed seed for reproducibility).
sm = SMOTE(sampling_strategy=1.0, random_state=27)
X_smote, y_smote = sm.fit_resample(X, y)

In [21]:
# Evaluate the model on the SMOTE-resampled data.
# NOTE(review): the resampling was fitted on the full dataset before create_model
# performs its train/test split, so the test rows include synthetic samples
# generated from the whole dataset — treat the test scores as optimistic.
create_model(X_smote, y_smote)

Accuracy : 0.953125

Train Dataset
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       128
           1       0.96      1.00      0.98       128

    accuracy                           0.98       256
   macro avg       0.98      0.98      0.98       256
weighted avg       0.98      0.98      0.98       256

Test Dataset
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        32
           1       0.94      0.97      0.95        32

    accuracy                           0.95        64
   macro avg       0.95      0.95      0.95        64
weighted avg       0.95      0.95      0.95        64



### 3.2. ADASYN

In [22]:
from imblearn.over_sampling import ADASYN

In [24]:
y = df.num
X = df.drop('num', axis=1)

# ADASYN generates samples randomly; seed it (same seed as the SMOTE cell) so that
# re-running the notebook reproduces the same resampled data and scores.
ad = ADASYN(random_state=27)
X_adasyn, y_adasyn = ad.fit_resample(X, y)

In [25]:
# Evaluate the model on the ADASYN-resampled data.
# NOTE(review): the resampling was fitted on the full dataset before create_model
# performs its train/test split, so the test rows include synthetic samples
# generated from the whole dataset — treat the test scores as optimistic.
create_model(X_adasyn, y_adasyn)

Accuracy : 0.96875

Train Dataset
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       128
           1       0.97      1.00      0.98       127

    accuracy                           0.98       255
   macro avg       0.98      0.98      0.98       255
weighted avg       0.98      0.98      0.98       255

Test Dataset
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        32
           1       0.94      1.00      0.97        32

    accuracy                           0.97        64
   macro avg       0.97      0.97      0.97        64
weighted avg       0.97      0.97      0.97        64



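**SMOTE reached the same test accuracy (0.95) as up-sampling with scikit-learn's `resample`, but with more balanced precision and recall across the two classes; ADASYN gave the highest test accuracy (0.97).**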