# Imbalanced Data

In [None]:
'''
How to deal with imbalanced data

(1) Use the right evaluation metrics: 

    Evaluating a model trained on imbalanced data with an inappropriate metric can be dangerous. 
    Accuracy is not the metric to use when working with an imbalanced dataset: a model that always predicts the majority class still gets a high accuracy, so the score is misleading.

    There are metrics that have been designed to tell you a more truthful story when working with imbalanced classes.

    In the case of imbalanced datasets, alternative evaluation metrics can be applied, such as:

        Precision
        Recall
        F1 score
        AUC (also known as "roc_auc" in sklearn)
    
AND you can use one of the following (or none of them):
    
(2) Resampling your data, i.e. downsampling the majority class or oversampling the minority class: (not always the best option)
    You can change the dataset that you use to build your predictive model so that its classes are more balanced. See the code below.

(3) Use, when available, the parameter class_weight = 'balanced' of sklearn (not always the best option).
    
'''

In [None]:
# Load the dataset from disk into a DataFrame, then preview its first rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer='diabetes.csv')
df.head(n=5)

In [None]:
#Print class freq. through pandas 
print(df.groupby('target').size()) #"target" is the name of the target column, change it accordingly to your dataset

#some imports to plot 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

#Visualize Class Counts
sns.countplot(y=df.target ,data=df) #"target" is the name of the target column, change it accordingly to your dataset
plt.xlabel("count of each class")
plt.ylabel("classes")
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Baseline "model": predict class 0 for every row and measure its accuracy.
# "target" is the name of the target column; change it to match your dataset.
y = df.target
y_pred = np.zeros(y.shape[0])

# Last expression of the cell, so the score is displayed as the cell output
accuracy_score(y_true=y, y_pred=y_pred)


In [None]:
# Preview the first rows of the DataFrame
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Show descriptive statistics for the DataFrame's columns
df.describe()

## Downsampling

In [None]:
from sklearn.utils import resample

# Down-sample the majority class so that both classes end up with the same number of rows.
# "target" is the name of the target column; change it (and the class labels) to match your dataset.

#1) Separate majority and minority classes (class 0 is treated as the majority class here)
df_majority = df[df.target == 0]
df_minority = df[df.target == 1]

#2) Downsample the majority class without replacement.
#   n_samples is taken from the minority class size instead of being hard-coded,
#   so the result is balanced for any dataset, not only one with 268 minority rows.
df_majority_downsampled = resample(df_majority,
                                   replace=False,                # draw distinct rows, no duplicates
                                   n_samples=len(df_minority),   # match the minority class size
                                   random_state=123)             # reproducible results

#3) Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

#4) Display new class counts (both classes should now have the same count)
df_downsampled.target.value_counts()

## Oversampling

In [None]:
from sklearn.utils import resample

# Over-sample the minority class so that both classes end up with the same number of rows.
# "target" is the name of the target column; change it (and the class labels) to match your dataset.

#1) Separate majority and minority classes (class 0 is treated as the majority class here)
df_majority = df[df.target == 0]
df_minority = df[df.target == 1]

#2) Oversample the minority class with replacement.
#   n_samples is taken from the majority class size instead of being hard-coded,
#   so the result is balanced for any dataset, not only one with 500 majority rows.
df_minority_oversampled = resample(df_minority,
                                   replace=True,                 # rows are duplicated to reach the target size
                                   n_samples=len(df_majority),   # match the majority class size
                                   random_state=123)             # reproducible results

#3) Combine oversampled minority class with majority class
df_oversampled = pd.concat([df_minority_oversampled, df_majority])

#4) Display new class counts (both classes should now have the same count)
df_oversampled.target.value_counts()