# Imbalanced Dataset Machine Learning
This types of datasets does not heavily impact ensemble techniques or aloriths where decision trees are on count

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('creditcard.csv')

In [3]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [4]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [5]:
x = df.drop('Class',axis=1)

In [6]:
y = df.Class

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold,GridSearchCV
import numpy as np

In [9]:
log_class = LogisticRegression()
grid = {'C':10.0**np.arange(-2,3),'penalty':['l1','l2']}
cv = KFold(n_splits = 5,random_state = None,shuffle = False)

In [11]:
clf = GridSearchCV(log_class,grid,cv = cv,n_jobs = -1,scoring = 'f1_macro')

In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.7)
clf.fit(x_train,y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [13]:
y_pred = clf.predict(x_test)

In [14]:
confusion_matrix(y_test,y_pred)

array([[85261,    32],
       [   60,    90]], dtype=int64)

In [15]:
accuracy_score(y_test,y_pred)

0.9989232587807076

In [17]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85293
           1       0.74      0.60      0.66       150

    accuracy                           1.00     85443
   macro avg       0.87      0.80      0.83     85443
weighted avg       1.00      1.00      1.00     85443



In [20]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(x_train,y_train)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))


[[85281    12]
 [   31   119]]
0.9994967405170698


In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85293
           1       0.91      0.79      0.85       150

    accuracy                           1.00     85443
   macro avg       0.95      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



## Under Sampling
Reduceing points of maximum labels <br>
This will in turn hamper the dataset and will not perform good

In [32]:
from Collections import Counter

ModuleNotFoundError: No module named 'Collections'

In [34]:

from imblearn.under_sampling import NearMiss
ns = NearMiss(0.80)
x_train_ns,y_train_ns = ns.fit_sample(x_train,y_train)
# print(" The number of classes before fit {}".format((Counter(y_train))))
# print(" The number of classes after fit {}".format((Counter(y_train_ns))))

In [35]:
classifier.fit(x_train_ns,y_train_ns)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[67303 17990]
 [    7   143]]
0.7893683508303782
              precision    recall  f1-score   support

           0       1.00      0.79      0.88     85293
           1       0.01      0.95      0.02       150

    accuracy                           0.79     85443
   macro avg       0.50      0.87      0.45     85443
weighted avg       1.00      0.79      0.88     85443



## Over Sampling
We increase the number of data points for the label having less number of data

In [37]:
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler(0.5)
x_train_os,y_train_os = os.fit_sample(x_train,y_train)
# print(" The number of classes before fit {}".format((Counter(y_train))))
# print(" The number of classes after fit {}".format((Counter(y_train_os))))


In [38]:
classifier.fit(x_train_os,y_train_os)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85280    13]
 [   28   122]]
0.9995201479348805
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85293
           1       0.90      0.81      0.86       150

    accuracy                           1.00     85443
   macro avg       0.95      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



## SMOTETomek
Includes both over and under sampling

In [43]:
from imblearn.combine import SMOTETomek

In [None]:
sm = SMOTETomek(0.75)
x_train_sm,y_train_sm = sm.fit_sample(x_train,y_train)

In [None]:
classifier.fit(x_train_sm,y_train_sm)
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

## Ensemble Techniques