https://www.geeksforgeeks.org/ml-handling-imbalanced-data-with-smote-and-near-miss-algorithm-in-python/

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
!pip install imbalanced-learn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image  
from six import StringIO  
from sklearn.tree import export_graphviz



In [61]:
#  Read the CSV and Perform Basic Data Cleaning
data = pd.read_csv('../data/combined_wine.csv', sep=',')
# Drop the null columns where all values are null
data = data.dropna(axis='columns', how='all')
# Drop the null rows
data = data.dropna()

#the n_neighbors error is happening because there are so few "9" class members - elimintating them should remove the
#problem (just as proof of concept, we want these points)

data = data.drop(data[(data.quality == 9)].index)



data.groupby(["quality"]).count()

#no quality = 9 row now

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
quality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,30,30,30,30,30,30,30,30,30,30,30,30
4,216,216,216,216,216,216,216,216,216,216,216,216
5,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
6,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
7,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
8,193,193,193,193,193,193,193,193,193,193,193,193


In [62]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6492 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6492 non-null   float64
 1   volatile acidity      6492 non-null   float64
 2   citric acid           6492 non-null   float64
 3   residual sugar        6492 non-null   float64
 4   chlorides             6492 non-null   float64
 5   free sulfur dioxide   6492 non-null   float64
 6   total sulfur dioxide  6492 non-null   float64
 7   density               6492 non-null   float64
 8   pH                    6492 non-null   float64
 9   sulphates             6492 non-null   float64
 10  alcohol               6492 non-null   float64
 11  quality               6492 non-null   int64  
 12  type                  6492 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 710.1 KB
None


In [63]:
# normalise the quality column 
data['normquality'] = StandardScaler().fit_transform(np.array(data['quality']).reshape(-1, 1)) 
# drop irrevelent columns as they are not relevant for prediction purpose  
# df = df.drop(['fixed acidity', 'citric acid', 'residual sugar', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates'], axis = 1) 
  
# as you can see there are 492 fraud transactions. 
data['type'].value_counts() 

0    4893
1    1599
Name: type, dtype: int64

In [64]:
X = data.drop(columns=["quality"])
y = data["quality"]
print(X.shape, y.shape)

(6492, 13) (6492,)


In [65]:
# Let's split the data into training and testing sets:
X_train = data.values[0:5000]
X_test = data.values[5000:]

In [66]:
from sklearn.model_selection import train_test_split

# split into 70:30 ration
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# describes info about train and test set
print("X_train dataset: ", X_train.shape) 
print("y_train dataset: ", y_train.shape) 
print("X_test dataset: ", X_test.shape) 
print("y_test dataset: ", y_test.shape) 

X_train dataset:  (4544, 13)
y_train dataset:  (4544,)
X_test dataset:  (1948, 13)
y_test dataset:  (1948,)


In [67]:
# logistic regression object 
lr = LogisticRegression(max_iter=10000) 
  
# train the model on train set 
lr.fit(X_train, y_train.ravel()) 
  
predictions = lr.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, predictions)) 

              precision    recall  f1-score   support

           3       1.00      0.10      0.18        10
           4       0.89      1.00      0.94        70
           5       1.00      1.00      1.00       617
           6       1.00      1.00      1.00       879
           7       1.00      1.00      1.00       319
           8       1.00      1.00      1.00        53

    accuracy                           1.00      1948
   macro avg       0.98      0.85      0.85      1948
weighted avg       1.00      1.00      0.99      1948



In [26]:
!pip install mlrose

Collecting mlrose
  Downloading mlrose-1.3.0-py3-none-any.whl (27 kB)
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1320 sha256=898dc9853637cdab0e20cce5b10e93df3ea60963f08a05ab9ce3292030cae517
  Stored in directory: c:\users\ptsai\appdata\local\pip\cache\wheels\22\0b\40\fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: sklearn, mlrose
Successfully installed mlrose-1.3.0 sklearn-0.0


In [68]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 
  
# import SMOTE module from imblearn library 
#!pip install mlrose (done in cell above), six
#!pip instal imblearn done in first cell
from imblearn.over_sampling import SMOTE
import six
import sys
sys.modules['sklearn.externals.six'] = six
from six import StringIO
import mlrose

sm = SMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel()) 
  
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0))) 

Before OverSampling, counts of label '1': 0
Before OverSampling, counts of label '0': 0 

After OverSampling, the shape of train_X: (11742, 13)
After OverSampling, the shape of train_y: (11742,) 

After OverSampling, counts of label '1': 0
After OverSampling, counts of label '0': 0


In [None]:
lr1 = LogisticRegression(max_iter=10000) 
lr1.fit(X_train_res, y_train_res.ravel()) 
predictions = lr1.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, predictions)) 