In [1]:
%load_ext autoreload
%autoreload 2

from pandas import read_excel
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.cluster import KMeans

from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids

from collections import Counter

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun

In [2]:
# Read the Batch A table, then separate the feature columns from the 'Label' target
full_data_BatchA = pd.read_csv('/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/data/full_data_BatchA.csv')
X_morgan = full_data_BatchA.drop(columns=['Label']).to_numpy()   # every column except 'Label'
y = full_data_BatchA['Label'].to_numpy()                         # target vector

Messy part

In [None]:
# 80:20 train/test split with a fixed seed (no stratification in this cell)
X_train, X_test, y_train, y_test = train_test_split(
    X_morgan, y, test_size=0.2, random_state=0
)

# Report the shape of each of the four resulting arrays
for part_name, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Number transactions {part_name} dataset: ", part.shape)

In [None]:
# Baseline: logistic regression fitted on the training split as-is (no resampling)
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

# Predict the held-out split and print the per-class report
predictions = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Class counts of the training labels before any resampling
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Over-sample the minority class with SMOTEN.
# sampling_strategy=0.1 is the target minority/majority ratio after resampling
# (the minority class ends up with ~10% as many samples as the majority class),
# not a percentage of extra samples.
# Fix: the original call was missing the comma after `sampling_strategy = .1`,
# which made the whole cell a SyntaxError.
sampler = SMOTEN(sampling_strategy = 0.1, n_jobs= -1, random_state=0)
X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

# Class counts after over-sampling
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

In [None]:
# Same logistic-regression setup, fitted on the resampled training data
# (X_train_res / y_train_res as left by the preceding resampling cell)
lr1 = LogisticRegression()
lr1.fit(X=X_train_res, y=y_train_res.ravel())

# Evaluate on the untouched test split
predictions1 = lr1.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions1))

In [None]:
# Class counts of the training labels before under-sampling.
# Fix: the messages used to say "OverSampling" although this cell under-samples
# with ClusterCentroids (imported from imblearn.under_sampling).
print("Before UnderSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before UnderSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Under-sample with ClusterCentroids using its default sampling strategy.
# Note: this overwrites X_train_res / y_train_res from any earlier resampling cell.
cc = ClusterCentroids(random_state=42)
X_train_res, y_train_res = cc.fit_resample(X_train, y_train)

# Class counts after under-sampling
print("After UnderSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After UnderSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

In [None]:
# Logistic regression fitted on the current resampled training data
# (X_train_res / y_train_res as left by the preceding resampling cell)
lr2 = LogisticRegression()
lr2.fit(X=X_train_res, y=y_train_res.ravel())

# Evaluate on the untouched test split
predictions2 = lr2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions2))

### Let's create a basic experiment to select the rates of undersampling and oversampling

First, I want to split the positive points in a uniform way, i.e. with a stratified split so that the train and test sets keep the same class proportions.

In [3]:
# Tally how many samples of each label the full Batch A label vector contains
batch_a_class_counts = Counter(y)
print(f'Initial class count in Batch A:\n {batch_a_class_counts}')

Initial class count in Batch A:
 Counter({0: 5120, 1: 51})


In [4]:
# Stratified 80:20 split: passing stratify=y asks for the same class
# proportions in the train and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    X_morgan,
    y,
    test_size=0.2,
    stratify=y,
    random_state=6752,
)

# Report the class balance of each partition
for split_name, split_labels in (('TRAIN', y_train), ('TEST', y_test)):
    print(f'Class count in {split_name}: {Counter(split_labels)}')

Class count in TRAIN: Counter({0: 4095, 1: 41})
Class count in TEST: Counter({0: 1025, 1: 10})


Example of combining random oversampling and undersampling for imbalanced data

In [5]:
print(f'Initial class count in TRAIN: {Counter(y_train)}')

# Step 1: SMOTEN over-sampling of the training split (sampling_strategy 0.1)
over_sampler = SMOTEN(sampling_strategy=0.1, n_jobs=-1, random_state=0)
x_train_re, y_train_re = over_sampler.fit_resample(X_train, y_train)
count_after_over = Counter(y_train_re)
print(f'Class count in TRAIN after over-sampling: {count_after_over}')

# Step 2: ClusterCentroids under-sampling (sampling_strategy 0.5), applied to the
# output of step 1; x_train_re / y_train_re are overwritten with the final result
under_sampler = ClusterCentroids(sampling_strategy=0.5, random_state=42)
x_train_re, y_train_re = under_sampler.fit_resample(x_train_re, y_train_re)
count_after_under = Counter(y_train_re)
print(f'Class count in TRAIN after under-sampling: {count_after_under}')

Initial class count in TRAIN: Counter({0: 4095, 1: 41})
Class count in TRAIN after over-sampling: Counter({0: 4095, 1: 409})
Class count in TRAIN after under-sampling: Counter({0: 818, 1: 409})


In [6]:
# Baseline: AdaBoost (100 estimators, fixed seed) fitted on the original training split
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X=X_train, y=y_train)

# Score the held-out split with F1
y_predicted = clf.predict(X_test)
baseline_f1 = f1_score(y_test, y_predicted)
print(f'F1-score: {baseline_f1}')

F1-score: 0.0


In [7]:
# Same AdaBoost configuration, fitted on the resampled training data
# (x_train_re / y_train_re); note that `clf` is rebound to this new model
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X=x_train_re, y=y_train_re)

# Score the untouched test split with F1
y_predicted_re = clf.predict(X_test)
resampled_f1 = f1_score(y_test, y_predicted_re)
print(f'F1-score: {resampled_f1}')

F1-score: 0.020689655172413793


Wow!! We managed to increase the F1-score by about 2 percentage points (from 0.0 to ≈0.021)!!!

In [None]:
print(f'Initial class count in TRAIN: {Counter(y_train)}')

# Grid of target minority/majority ratios, shared by the over- and under-sampling steps
sampling_ratios = np.arange(0.1, 1.1, 0.1)

for o in sampling_ratios:
    # Over-sample the ORIGINAL training split up to ratio `o`.
    # The result gets its own names so the inner loop cannot overwrite it.
    over_sampler = SMOTEN(sampling_strategy = o, n_jobs= -1, random_state=0)
    x_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
    # summarize class distribution
    print(f'Class count in TRAIN after over-sampling: {Counter(y_train_over)}')

    # Under-sampling only removes majority samples, so it can only raise the
    # minority/majority ratio above `o`. Targets below `o` are unreachable
    # (imbalanced-learn is expected to reject them with a ValueError), so they
    # are skipped instead of aborting the whole sweep.
    for u in sampling_ratios[sampling_ratios >= o]:
        under_sampler = ClusterCentroids(sampling_strategy = u, random_state = 42)
        # Fix: always resample the over-sampled data for this `o`. The original
        # fed the previous iteration's under-sampled output back in, so every
        # (o, u) pair after the first worked on already-reduced centroids.
        x_train_re, y_train_re = under_sampler.fit_resample(x_train_over, y_train_over)
        # summarize class distribution
        print(f'Class count in TRAIN after under-sampling: {Counter(y_train_re)}')

**TODO (next step):** Add the model training inside the loop and store the scores in a matrix, so that they can be plotted as a heatmap afterwards.

In [None]:
# AdaBoost (100 estimators, fixed seed) fitted on whatever x_train_re / y_train_re
# currently hold, i.e. the output of the most recently executed resampling cell
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X=x_train_re, y=y_train_re)

# Score the untouched test split with F1
y_predicted_re = clf.predict(X_test)
resampled_f1 = f1_score(y_test, y_predicted_re)
print(f'F1-score: {resampled_f1}')

In [None]:
# Preview the grid of sampling ratios used in the over-/under-sampling sweep
np.arange(start=0.1, stop=1.1, step=0.1)