In [1]:
%load_ext autoreload
%autoreload 2

from pandas import read_excel
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids

from sklearn.cluster import KMeans

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun

In [2]:
# Load the Batch A table, then separate the target column from the features.
full_data_BatchA = pd.read_csv('/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/data/full_data_BatchA.csv')

# Every column except 'Label' is used as a feature
# (presumably Morgan fingerprint bits, going by the variable name -- TODO confirm).
X_morgan = full_data_BatchA.drop(columns=['Label']).to_numpy()
y = full_data_BatchA['Label'].to_numpy()

In [None]:
# Hold out 20% of the samples for testing (80:20 ratio, fixed seed).
# Note: no `stratify` argument is passed here, so the class proportions of the
# two splits are left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_morgan,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the shape of every piece of the split.
split_report = (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
)
for message, part in split_report:
    print(message, part.shape)

In [None]:
# Baseline: logistic regression (default settings) fitted on the training split
# exactly as produced by train_test_split, i.e. without any resampling.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

# Predict the held-out split and show per-class precision / recall / F1.
predictions = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Class counts in the training split before resampling.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Over-sample the minority class with SMOTEN. With a float sampling_strategy the
# minority class is grown until (minority count / majority count) == 0.1 after
# resampling; the majority class is left untouched.
# Fix: the original call was missing the comma after the sampling_strategy
# argument, which made the whole cell a SyntaxError.
sampler = SMOTEN(sampling_strategy=0.1, n_jobs=-1, random_state=0)
X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)

# Class counts after resampling.
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

In [None]:
# Same logistic regression as the baseline, but fitted on the resampled
# training data held in X_train_res / y_train_res.
lr1 = LogisticRegression()
lr1.fit(X=X_train_res, y=y_train_res.ravel())

# Evaluate on the untouched test split so the report stays comparable.
predictions1 = lr1.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions1))

In [None]:
# Half of the label-0 count in the training split, truncated to an integer.
int(sum(y_train == 0) * 0.5)

In [None]:
# Class counts in the training split before resampling.
# Fix: this cell under-samples, so the messages no longer say "OverSampling".
print("Before UnderSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before UnderSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Under-sample the majority class (label 0) to 50% of its size by replacing its
# samples with K-Means cluster centroids.
# Fix: ClusterCentroids sets `n_clusters` on its estimator from
# `sampling_strategy`, so a value given only to KMeans(n_clusters=...) is
# overridden and the default 'auto' strategy shrinks class 0 all the way down to
# the minority-class size. The target count therefore has to be passed through
# `sampling_strategy` (dict: class label -> desired number of samples).
n_majority_target = int(sum(y_train == 0) * (50 / 100))
kmeans = KMeans(n_clusters=n_majority_target, random_state=0)
cc = ClusterCentroids(
    sampling_strategy={0: n_majority_target},
    random_state=42,
    estimator=kmeans,
)
X_train_res, y_train_res = cc.fit_resample(X_train, y_train)

# Class counts after resampling.
print("After UnderSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After UnderSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

In [None]:
# Logistic regression fitted on whatever X_train_res / y_train_res currently
# hold (the preceding resampling cell assigns them).
lr2 = LogisticRegression()
lr2.fit(X=X_train_res, y=y_train_res.ravel())

# Evaluate on the untouched test split and show the per-class report.
predictions2 = lr2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions2))

### Let's create a basic experiment to select the rates of undersampling and oversampling

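First, I want to split the positive points uniformly between the train and test sets, i.e. use a stratified split so that both sets keep the same proportion of positives.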

In [3]:
# Class balance of the complete label vector, before any splitting.
n_positive = sum(y == 1)
n_negative = sum(y == 0)
print("We have {} counts of label '1' in the whole dataset".format(n_positive))
print("We have {} counts of label '0' in the whole dataset".format(n_negative))

We have 51 counts of label '1' in the whole dataset
We have 5120 counts of label '0' in the whole dataset


In [4]:
# 80:20 split stratified on the labels, so train and test keep the same share
# of positives; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_morgan,
    y,
    test_size=0.2,
    stratify=y,
    random_state=6752,
)

# Report how many positives landed in each split.
positives_report = (
    ("Now we have {} counts of label '1' in the train dataset", y_train),
    ("Now we have {} counts of label '1' in the test dataset", y_test),
)
for message, labels in positives_report:
    print(message.format(sum(labels == 1)))

Now we have 41 counts of label '1' in the train dataset
Now we have 10 counts of label '1' in the test dataset


In [5]:
# example of combining random oversampling and undersampling for imbalanced data
from collections import Counter
print(Counter(y_train))
# define oversampling strategy
over = SMOTEN(sampling_strategy = 0.1, n_jobs= -1, random_state=0)
# fit and apply the transform
X, y = over.fit_resample(X_train, y_train)
# summarize class distribution
print(Counter(y))
# define undersampling strategy
under = ClusterCentroids(sampling_strategy = 0.5, random_state = 42)
# fit and apply the transform
X, y = under.fit_resample(X, y)
# summarize class distribution
print(Counter(y))

Counter({0: 4095, 1: 41})
Counter({0: 4095, 1: 409})
Counter({0: 818, 1: 409})


In [6]:
lr2 = LogisticRegression()
lr2.fit(X, y.ravel())
predictions2 = lr2.predict(X_test)
  
# print classification report
print(classification_report(y_test, predictions2))

              precision    recall  f1-score   support

           0       0.99      0.94      0.97      1025
           1       0.03      0.20      0.06        10

    accuracy                           0.94      1035
   macro avg       0.51      0.57      0.51      1035
weighted avg       0.98      0.94      0.96      1035



In [7]:
# F1 score of predictions2 against the test labels (f1_score called with its defaults).
test_f1 = f1_score(y_true=y_test, y_pred=predictions2)
print(test_f1)

0.05714285714285715
