In [1]:
%load_ext autoreload
%autoreload 2

from pandas import read_excel
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier

from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids, RandomUnderSampler 

from collections import Counter

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun

In [2]:
# Load the Batch A data set and separate the target column from the features.
full_data_BatchA = pd.read_csv(
    '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/data/full_data_BatchA.csv'
)
# Feature matrix: every column except 'Label', converted to a NumPy array.
X_morgan = full_data_BatchA.drop(columns=['Label']).to_numpy()
# Target vector: the 'Label' column as a NumPy array.
y = full_data_BatchA['Label'].to_numpy()

In [3]:
# Show how many samples of each class the full Batch A label vector contains.
print(f'Initial class count in Batch A:\n {Counter(y)}')

Initial class count in Batch A:
 Counter({0: 5120, 1: 51})


In [5]:
# Hold out 20% of the samples for testing. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_morgan,
    y,
    test_size=0.2,
    stratify=y,
    random_state=6752,
)
# Report the resulting class balance of each split.
print(f'Class count in TRAIN: {Counter(y_train)}')
print(f'Class count in TEST: {Counter(y_test)}')

Class count in TRAIN: Counter({0: 4095, 1: 41})
Class count in TEST: Counter({0: 1025, 1: 10})


Example of combining over-sampling (SMOTEN) and random under-sampling for imbalanced data

In [None]:
# Class balance of the training split before any resampling.
print(f'Initial class count in TRAIN: {Counter(y_train)}')
# Over-sample the minority class with SMOTEN. sampling_strategy=0.1 requests a
# minority:majority ratio of 1:10 after resampling.
over_sampler = SMOTEN(
    sampling_strategy=0.1,
    n_jobs=-1,
    random_state=0,
)
# Fit the sampler on the training split and return the resampled copy.
x_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
# Class balance after over-sampling.
print(f'Class count in TRAIN after over-sampling: {Counter(y_train_over)}')

In [None]:
# Display the first row of the over-sampled feature matrix.
x_train_over[0]

In [None]:
# Randomly under-sample the majority class of the already over-sampled data.
# sampling_strategy=0.9 requests a minority:majority ratio of 0.9 afterwards.
under_sampler = RandomUnderSampler(
    sampling_strategy=0.9,
    random_state=42,
)
# Fit the sampler on the over-sampled training data and return the reduced copy.
x_train_under, y_train_under = under_sampler.fit_resample(
    x_train_over,
    y_train_over,
)
# Class balance after under-sampling.
print(f'Class count in TRAIN after under-sampling: {Counter(y_train_under)}')

In [None]:
# Display the first row of the under-sampled feature matrix.
x_train_under[0]

In [None]:
# Count how many rows of the under-sampled matrix are identical to the first
# row of the over-sampled matrix.
# The comparison is broadcast across every row of x_train_under, so the count
# always covers the whole array instead of relying on a hard-coded row limit
# (which would skip rows or raise IndexError if the array size changed).
matches_first_row = (x_train_under == x_train_over[0]).all(axis=1)
counter = int(np.count_nonzero(matches_first_row))

print(counter)

### Experiment again...

First, over-sample only the minority class.

In [7]:
# Over-sampling-only experiment: for each over-sampling ratio, resample the
# training split with SMOTEN, fit the AdaBoost model on the resampled data and
# record its F1 score on the untouched test split.
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), n_estimators = 100, random_state = 84723)
# One F1 score per over-sampling ratio, in iteration order.
over_samp_matrix = []
print(f'Initial class count in TRAIN: {Counter(y_train)}')

# enumerate() supplies the row index, so the "Row" label advances with each
# ratio instead of staying at 0.
# NOTE: np.arange with a float step is subject to floating-point rounding;
# with these bounds it yields the single ratio 0.1.
for counter, o in enumerate(np.arange(0.1, 0.2, 0.1)):

    print(f'************ Row {counter}')

    # Over-sample the minority class up to a minority:majority ratio of `o`.
    over_sampler = SMOTEN(sampling_strategy = o, n_jobs= -1, random_state=0)
    # Fit the sampler and obtain the resampled training data.
    x_train_re_ov, y_train_re_ov = over_sampler.fit_resample(X_train, y_train)

    # Summarize the class distribution after resampling.
    print('Re-sampling with (o = {}, u = 0). Final count: {}'.format(o, Counter(y_train_re_ov)))
    # Fewer unique rows than samples means the resampled set contains duplicates.
    if len(y_train_re_ov) != len(np.unique(x_train_re_ov, axis=0)):
        print("Repetition")

    # Train on the resampled data; evaluate on the original test split.
    clf.fit(x_train_re_ov, y_train_re_ov)
    y_predicted_re_ov = clf.predict(X_test)
    score = f1_score(y_test, y_predicted_re_ov)
    print(f'---Score: {score}')

    over_samp_matrix.append(score)

Initial class count in TRAIN: Counter({0: 4095, 1: 41})
************ Row 0
Re-sampling with (o = 0.1, u = 0). Final count: Counter({0: 4095, 1: 409})
Repetition
---Score: 0.0


In [11]:
# Number of distinct rows in the full feature matrix (axis=0 makes np.unique
# compare whole rows rather than individual values).
len(np.unique(X_morgan, axis=0))

5137

In [None]:
# IPython help lookup for np.unique; an interactive aid that produces no data.
?np.unique

In [None]:
# Count how many rows at indices 4..7779 are identical to row 3 of x_train_re.
# NOTE(review): `x_train_re` is not assigned in any cell visible here (the
# resampled matrices above are named x_train_re_ov / x_train_re_un) - confirm
# which array is meant before running on a fresh kernel.
# NOTE(review): the upper bound 7780 is hard-coded; presumably it was the row
# count of x_train_re at the time - verify, or derive it from the array length.
rep = 0

for i in np.arange(4,7780):
    # .all() is True only when every feature of the two rows matches.
    if (x_train_re[3]==x_train_re[i]).all():
        rep+=1

print(rep)

In [None]:
# Under-sampling-only experiment: for each under-sampling ratio, randomly
# reduce the majority class of the training split, refit the model and record
# its F1 score on the untouched test split.
# Relies on `clf`, the AdaBoost model created in the over-sampling experiment
# cell; fit() is called again for every ratio.
# One F1 score per under-sampling ratio, in iteration order.
under_samp_matrix = []
print(f'Initial class count in TRAIN: {Counter(y_train)}')

# enumerate() supplies the row index, so the "Row" label advances with each
# ratio instead of staying at 0.
# NOTE: np.arange with a float step is subject to floating-point rounding, so
# the ratios (and the values printed below) may differ slightly from exact
# multiples of 0.1.
for counter, u in enumerate(np.arange(1.0, 0.0, -0.1)):

    print(f'************ Row {counter}')

    # Under-sample the majority class to a minority:majority ratio of `u`.
    under_sampler = RandomUnderSampler(sampling_strategy = u ,random_state=42)
    # Fit the sampler and obtain the resampled training data.
    x_train_re_un, y_train_re_un = under_sampler.fit_resample(X_train, y_train)

    # Summarize the class distribution after resampling.
    print('Re-sampling with (o = 0, u = {}). Final count: {}'.format(u, Counter(y_train_re_un)))

    # Train on the resampled data; evaluate on the original test split.
    clf.fit(x_train_re_un, y_train_re_un)
    y_predicted_re_un = clf.predict(X_test)
    score = f1_score(y_test, y_predicted_re_un)
    print(f'---Score: {score}')

    under_samp_matrix.append(score)