In [1]:
%load_ext autoreload
%autoreload 2

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score, mean_squared_log_error, recall_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#-------------ACTIVE LEARNING LIBRARY
from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

#------------IMBALANCED DATA SETS LIBRARY
from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids, RandomUnderSampler 

from collections import Counter

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun
import reg_icp as ricp
import emcm 

In [2]:
# Read the Batch A regression table from disk and report its row count.
batch_a_csv = '/home/jovyan/Thesis_ActLearn_DOP_2022/main/supervised_learning/regression_data_batchA.csv'
cell_profiler = pd.read_csv(batch_a_csv)
print(f'There are {len(cell_profiler)} rows in the Batch A')

There are 5167 rows in the Batch A


In [3]:
# Selecting features according to the paper
def _channel_features(columns, channel):
    """Return the Granularity/Intensity column names that mention ``channel``.

    Parameters
    ----------
    columns : iterable of str
        Candidate column names, scanned in order.
    channel : str
        Substring identifying the channel of interest (e.g. 'SYTO' or 'CONC').

    Returns
    -------
    list of str
        Names containing ``channel`` and either 'Granu' or 'Inten', but
        neither 'Location' nor 'Radial'. Input order is preserved.

    Notes
    -----
    The previous inline condition was written as
    ``A or B and not C and not D``. Because ``and`` binds tighter than ``or``
    this was evaluated as ``A or (B and not C and not D)``, so the
    'Location'/'Radial' exclusions were silently skipped for the Granularity
    branch. The exclusions are now applied to both branches.
    """
    return [
        col for col in columns
        if channel in col
        and ('Granu' in col or 'Inten' in col)
        and 'Location' not in col
        and 'Radial' not in col
    ]


group1 = _channel_features(cell_profiler.columns, 'SYTO')
group2 = _channel_features(cell_profiler.columns, 'CONC')
# Correlation and neighbour features are kept regardless of channel.
group3 = [col for col in cell_profiler.columns if 'Correla' in col or 'Neig' in col]

filtered_features = group1 + group2 + group3

# Filtering: keep the selected features, with the regression target as the last column.
filtered_cell_profiler = cell_profiler[filtered_features + ['Target']]

In [9]:
# Output values
X_filtered = filtered_cell_profiler.drop(['Target'], axis = 1).to_numpy()
y = 1000*filtered_cell_profiler.iloc[:,-1].to_numpy()

In [10]:
# Parameters for ML model
train_size = 0.1
test_size = 0.3

# split dataset into train(- %), test(- %), unlabel(- %)
x_train, y_train, x_test, y_test, x_pool, y_pool = fun.split(x_dataset = X_filtered, y_dataset = y, 
                                                             ini_train_size = train_size, test_size = test_size)

In [11]:
# Report how many samples ended up in each partition, one line per set.
print(
    f'The inicial training set has size {len(x_train)}',
    f'The inicial pool set has size {len(x_pool)}',
    f'The inicial test set has size {len(x_test)}',
    sep='\n',
)

The inicial training set has size 361
The inicial pool set has size 3255
The inicial test set has size 1551


In [12]:
# Creating the best models for analysis
linear_model = Ridge(alpha=1.3, random_state=0)
best_tree = DecisionTreeRegressor(max_depth = 5, random_state=0)
best_ada = AdaBoostRegressor(base_estimator = best_tree, n_estimators = 100, random_state=0)

In [13]:
# Parameters for AL
N_QUERIES = int(2*len(x_pool)/3)
k_members = 3

#Define query strategy 
query_str = emcm.emcm_query
#learner = ActiveLearner(estimator= best_ridge, query_strategy = query_str, X_training = x_train, y_training = y_train)
    
performance_history = []
cf_matrix_history = []

#Fit model to initial data
linear_model.fit(x_train, y_train)

#Making predictions
y_pred = linear_model.predict(x_test)

#Calculate and report our model's accuracy.
model_accuracy = mean_squared_error(y_pred , y_test)

# Save our model's performance for plotting.
performance_history.append(model_accuracy)

# Allow our model to query our unlabeled dataset for the most
# informative points according to our query strategy emcm.
for index in range(N_QUERIES):

    #Query for a new point
    query_index, query_instance = query_str(x_train, y_train, x_pool, linear_model, k_members)
    
    # Teach our ActiveLearner model the record it has requested.
    XX, yy = x_pool[query_index].reshape(1, -1), y_pool[query_index].reshape(1, )
    x_train = np.append(x_train, XX, axis = 0)
    y_train = np.append(y_train, yy, axis = 0)

    # Remove the queried instance from the unlabeled pool.
    x_pool, y_pool = np.delete(x_pool, query_index, axis=0), np.delete(y_pool, query_index)
    
    # Re- training in new data
    linear_model.fit(x_train, y_train)
    
    #Predict given the new point
    y_pred = linear_model.predict(x_test)
    
    #Store performance
    model_accuracy = mean_squared_error(y_pred , y_test)
    performance_history.append(model_accuracy)


    if index % 100 == 0:
        print('Accuracy after query {n}: {acc:0.4f}'.format(n=index + 1, acc=model_accuracy))

Accuracy after query 1: 0.4219
Accuracy after query 101: 0.4295
Accuracy after query 201: 0.4049
Accuracy after query 301: 0.3967
Accuracy after query 401: 0.3889
Accuracy after query 501: 0.3817
Accuracy after query 601: 0.3806
Accuracy after query 701: 0.3741
Accuracy after query 801: 0.3671
Accuracy after query 901: 0.3666
Accuracy after query 1001: 0.3649
Accuracy after query 1101: 0.3639
Accuracy after query 1201: 0.3643
Accuracy after query 1301: 0.3642
Accuracy after query 1401: 0.3635
Accuracy after query 1501: 0.3631
Accuracy after query 1601: 0.3625
Accuracy after query 1701: 0.3615
Accuracy after query 1801: 0.3615
Accuracy after query 1901: 0.3614
Accuracy after query 2001: 0.3604
Accuracy after query 2101: 0.3591


2170