In [1]:
%load_ext autoreload
%autoreload 2

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score, mean_squared_log_error, recall_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#-------------ACTIVE LEARNING LIBRARY
from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

#------------IMBALANCED DATA SETS LIBRARY
from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids, RandomUnderSampler 

from collections import Counter

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun
import reg_icp as ricp

In [2]:
# Read the Batch A regression table from disk and report how many rows it has
cell_profiler = pd.read_csv(
    '/home/jovyan/Thesis_ActLearn_DOP_2022/main/supervised_learning/regression_data_batchA.csv'
)
print(
    f'There are {len(cell_profiler)} rows in the Batch A'
)

There are 5167 rows in the Batch A


In [3]:
# Selecting features according to the paper
def _granularity_or_intensity(columns, tag):
    """Return the column names that contain `tag` and either 'Granu' or 'Inten'.

    Names containing 'Location' or 'Radial' are skipped. The tests are grouped
    explicitly: an un-parenthesised `A or B and not C` is parsed by Python as
    `A or (B and not C)`, which would apply the exclusion to the 'Inten'
    clause only.
    """
    return [
        col for col in columns
        if tag in col
        and ('Granu' in col or 'Inten' in col)
        and 'Location' not in col
        and 'Radial' not in col
    ]


group1 = _granularity_or_intensity(cell_profiler.columns, 'SYTO')
group2 = _granularity_or_intensity(cell_profiler.columns, 'CONC')
# Third group: any column whose name mentions 'Correla' or 'Neig'
group3 = [col for col in cell_profiler.columns if 'Correla' in col or 'Neig' in col]

filtered_features = group1 + group2 + group3

# Filtering: keep the selected features plus the 'Target' column (placed last)
filtered_cell_profiler = cell_profiler[filtered_features + ['Target']]

In [38]:
# Feature matrix: every column of the filtered frame except 'Target', as a NumPy array
X_filtered = filtered_cell_profiler.drop(columns=['Target']).to_numpy()

# Response vector: the last column of the filtered frame, multiplied by 1000
y = filtered_cell_profiler.iloc[:, -1] * 1000

In [39]:
# Size parameters handed to the split helper below
train_size = 0.1
test_size = 0.3

# Partition the data into train / test / unlabelled-pool pieces.
# NOTE(review): `fun.split` is a project helper not shown here; the exact meaning
# of `ini_train_size` (fraction of what?) should be confirmed against its source.
(
    x_train, y_train,
    x_test, y_test,
    x_pool, y_pool,
) = fun.split(
    x_dataset=X_filtered,
    y_dataset=y,
    ini_train_size=train_size,
    test_size=test_size,
)

In [40]:
# Report the number of samples in each partition, one line per partition
print(
    f'The inicial training set has size {len(x_train)}',
    f'The inicial pool set has size {len(x_pool)}',
    f'The inicial test set has size {len(x_test)}',
    sep='\n',
)

The inicial training set has size 361
The inicial pool set has size 3255
The inicial test set has size 1551


In [49]:
def emcm_query(X_labeled, y_labeled, X_pool, linear_model, K_members, random_state=5563):
    """Expected Model Change Maximisation (EMCM) query for active learning in linear regression.

    For every candidate x in the pool, the change it would induce on the model
    is estimated as the average, over the ensemble members f_k, of the norm of
    the gradient (f(x) - f_k(x)) * x (Eq. 13 and Eq. 14 of the paper this
    notebook follows). The candidate with the largest estimate is returned.

    Input:
    X_labeled = small labeled data set (called D in the paper) with n points
    y_labeled = targets of X_labeled
    X_pool = the unlabelled pool set, 2-D array-like (n_candidates, n_features)
    linear_model = the linear regression model (called f(x;theta) in the paper);
                   it is fitted on the labeled data by this function
    K_members = number of regressors requested for the ensemble
    random_state = seed forwarded to the ensemble (default keeps the previous
                   hard-coded value)

    Output:
    x_star_idx = row index in X_pool of the instance to be sampled
    x_star_value = the corresponding row of X_pool

    Raises:
    ValueError if X_pool has no rows.
    """
    X_pool = np.asarray(X_pool)
    if X_pool.shape[0] == 0:
        raise ValueError("X_pool is empty: there is no candidate to query")

    # 0. Train the linear regressor on X_labeled to build f(x;theta)
    #    (fit is expected to return the fitted estimator, scikit-learn style)
    fx = linear_model.fit(X_labeled, y_labeled)

    # 1. Construct the ensemble. The base estimator is passed positionally so the
    #    call works whether the keyword is named `base_estimator` or `estimator`.
    Bk = AdaBoostRegressor(linear_model, n_estimators=K_members, random_state=random_state).fit(X_labeled, y_labeled)

    # Boosting may stop early and keep fewer than K_members regressors, so use
    # the members that actually exist instead of indexing up to K_members.
    members = Bk.estimators_

    # 2.-4. Predictions of the base model and of every member for the whole pool
    base_prediction = np.ravel(fx.predict(X_pool))                       # (n_candidates,)
    member_predictions = np.stack(
        [np.ravel(member.predict(X_pool)) for member in members]
    )                                                                    # (n_members, n_candidates)

    # 5. Eq.13: the gradient for member k is (f(x) - f_k(x)) * x, whose norm
    #    factorises as |f(x) - f_k(x)| * ||x||
    residual_magnitude = np.abs(base_prediction - member_predictions)
    candidate_norm = np.linalg.norm(X_pool, axis=1)

    # 7. Eq.14: expectation over the members of the per-member gradient norm
    #    (mean of the norms, not the norm of the stacked gradients)
    modelChange_per_candidate = residual_magnitude.mean(axis=0) * candidate_norm

    # 8. Select the x that maximises the change
    x_star_idx = np.argmax(modelChange_per_candidate)
    x_star_value = X_pool[x_star_idx]

    return x_star_idx, x_star_value
    