In [None]:
%load_ext autoreload
%autoreload 2

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score, mean_squared_log_error, recall_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#-------------ACTIVE LEARNING LIBRARY
from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

#------------IMBALANCED DATA SETS LIBRARY
from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids, RandomUnderSampler 

from collections import Counter

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun

In [None]:
# Read the Batch A regression table from disk and report how many rows it holds
batch_a_csv = '/home/jovyan/Thesis_ActLearn_DOP_2022/main/supervised_learning/regression_data_batchA.csv'
cell_profiler = pd.read_csv(batch_a_csv)
n_rows = cell_profiler.shape[0]
print(f'There are {n_rows} rows in the Batch A')

### Feature selection

So far I have filtered all the columns that contained something related to the MITO channel. Now, according to the paper "A phenomic approach for antiviral drug discovery", some features, specifically the correlation, 
neighbours, concanavalin and SYTO features, are more important. Hence, I am going to filter further and use only that information. 

In [15]:
# Selecting features according to the paper
def _channel_features(columns, channel):
    """Return the Granularity/Intensity column names belonging to one channel.

    A column is kept when its name contains `channel` together with either
    'Granu' or 'Inten', and contains neither 'Location' nor 'Radial'.
    The original column order is preserved.

    The conditions are grouped explicitly: written as
    `A or B and not C and not D`, Python evaluates `A or (B and not C and not D)`,
    so the Location/Radial exclusion would only apply to the 'Inten' branch.
    """
    return [
        col for col in columns
        if channel in col
        and ('Granu' in col or 'Inten' in col)
        and 'Location' not in col
        and 'Radial' not in col
    ]


group1 = _channel_features(cell_profiler.columns, 'SYTO')
group2 = _channel_features(cell_profiler.columns, 'CONC')
# Every column whose name mentions correlation or neighbours, with no exclusions
group3 = [col for col in cell_profiler.columns if 'Correla' in col or 'Neig' in col]

filtered_features = group1+group2+group3

# Filtering: keep the selected features followed by the two trailing
# non-feature columns, so that positional slicing with [:-2] / [-2] still
# works on the filtered frame.
filtered_cell_profiler = cell_profiler[filtered_features+['Target','cell_profiler_vector']]

In [16]:
# Display the filtered frame; the rendered output abbreviates the middle rows and columns with "..."
filtered_cell_profiler

Unnamed: 0,Granularity_10_illumSYTO_nuclei,Granularity_11_illumSYTO_nuclei,Granularity_12_illumSYTO_nuclei,Granularity_13_illumSYTO_nuclei,Granularity_14_illumSYTO_nuclei,Granularity_15_illumSYTO_nuclei,Granularity_16_illumSYTO_nuclei,Granularity_1_illumSYTO_nuclei,Granularity_2_illumSYTO_nuclei,Granularity_3_illumSYTO_nuclei,...,Correlation_RWC_illumHOECHST_illumPHAandWGA_cytoplasm,Correlation_RWC_illumHOECHST_illumSYTO_cytoplasm,Correlation_RWC_illumPHAandWGA_illumCONC_cytoplasm,Correlation_RWC_illumPHAandWGA_illumHOECHST_cytoplasm,Correlation_RWC_illumPHAandWGA_illumSYTO_cytoplasm,Correlation_RWC_illumSYTO_illumCONC_cytoplasm,Correlation_RWC_illumSYTO_illumHOECHST_cytoplasm,Correlation_RWC_illumSYTO_illumPHAandWGA_cytoplasm,Target,cell_profiler_vector
0,4.361841,3.584432,3.342588,2.438684,2.455154,1.976695,2.001378,9.488776,2.380306,3.854341,...,0.775435,0.756380,0.722137,0.591659,0.828587,0.809824,0.656284,0.913003,0.008777,"[193.1111111, 1800.254037, 2425.918124, 1093.8..."
1,3.959050,3.414357,2.863476,2.486072,2.171766,1.809120,1.267149,11.744559,2.641746,4.127440,...,0.768947,0.765124,0.704358,0.532513,0.819351,0.795736,0.601653,0.897047,0.008477,"[242.8888889, 1728.176283, 2305.202711, 1115.8..."
2,4.349187,3.838771,3.244301,2.603814,2.302234,2.055301,1.868436,12.560360,2.116105,3.498266,...,0.752523,0.735974,0.704603,0.580412,0.803257,0.808617,0.651608,0.897853,0.008487,"[199.0, 1752.473795, 2362.270982, 1103.36528, ..."
3,3.992664,3.158369,2.529678,1.804955,1.437475,1.391254,1.186035,16.442088,2.074824,3.725266,...,0.731893,0.734542,0.743543,0.518442,0.797670,0.828538,0.578705,0.861295,0.005784,"[226.5555556, 1986.790362, 2626.938955, 1131.6..."
4,4.042045,3.297629,2.982852,2.552167,2.170422,1.917469,1.704661,11.297141,2.378388,3.986028,...,0.765918,0.757066,0.678658,0.540312,0.809617,0.783428,0.616865,0.900889,0.008265,"[197.2222222, 1727.657115, 2330.338123, 1086.8..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162,4.586029,3.987638,3.237160,2.911563,2.687102,2.303301,1.853467,9.320676,2.862562,4.295627,...,0.765387,0.752188,0.726400,0.570664,0.830599,0.814182,0.637734,0.915446,0.009815,"[203.7777778, 1667.161061, 2267.327924, 1145.1..."
5163,4.703513,4.036346,3.828274,2.991866,2.378707,2.247013,1.926051,8.647649,2.924413,4.307183,...,0.768007,0.759339,0.744845,0.573191,0.843336,0.824301,0.638707,0.918787,0.011134,"[203.7777778, 1615.980907, 2212.39064, 1117.78..."
5164,5.035118,4.401190,3.585528,2.949741,2.646095,2.207923,2.020640,8.502608,3.045520,4.290687,...,0.763024,0.759621,0.756895,0.564578,0.856821,0.826230,0.622314,0.921777,0.010978,"[158.5555556, 1604.295761, 2231.107003, 1066.2..."
5165,4.381050,3.690412,3.451646,2.912654,2.418330,2.176244,1.666272,8.621927,3.232583,4.465332,...,0.771411,0.758128,0.735672,0.574595,0.845911,0.815877,0.637306,0.922736,0.009783,"[172.5555556, 1598.984046, 2182.85086, 1092.24..."


### Now training

1. Creating training and test sets

In [21]:
# Output values: the second-to-last column, multiplied by 1000
y = cell_profiler.iloc[:, -2] * 1000

# Input values: every column except the trailing two
X = cell_profiler.iloc[:, :-2]

# Standardise the inputs; the result is wrapped in a fresh DataFrame, so the
# columns are renumbered 0..n-1 and the original column names are dropped
scaler = StandardScaler().fit(X)
X = pd.DataFrame(scaler.transform(X))
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1491,1492,1493,1494,1495,1496,1497,1498,1499,1500
0,-0.107728,1.239198,1.167609,-0.482631,1.140747,-0.518719,1.099841,-0.501259,1.120302,-0.884358,...,-0.755966,-1.026410,1.000601,-1.426433,0.245300,0.762549,2.842807,-0.245682,2.989801,0.860631
1,2.013660,0.305987,-0.086490,0.188351,1.412834,0.188224,1.401132,0.188508,1.407069,-1.351787,...,-1.407816,-0.417903,-0.119504,2.041143,-0.605380,0.116426,0.621445,0.585670,-0.402089,-0.130295
2,0.143240,0.620574,0.506386,-0.192122,0.695846,-0.207581,0.671964,-0.199984,0.684649,-0.767405,...,-0.864495,0.926178,0.781536,-0.385767,0.450136,-0.639883,-0.441966,0.439557,0.697913,0.797335
3,1.317579,3.654337,3.255991,0.672248,0.016592,0.568471,-0.096899,0.620369,-0.039750,-2.156958,...,-3.945358,-0.247766,-1.004849,-0.194872,2.165622,1.118229,-1.807797,0.624545,-0.189090,-0.031203
4,0.067476,0.299265,0.174639,-0.697601,0.899622,-0.706254,0.892362,-0.701520,0.896577,-0.956002,...,-0.416337,0.188694,0.198654,-0.389778,0.181537,1.067298,1.306801,-0.398901,0.216276,0.202172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162,0.346855,-0.483994,-0.479967,1.081808,0.522437,1.100619,0.534492,1.091570,0.529071,0.354302,...,0.245601,-0.474582,-0.152525,-2.860311,0.726424,-0.986981,0.602433,-0.234770,-0.664562,-0.065029
5163,0.346855,-1.146638,-1.050704,0.247744,-1.492874,0.285837,-1.455111,0.267289,-1.474053,0.799845,...,0.810322,0.305472,0.480695,-1.454795,0.830986,-0.878519,1.688469,-0.796419,-0.865290,2.146827
5164,-1.580388,-1.297929,-0.856262,-1.324741,-0.752605,-1.289287,-0.713442,-1.306814,-0.735356,2.303194,...,1.354873,-0.553068,2.148547,1.417388,0.326906,-0.627323,-0.792318,0.586307,-2.965615,-0.005542
5165,-0.983747,-1.366701,-1.357590,-0.531515,0.956323,-0.478852,0.987686,-0.505672,0.972539,0.397202,...,0.856758,1.441532,-0.664472,0.767670,-0.318569,1.269042,-0.086749,-0.183306,0.141610,-1.031048


In [22]:
# Input values of the reduced feature set: every column except the trailing two
X_filtered = filtered_cell_profiler.iloc[:, :-2]

# Standardise them. Note that this rebinds `scaler`: from here on it refers to
# the scaler fitted on the filtered features.
scaler = StandardScaler().fit(X_filtered)
X_filtered = pd.DataFrame(scaler.transform(X_filtered))
X_filtered

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,274,275,276,277,278,279,280,281,282,283
0,-0.257914,-0.503403,0.368115,-1.095521,0.177077,-0.430929,0.795305,-0.046790,-1.321457,-0.836086,...,0.170116,0.076296,0.748799,0.169360,-0.087611,0.648681,-0.203668,-0.033160,0.743814,0.100900
1,-1.351104,-0.978272,-1.053952,-0.941708,-0.821612,-1.089779,-2.370635,1.530151,-0.441446,0.073379,...,-0.776891,0.733642,0.276178,0.731734,-0.770309,-1.259507,-0.719190,-0.907220,-1.347615,-1.314692
2,-0.292257,0.206746,0.076387,-0.559541,-0.361829,-0.121880,0.222068,2.100450,-2.210764,-2.021869,...,-0.614305,-0.786486,-0.920356,-1.143140,-0.760895,0.285831,-1.617476,-0.108020,0.564839,-1.243213
3,-1.259874,-1.693025,-2.044711,-3.152486,-3.409335,-2.732680,-2.720392,4.814035,-2.349717,-1.265924,...,-0.800588,3.368160,-2.423337,-1.235297,0.734348,-1.713448,-1.929259,1.127889,-2.226118,-4.486775
4,-1.125852,-1.304193,-0.699630,-0.727178,-0.826347,-0.663788,-0.484115,1.217376,-1.327913,-0.397548,...,-1.279884,-0.715382,0.055487,0.213475,-1.757143,-1.007877,-1.262469,-1.670823,-0.765251,-0.973857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162,0.350542,0.622401,0.055192,0.439353,0.994491,0.853169,0.157524,-0.164303,0.301825,0.633466,...,0.196056,0.113013,0.016828,-0.100313,0.076090,-0.028682,-0.091369,0.237186,0.033689,0.317694
5163,0.669399,0.758400,1.809696,0.700002,-0.092329,0.631864,0.470503,-0.634794,0.510016,0.671948,...,0.126944,0.582826,0.207708,0.359684,0.784367,0.052846,0.619493,0.865013,0.070952,0.614092
5164,1.569386,1.777094,1.089193,0.563271,0.849976,0.478177,0.878359,-0.736187,0.917663,0.617015,...,1.293937,0.712350,-0.155374,0.377773,1.247087,-0.225002,1.372138,0.984667,-0.556620,0.879421
5165,-0.205780,-0.207491,0.691815,0.442894,0.047307,0.353626,-0.649646,-0.652776,1.547321,1.198609,...,0.470867,0.184642,0.455690,0.281784,0.432123,0.098151,0.763248,0.342401,0.017309,0.964534


2) K-fold cross-validation and linear regression

In [24]:
# Percentage of samples that fun.labelling_v2 marks as positive for a given threshold.
# The threshold is named once so the labelling call and the printed message
# cannot drift apart.
mito_threshold = 7.5

y_frame = pd.DataFrame({'MITO': y.values.flatten()})
y_frame = fun.labelling_v2(y_frame, col_reference=0, labels_position = 1, label_positive=1, label_negative=0, threshold=mito_threshold)
# One row per label value; the MITO column now holds the number of samples per label
y_frame = y_frame.groupby('Labels_0').count()

# If no sample received the positive label (1), that group is absent and direct
# indexing would raise KeyError; fall back to a count of 0 instead.
positive_count = y_frame.MITO.get(1, 0)
print(f'Percentage of positives in the total dataframe with {mito_threshold} as threshold: {positive_count*100 / y_frame.MITO.sum()}%')

Percentage of positives in the total dataframe with 7.5 as threshold: 2.5933810721888912%
