In [1]:
%load_ext autoreload
%autoreload 2

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import random
import timeit
import pickle

from sklearn.model_selection import cross_val_score, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report, f1_score, mean_squared_log_error, recall_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn import svm, tree    #https://scikit-learn.org/stable/modules/svm.html
                                 #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#-------------ACTIVE LEARNING LIBRARY
from modAL.models import ActiveLearner             #https://modal-python.readthedocs.io/en/latest/content/models/ActiveLearner.html
from modAL.uncertainty import entropy_sampling     #https://modal-python.readthedocs.io/en/latest/content/apireference/uncertainty.html
from modAL.disagreement import vote_entropy_sampling

#------------IMBALANCED DATA SETS LIBRARY
from imblearn.over_sampling import SMOTEN
from imblearn.under_sampling import EditedNearestNeighbours, ClusterCentroids, RandomUnderSampler 

from collections import Counter

import sys
sys.path.insert(0, '/home/jovyan/Thesis_ActLearn_DOP_2022/main/active_learning/')
import functions as fun

In [2]:
# Read the Batch B table from its CSV file and report the frame's dimensions.
sss_batchB = pd.read_csv(
    filepath_or_buffer='/home/jovyan/covid-data/COVID_BatchB_ready_POLINA.csv',
)
print(f'There are {sss_batchB.shape} rows and cols in the Batch B')

There are (5169, 1884) rows and cols in the Batch B


In [3]:
# Look for duplicated columns
# `fun.getDuplicateColumns` is a project helper whose implementation is not shown here;
# presumably it returns a list of column names whose contents repeat another column
# (the code below treats the result as a list of names) - TODO confirm against `functions.py`.
dup_cols = fun.getDuplicateColumns(sss_batchB)
print(f'The duplicate columns are:\n {dup_cols}')

The duplicate columns are:
 ['Location_Center_Y_cells', 'Location_Center_X_cells', 'AreaShape_MaxFeretDiameter_cytoplasm', 'Intensity_MeanIntensity_illumMITO_cells.1', 'AreaShape_BoundingBoxMaximum_Y_cytoplasm', 'Location_Center_X_nuclei', 'AreaShape_MinFeretDiameter_cytoplasm', 'AreaShape_BoundingBoxArea_cytoplasm', 'Location_Center_Y_cytoplasm', 'Location_Center_Y_nuclei', 'AreaShape_BoundingBoxMinimum_X_cytoplasm', 'AreaShape_BoundingBoxMaximum_X_cytoplasm', 'Location_Center_X_cytoplasm', 'AreaShape_BoundingBoxMinimum_Y_cytoplasm']


In [4]:
# 'Intensity_MeanIntensity_illumMITO_cells.1' has to survive for later computations,
# so its slot in the drop list is handed to the un-suffixed column name instead.
# Note: this edits `dup_cols` in place; `.index` raises ValueError if the '.1' name
# is not in the list (e.g. when the cell is executed a second time).
iaux = dup_cols.index("Intensity_MeanIntensity_illumMITO_cells.1")
dup_cols[iaux] = 'Intensity_MeanIntensity_illumMITO_cells'

# Drop repeated columns
sss_batchB = sss_batchB.drop(columns=dup_cols)

In [5]:
# Compound IDs whose rows must be excluded from Batch B.
# The list repeats several IDs; only membership matters for the filter below.
# NOTE(review): the name suggests these are compounds with zero volume - confirm with the data owner.
compound_volume0 = ['CBK309099','CBK290175','CBK290175','CBK290175','CBK290175','CBK308178','CBK308178','CBK308178','CBK308178',
                    'CBK309637','CBK308501','CBK309637','CBK308501','CBK309637','CBK308501','CBK309637','CBK308501','CBK309392',
                    'CBK309392','CBK290631','CBK309251','CBK309251','CBK309251','CBK309251','CBK290950','CBK290950','CBK290950','CBK290950']

# Removing compounds from cell profiler data frame.
# A single vectorised membership test replaces one full scan of 'ID_covid' per list
# entry; every matching row label is collected exactly once.
compound_volume0_idx = sss_batchB.index[sss_batchB['ID_covid'].isin(compound_volume0)].to_list()
sss_batchB = sss_batchB.drop(compound_volume0_idx)
print(f'The shape if Batch B is {sss_batchB.shape} so far')

The shape if Batch B is (5160, 1870) so far


In [6]:
# Gather every column whose name mentions the MITO channel, report how many there
# are, and show the last one (the next cell relies on which name comes last).
filter_col = list(filter(lambda name: 'MITO' in name, sss_batchB.columns))
print(f'There are {len(filter_col)} columns containig info about MITO')
print(filter_col[-1])

There are 364 columns containig info about MITO
Intensity_MeanIntensity_illumMITO_cells.1


In [7]:
# Keep 'Intensity_MeanIntensity_illumMITO_cells.1' out of the list of MITO columns to drop.
# It is excluded by name rather than by slicing off the last element, so the result
# does not depend on the column being in the last position and re-running the cell
# does not strip an additional column each time.
filter_col = [col for col in filter_col if col != 'Intensity_MeanIntensity_illumMITO_cells.1']

In [9]:
# Remove every MITO column collected in `filter_col` from the Batch B frame.
sss_batchB = sss_batchB.drop(columns=filter_col)

In [10]:
# Report the frame's shape after the MITO columns were dropped.
# The frame is Batch B (the message previously said "Batch A").
print(f'Now the shape of Batch B is: {sss_batchB.shape}')

Now the shape of Batch A is: (5160, 1507)


### Now we add the information for training + cleaning

First the cell profiler

In [11]:
# Remove columns that are "human-made", hence, not useful.
# The columns are named through the `columns=` keyword: passing the axis as a bare
# positional `1` is deprecated in pandas (it emits a FutureWarning) and is no longer
# accepted by pandas 2.x.
cell_profiler = sss_batchB.drop(columns=['ID_covid','PlateWellCompound','principal component 1','Ypredicted','Labels'])

# The retained MITO mean-intensity column becomes the prediction target.
cell_profiler = cell_profiler.rename(columns={'Intensity_MeanIntensity_illumMITO_cells.1': 'Target'})
cell_profiler.head()

  cell_profiler = sss_batchB.drop(['ID_covid','PlateWellCompound','principal component 1','Ypredicted','Labels'], 1)


Unnamed: 0,Count_nuclei,AreaShape_Area_nuclei,AreaShape_BoundingBoxArea_nuclei,AreaShape_BoundingBoxMaximum_X_nuclei,AreaShape_BoundingBoxMaximum_Y_nuclei,AreaShape_BoundingBoxMinimum_X_nuclei,AreaShape_BoundingBoxMinimum_Y_nuclei,AreaShape_Center_X_nuclei,AreaShape_Center_Y_nuclei,AreaShape_Compactness_nuclei,...,RadialDistribution_ZernikePhase_illumSYTO_8_2_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_8_4_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_8_6_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_8_8_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_1_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_3_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_5_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_7_cytoplasm,RadialDistribution_ZernikePhase_illumSYTO_9_9_cytoplasm,Target
0,155.0,1763.321836,2392.479974,1040.469101,1121.808755,992.439454,1073.791338,1015.952347,1097.333064,1.146009,...,-0.079766,-0.062142,0.028843,-0.050329,0.002352,-0.048181,0.101544,-0.051319,0.009876,0.014975
1,177.666667,1778.49845,2375.8407,1074.039635,1102.38867,1025.847336,1054.453537,1049.46608,1077.953347,1.129649,...,-0.011443,-0.007264,0.105147,0.02487,0.097344,-0.034144,0.026505,-0.058919,0.039985,0.013005
2,195.777778,1749.652953,2346.427388,1088.93604,1080.753557,1041.279389,1032.913473,1064.616892,1056.32678,1.137315,...,0.053927,0.050008,-0.048896,0.042183,-0.003522,0.021115,0.016468,-0.013247,-0.070871,0.013788
3,216.571429,2047.965141,2694.202403,1113.122298,1116.560567,1061.842105,1065.028003,1086.982445,1090.28035,1.120502,...,-0.057359,-0.103094,-0.044613,0.105152,-0.068018,0.031788,0.02098,-0.000444,0.049044,0.006809
4,157.125,1743.363036,2376.359863,1108.081848,1074.743853,1060.249785,1026.842564,1083.631113,1050.306489,1.149625,...,-0.004677,0.049752,0.052904,0.00883,0.070198,-0.087356,0.059546,0.05379,-0.011938,0.013995


### Feature selection

So far I have filtered out all the columns related to the MITO channel. Now, according to the paper "A phenomic approach for antiviral drug discovery", some feature groups, specifically correlation, 
neighbours, concanavalin and SYTO, are more important. Hence, I am going to filter further and use only that information. 

In [13]:
# Selecting features according to the paper.
# `and` binds tighter than `or`, so without explicit grouping the Location/Radial
# exclusions applied only to the 'Inten' branch and never to the 'Granu' branch.
# The conditions are grouped so the exclusions cover the whole selection.

# Granularity / intensity features of the SYTO channel
group1 = [col for col in cell_profiler.columns
          if ('Granu' in col or 'Inten' in col) and 'SYTO' in col
          and 'Location' not in col and 'Radial' not in col]
# Granularity / intensity features of the CONC channel
group2 = [col for col in cell_profiler.columns
          if ('Granu' in col or 'Inten' in col) and 'CONC' in col
          and 'Location' not in col and 'Radial' not in col]
# Correlation and neighbour features
group3 = [col for col in cell_profiler.columns if 'Correla' in col or 'Neig' in col]

filtered_features = group1+group2+group3

# Filtering: selected features first, with 'Target' kept as the last column
filtered_cell_profiler_B = cell_profiler[filtered_features+['Target']]

In [15]:
# Display the filtered frame (selected feature columns followed by 'Target') as the cell output.
filtered_cell_profiler_B

Unnamed: 0,Granularity_10_illumSYTO_nuclei,Granularity_11_illumSYTO_nuclei,Granularity_12_illumSYTO_nuclei,Granularity_13_illumSYTO_nuclei,Granularity_14_illumSYTO_nuclei,Granularity_15_illumSYTO_nuclei,Granularity_16_illumSYTO_nuclei,Granularity_1_illumSYTO_nuclei,Granularity_2_illumSYTO_nuclei,Granularity_3_illumSYTO_nuclei,...,Correlation_RWC_illumHOECHST_illumCONC_cytoplasm,Correlation_RWC_illumHOECHST_illumPHAandWGA_cytoplasm,Correlation_RWC_illumHOECHST_illumSYTO_cytoplasm,Correlation_RWC_illumPHAandWGA_illumCONC_cytoplasm,Correlation_RWC_illumPHAandWGA_illumHOECHST_cytoplasm,Correlation_RWC_illumPHAandWGA_illumSYTO_cytoplasm,Correlation_RWC_illumSYTO_illumCONC_cytoplasm,Correlation_RWC_illumSYTO_illumHOECHST_cytoplasm,Correlation_RWC_illumSYTO_illumPHAandWGA_cytoplasm,Target
0,5.036074,4.045871,3.701231,2.976809,2.923950,2.166730,2.287887,12.074159,2.213671,3.594409,...,0.668297,0.788363,0.776957,0.670813,0.544883,0.838390,0.776913,0.616491,0.917410,0.014975
1,4.321689,3.623019,3.061378,2.656599,2.155505,1.921066,1.756346,13.189354,2.268531,3.724780,...,0.676479,0.784949,0.779240,0.674424,0.529992,0.838668,0.773285,0.597442,0.908687,0.013005
2,4.755632,3.627676,3.381285,2.693872,2.195047,2.133893,1.536719,9.976382,2.778201,4.143379,...,0.682199,0.780595,0.772278,0.703561,0.537257,0.839807,0.793541,0.605488,0.916414,0.013788
3,4.164948,3.212185,2.550792,2.159480,1.721807,1.293209,1.468429,14.068313,2.125489,3.714472,...,0.711177,0.771760,0.781160,0.692886,0.497564,0.833036,0.780067,0.561020,0.884716,0.006809
4,4.389827,3.652513,3.104480,2.698492,2.429443,2.057912,1.880075,11.564207,2.176082,3.715027,...,0.657812,0.791037,0.786559,0.645229,0.549538,0.844488,0.746402,0.622248,0.915533,0.013995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,4.076411,3.837551,3.065481,3.036789,2.646879,2.225419,1.959465,11.987888,2.494503,3.624097,...,0.675844,0.784177,0.784154,0.680145,0.539775,0.843267,0.774568,0.607549,0.905907,0.015038
5165,4.653916,3.878406,3.286776,3.034016,2.635375,2.446386,2.077198,8.541350,2.688379,4.062471,...,0.702157,0.784581,0.787652,0.717875,0.511995,0.847052,0.801154,0.583767,0.921299,0.013815
5166,4.481191,3.767212,3.352855,3.245919,2.523410,1.960557,1.631931,8.787539,2.711964,3.947765,...,0.689623,0.787520,0.765525,0.754183,0.593841,0.850788,0.830121,0.655829,0.931191,0.016064
5167,4.873445,3.996099,3.261522,2.913878,2.606938,2.307421,2.053859,8.602307,2.886905,4.049433,...,0.690528,0.781952,0.782137,0.708196,0.529648,0.843113,0.795628,0.604710,0.920878,0.013780


## Now training

### 1. Creating training and test sets

In [17]:
# We get the input values: every column except the trailing 'Target'
X_filtered = filtered_cell_profiler_B.iloc[:, :-1]

# Re-scale each feature with StandardScaler.
# The scaled array is wrapped back into a frame that keeps the original row labels
# and column names. Rows were dropped from Batch B earlier without resetting the
# index, so a fresh 0..n-1 index would no longer match the labels carried by `y`.
# NOTE(review): the scaler is fit on all rows; if a train/test split follows,
# consider fitting on the training rows only.
scaler = StandardScaler()
X_filtered = pd.DataFrame(
    scaler.fit_transform(X_filtered),
    index=X_filtered.index,
    columns=X_filtered.columns,
)

# Output values: the last column of `cell_profiler` ('Target') multiplied by 1000
# NOTE(review): presumably a rescaling for readability of the small raw values - confirm.
y = 1000*cell_profiler.iloc[:,-1]