In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV

# Loading data and pre-processing

In [28]:
gdsc_ge = pd.read_csv('data/Processed/gdsc_cell_ge.csv').fillna(0).set_index('CCL')

In [29]:
ctrp_ge = pd.read_csv('data/Processed/ctrp_cell_ge.csv').fillna(0).set_index('CCL')

In [30]:
ctrp_ge.shape

(449, 19851)

In [31]:
gdsc_ge.shape

(706, 19562)

In [38]:
data = pd.concat([gdsc_ge, ctrp_ge], sort = False, keys = ['gdsc', 'ctrp']).fillna(0)
data.shape

(1155, 22313)

In [41]:
data.index.levels[0]

Index(['gdsc', 'ctrp'], dtype='object')

In [43]:
from methods import pre
pre(data)

Unnamed: 0_level_0,Unnamed: 1_level_0,A1BG,A1CF,A2M,A4GALT,A4GNT,AAAS,AACS,AACSP1,AADAC,AADAT,...,ZNF818P,ZNF826P,ZNF836,ZNF850,ZNF865,ZNF891,ZSCAN12P1,ZSCAN16-AS1,ZSWIM8-AS1,ZXDA
Unnamed: 0_level_1,CCL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
gdsc,22RV1,3.537942,6.364651,5.332441,3.241125,3.262633,4.722157,4.942126,3.537010,2.989766,3.841171,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
gdsc,2313287,3.370950,6.284884,3.485675,3.313028,3.096527,4.873621,4.213177,3.067134,2.983678,3.139248,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
gdsc,5637,2.927335,2.892365,3.181651,4.295357,3.205598,5.249042,4.495021,2.788374,3.213285,5.326825,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
gdsc,639V,3.953010,2.858072,2.892599,2.960059,3.121154,4.445200,4.017422,2.985819,3.109708,5.804184,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
gdsc,647V,2.804009,2.944390,3.178071,3.029297,3.024326,4.636786,4.724453,3.017293,3.320550,3.749442,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ctrp,ZR751,6.694777,3.754769,3.769402,4.574043,4.416972,6.098093,8.485920,3.364098,4.393612,7.023499,...,3.905325,3.692998,5.716371,4.754836,4.278497,3.468215,4.535992,3.817745,6.061334,5.532236
ctrp,ZR7530,6.141592,3.820361,4.617683,4.666993,4.387402,7.078184,8.318758,3.519714,5.091833,4.912098,...,5.495265,3.637149,5.185964,5.713535,4.509307,3.438513,5.915627,3.866042,5.395049,6.090865
ctrp,OE21,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ctrp,DOV13,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [19]:
data.index.levshape[0]

2

In [129]:
from methods import pre
data = pre(data, p = 0.01, t=4)

In [130]:
data.shape

(784, 16971)

In [99]:
gdsc_dr = pd.read_csv('data/Processed/gdsc_poz_dr.csv').fillna(0)
ctrp_dr = pd.read_csv('data/Processed/ctrp_poz_dr.csv').fillna(0)

In [100]:
from methods import combine
X_gdsc = combine(data.loc['gdsc'], gdsc_dr, '17-AAG')
X_ctrp = combine(data.loc['ctrp'], ctrp_dr, '17-AAG')

In [101]:
com = pd.concat([X_gdsc, X_ctrp], sort=False, keys = ['gdsc', 'ctrp']).fillna(0)

In [113]:
from sklearn.model_selection import train_test_split
 
X_train, X_test, y_train, y_test = train_test_split(com.drop('DR', axis=1), com['DR'], stratify=com.index.get_level_values(0))

In [114]:
from methods import fs
X_train_fs, X_test_fs, var = fs(f_regression, X_train, X_test, y_train, n=0.01)

In [115]:
X_train = pd.DataFrame(X_train_fs, index=X_train.index)

In [116]:
from methods import feda
X_train = feda([X_train.loc['gdsc'].to_numpy(), X_train.loc['ctrp'].to_numpy()])
#data = feda([data.loc['gdsc'].to_numpy(), data.loc['ctrp'].to_numpy()])

In [117]:
from methods import drp
from methods import tuning

space = {
    'n_estimators' : [1, 10,50, 100,200],
    'max_depth' : [3,5, 10, 20, 50, 80]
}

t2 = {
    'degree': [2, 3, 4, 5],
    'epsilon' : [0.1, 0.2, 0.3, 0.9],
    'C':[0.01, 0.1, 1, 10, 100]
}

mod = drp(RandomForestRegressor(), X_train, y_train, tuning = tuning(space, iterations=300, cv=5, scoring='r2'))
mod



RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=3, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [118]:
var = [i for i, x in enumerate(var) if x]

In [120]:
X_test = pd.DataFrame(X_test_fs, index=X_test.index)
X_test = feda([X_test.loc['gdsc'].to_numpy(), X_test.loc['ctrp'].to_numpy()])


In [121]:
from sklearn.metrics import r2_score
y_pred = mod.predict(X_test)
r2_score(y_test, y_pred)

0.0059988540607845975

# Feature Selection

from methods import fs
from methods import tuning
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True)
print(X.shape)

space = {
    'n_estimators' : [1, 10,50, 100,200],
    'max_depth' : [3,5, 10, 20, 50, 80]
}
t = tuning(space, iterations = 500)

In [13]:
X.var()

36.20173240585726

In [14]:
X = fs(VarianceThreshold, X, y, n=10.0)
X.shape

(1797, 43)

# Domain Adaptation

# Drug Response

In [None]:
t2 = {
    'degree': [2, 3, 4, 5],
    'C':[0.01, 0.1, 1, ]
}

In [None]:
from methods import drp
drp(SVR(), X, y, tuning = t2)