In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold
from methods import pre


## Speeding up pre-processing

In [2]:
# Load the GDSC expression table, replace missing values with 0,
# and use the 'CCL' column as the row index.
gdsc_ge = (
    pd.read_csv('data/Processed/gdsc_cell_ge.csv')
    .fillna(0)
    .set_index('CCL')
)

In [3]:
print(type(gdsc_ge.keys()))

<class 'pandas.core.indexes.base.Index'>


In [5]:
gdsc_ge.shape

(706, 19562)

In [6]:
gdsc_ge.shape[0]

706

In [8]:
%%time
# Flag every value strictly greater than 5, then transpose so that each row of
# `under` corresponds to one column of gdsc_ge (despite the name, a 1 marks a
# value *above* the threshold).
under = np.transpose(gdsc_ge.to_numpy() > 5).astype(np.int8)

# A column is kept when more than 10% of the rows of gdsc_ge exceed the threshold.
n = [np.count_nonzero(column_flags) > 0.1 * gdsc_ge.shape[0] for column_flags in under]
# Map each column label to its keep/drop decision, in column order.
names = dict(zip(gdsc_ge.keys(), n))
indices = [label for label, keep in names.items() if keep]
# Restrict the frame to the selected columns.
index = gdsc_ge[indices]

CPU times: user 117 ms, sys: 6.79 ms, total: 124 ms
Wall time: 125 ms


In [9]:
%%time
# Baseline for the timing comparison: the imported `pre` helper, called with
# t=5 and p=0.1 -- the same constants hard-coded in the inline version above
# (presumably threshold and sample fraction; confirm against methods.pre).
index2 = pre(gdsc_ge,t=5, p=0.1)

CPU times: user 6.17 s, sys: 127 ms, total: 6.3 s
Wall time: 6.36 s


In [10]:
index.shape

(706, 9919)

In [11]:
index2.shape

(706, 9919)

## Domain Adaptation experiments

In [113]:
# Toy domain: a 3x2 matrix of uniform random values scaled into [0, 3).
# NOTE(review): no random seed is set, so these values (and any output
# recorded from them) change on every run.
X_a = np.random.rand(3, 2) * 3

In [114]:
X_b = np.random.rand(3, 2) * 4

In [115]:
X_c = np.random.rand(3, 2) * 5

In [116]:
def jump(domain, n, data):
    """Expand one sample into its domain-augmented layout.

    Every element of ``data`` is replaced by a run of values: the element
    itself, ``domain`` zeros, the element again, and ``n - domain - 1`` zeros.
    For ``0 <= domain < n`` that is ``n + 1`` values per element, with the
    second copy sitting in the slot that belongs to ``domain``.

    :param int domain: zero-based index of the domain the sample belongs to
    :param int n: total number of domains
    :param data: iterable of feature values for a single sample
    :return: flat list holding the expanded values, in element order
    """
    # Zero runs placed before and after the domain-specific copy; a negative
    # count yields an empty run, exactly like an empty range().
    leading_zeros = [0] * domain
    trailing_zeros = [0] * (n - domain - 1)

    expanded = []
    for value in data:
        expanded += [value, *leading_zeros, value, *trailing_zeros]
    return expanded
    

In [117]:
def feda(domains):
    """Stack several domains into one domain-augmented feature matrix.

    Presumably this implements the "frustratingly easy domain adaptation"
    feature augmentation -- confirm against the intended reference.

    With ``n = len(domains)``, every original feature is expanded into
    ``n + 1`` adjacent columns: one shared copy followed by one slot per
    domain. A row coming from domain ``i`` carries its value in the shared
    slot and in slot ``i``; all other domain slots are zero. Rows are emitted
    domain by domain, in the order the domains are given.

    The result is written into a preallocated array instead of being grown
    with ``np.vstack`` one row at a time, which copied the whole matrix on
    every appended row.

    :param domains: sequence of 2-D arrays, one per domain, all with the same
        number of columns (features)
    :return: array of shape ``(total_rows, n_features * (n + 1))``; the dtype
        is at least float64
    :raises IndexError: if ``domains`` is empty or the first domain has fewer
        than two dimensions
    :raises ValueError: if a domain is not 2-D or its feature count differs
        from the first domain's
    """
    n = len(domains)
    arrays = [np.asarray(domain) for domain in domains]

    n_features = arrays[0].shape[1]
    for position, array in enumerate(arrays):
        if array.ndim != 2 or array.shape[1] != n_features:
            raise ValueError(
                "domain %d has shape %s; expected a 2-D array with %d features"
                % (position, array.shape, n_features)
            )

    samples = sum(array.shape[0] for array in arrays)

    # Axis 2 holds the n + 1 slots of each feature: index 0 is the shared
    # copy, index i + 1 is the slot for domain i. Promoting with float64
    # keeps the result dtype the original float zero-row produced.
    out_dtype = np.result_type(np.float64, *arrays)
    augmented = np.zeros((samples, n_features, n + 1), dtype=out_dtype)

    start = 0
    for i, array in enumerate(arrays):
        stop = start + array.shape[0]
        augmented[start:stop, :, 0] = array
        augmented[start:stop, :, i + 1] = array
        start = stop

    # Flatten the per-feature slots so they become adjacent columns.
    return augmented.reshape(samples, n_features * (n + 1))

In [118]:
feda([X_a, X_b, X_c])

array([[2.39552445, 2.39552445, 0.        , 0.        , 0.8923882 ,
        0.8923882 , 0.        , 0.        ],
       [2.09713092, 2.09713092, 0.        , 0.        , 1.99024727,
        1.99024727, 0.        , 0.        ],
       [2.17421497, 2.17421497, 0.        , 0.        , 2.66618415,
        2.66618415, 0.        , 0.        ],
       [3.77029806, 0.        , 3.77029806, 0.        , 0.52768015,
        0.        , 0.52768015, 0.        ],
       [3.03560501, 0.        , 3.03560501, 0.        , 1.3673342 ,
        0.        , 1.3673342 , 0.        ],
       [2.41889914, 0.        , 2.41889914, 0.        , 3.77142649,
        0.        , 3.77142649, 0.        ],
       [0.70066211, 0.        , 0.        , 0.70066211, 3.89969596,
        0.        , 0.        , 3.89969596],
       [1.67451535, 0.        , 0.        , 1.67451535, 0.97901028,
        0.        , 0.        , 0.97901028],
       [0.52645206, 0.        , 0.        , 0.52645206, 1.13973433,
        0.        , 0.      

## Mess (scratch): `fs` docstring check and end-to-end `drug` pipeline for 17-AAG

In [1]:
from methods import fs

In [2]:
fs.__doc__

' Returns a subset of {X_train} and {X_test} with features being selected by the method {model}\n    :param int n: it can be the variance thereshold or the number of chosen features \n    :\n    '

In [9]:
from classes import drug
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_regression, mutual_info_regression, SelectFromModel, VarianceThreshold

In [10]:
%%time
# Expression tables: missing values become 0 and rows are indexed by 'CCL'.
gdsc_ge = (
    pd.read_csv('data/Processed/gdsc_cell_ge.csv')
    .fillna(0)
    .set_index('CCL')
)
ctrp_ge = (
    pd.read_csv('data/Processed/ctrp_cell_ge.csv')
    .fillna(0)
    .set_index('CCL')
)

# Drug-response tables: missing values become 0; the default index is kept.
gdsc_dr = (
    pd.read_csv('data/Processed/gdsc_poz_dr.csv')
    .fillna(0)
)
ctrp_dr = (
    pd.read_csv('data/Processed/ctrp_poz_dr.csv')
    .fillna(0)
)

CPU times: user 20.8 s, sys: 1 s, total: 21.8 s
Wall time: 22.8 s


In [11]:
%%time
# Build the `drug` object for '17-AAG' from two dicts keyed by dataset name
# ('ctrp', 'gdsc'): the first holds the expression frames, the second the
# drug-response frames.
aag = drug('17-AAG', {'ctrp': ctrp_ge, 'gdsc': gdsc_ge}, {'ctrp': ctrp_dr, 'gdsc': gdsc_dr})

CPU times: user 909 ms, sys: 104 ms, total: 1.01 s
Wall time: 1.07 s


In [12]:
aag.to_json()

data
<class 'pandas.core.frame.DataFrame'>
dr
<class 'pandas.core.frame.DataFrame'>
ge
<class 'pandas.core.frame.DataFrame'>
name
<class 'str'>


In [13]:
%%time
# NOTE(review): the recorded output of this cell is
# "NameError: name 'gdsc_ge' is not defined", and the cells that follow carry
# lower execution counts than this one. The saved state is therefore from
# out-of-order runs (presumably across a kernel restart); restart the kernel
# and run all cells top to bottom to confirm this step works before relying
# on the downstream results.
aag.pre()

NameError: name 'gdsc_ge' is not defined

In [5]:
%%time
aag.combine()

CPU times: user 5.84 s, sys: 856 ms, total: 6.7 s
Wall time: 7.07 s


In [6]:
%%time
aag.split()

CPU times: user 147 ms, sys: 99 ms, total: 246 ms
Wall time: 256 ms


In [7]:
%%time
aag.fs(f_regression, n=0.01)

CPU times: user 170 ms, sys: 7.18 ms, total: 177 ms
Wall time: 129 ms


In [9]:
aag.X['fs_train'].shape

(827, 206)

In [8]:
%%time
aag.feda()

CPU times: user 868 ms, sys: 713 ms, total: 1.58 s
Wall time: 1.64 s


In [9]:
%%time
aag.train(DecisionTreeRegressor())

CPU times: user 142 ms, sys: 3.77 ms, total: 146 ms
Wall time: 148 ms


In [12]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error

# Presumably evaluates the trained model with each of the four sklearn
# metric functions and returns a dict keyed by metric name (as the recorded
# output suggests) -- confirm against classes.drug.metrics.
# NOTE(review): the recorded r2_score is about -0.95. A negative R^2 means the
# model does worse than always predicting the mean of the targets, so this
# result should be investigated rather than taken as a usable baseline.
aag.metrics([r2_score, mean_absolute_error, mean_squared_error, median_absolute_error])

{'r2_score': -0.9490635815229291,
 'mean_absolute_error': 0.329503024438953,
 'mean_squared_error': 0.16236408243543393,
 'median_absolute_error': 0.303505584280524}

In [15]:
aag.to_json()

X
<class 'dict'>
data
<class 'pandas.core.frame.DataFrame'>
dr
<class 'pandas.core.frame.DataFrame'>
ge
<class 'pandas.core.frame.DataFrame'>
model
<class 'sklearn.tree._classes.DecisionTreeRegressor'>
name
<class 'str'>
y
<class 'dict'>
