In [4]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from drugOrg import importDrugs
from functools import reduce
# from drugOrg import tempFilter
# from dataHelpers import getCellLineComps

In [5]:
def getCellLineComps():
    '''Import cell line components --- rank 25 cp.

    Reads the whole ``comps`` dataset from ``./data/HDF5/cell_comps_25.hdf5``
    (joined onto the module-level ``path``) and returns it transposed.

    NOTE(review): ``os``, ``h5py`` and ``path`` are not imported/defined in the
    visible cells -- confirm they are in scope on a fresh kernel before this
    function is called.

    Outputs:
    the contents of the ``comps`` dataset, transposed
    '''
    filename = os.path.join(path, './data/HDF5/cell_comps_25.hdf5')
    # The ``with`` block closes the file on exit (also when reading raises),
    # so no explicit close() call is needed.
    with h5py.File(filename, 'r') as f:
        data = f["comps"][:]
    return data.T

def tempFilter(drugData):
    '''Temporary helper for initial regression testing.

    Restricts one compound's drug data and the cell-line factors to the cell
    lines they have in common. The known cell-line names are read from a fixed
    CSV file and the factors come from getCellLineComps().

    Inputs:
    drugData -- one compound (e.g. drugArr[0]) from the drugArr (a 2d numpy array)

    Outputs:
    the pair (factFiltered, drugFiltered) produced by filterCells: the factors
    and the drug data restricted to their common cell lines
    '''
    knownCellLines = pd.read_csv(
        'data/cellLines(aligned,precut).csv',
        header=None,
        index_col=False,
    ).values
    cellLineFactors = getCellLineComps()
    return filterCells(knownCellLines, cellLineFactors, drugData)

def filterCells(factCells, factors, drugData):
    '''Aligns factors and drug data row-by-row on their common cell lines.

    Every row of the two returned arrays refers to the same cell line, so the
    outputs can be used directly as paired samples (e.g. X and y).

    Inputs:
    factCells -- array of cell-line names; it is flattened, and entry i is
                 taken to name row i of factors
    factors   -- 2d array indexed by row with the positions found in factCells
    drugData  -- 2d array whose first column holds cell-line names

    Outputs:
    factFiltered -- rows of factors whose cell line also occurs in drugData,
                    in their original factCells order
    drugFiltered -- for each row of factFiltered, the first row of drugData
                    with the same cell-line name (later duplicates in drugData
                    are ignored so both outputs keep the same number of rows)
    '''
    factNames = np.ravel(factCells)

    # Remember the first drugData row seen for every cell-line name.
    firstDrugRow = {}
    for row, name in enumerate(drugData[:, 0]):
        firstDrugRow.setdefault(name, row)

    # Walk the factors in order and pair each one with its drug row, so that
    # a different ordering of the two datasets cannot misalign the samples.
    pairs = [(i, firstDrugRow[name])
             for i, name in enumerate(factNames)
             if name in firstDrugRow]

    # Explicit integer dtype keeps the indexing valid when nothing is shared.
    factIdx = np.array([p[0] for p in pairs], dtype=np.intp)
    drugIdx = np.array([p[1] for p in pairs], dtype=np.intp)

    factFiltered = factors[factIdx, :]
    drugFiltered = drugData[drugIdx, :]
    return factFiltered, drugFiltered

In [67]:
# def intDrugsAndCells(drugs, cells):
#     '''
#     Finds the cell lines shared by the drug dataset and the cell-line list

#     Returns:
#             Numpy array of unique common cell lines
#     '''
#     drugsAndCells = reduce(np.intersect1d, (drugs, cells))
#     return drugsAndCells

In [27]:
# Load the cell-line components (the transposed "comps" dataset) via getCellLineComps().
cell_comps = getCellLineComps()
# drugs = importDrugs()

In [None]:
# drugs[0].shape
# first_drugs = drugs[0]
# drugs_cell_lines = first_drugs[:, 0]
# cell_lines = cell_comps[:, 0]
# print(cell_comps)
# intDrugsAndCells(first_drugs, cell_lines)

In [77]:
# Load the drug data (importDrugs comes from the project module drugOrg) and
# restrict the first entry, together with the cell-line factors, to the cell
# lines they share.
full_drugs = importDrugs()
factors, drugs = tempFilter(full_drugs[0])
# Sanity checks: both arrays should have the same number of rows.
print(factors.shape)
print(factors[0])
print(drugs.shape)
# Last column of the first drug row; this column is the regression target below.
print(drugs[0,-1])

(410, 25)
[-0.00563914 -0.01336823 -0.00087393 -0.00292835  0.03347058  0.02140546
  0.00455992 -0.02708515  0.0299087  -0.01411919  0.03808385  0.02727541
  0.03224995  0.02750328  0.03357634  0.00079153 -0.00884383 -0.00940239
  0.01988173 -0.00968551  0.01846536  0.02487351  0.00270583 -0.01838954
  0.01904194]
(410, 13)
3.0622


In [79]:
# Features are the factor rows; the target is the last column of the drug data.
# 10% of the rows are held out for testing; random_state is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(factors, drugs[:,-1], test_size=0.1, random_state=42)

In [80]:
# Support vector regressor with scikit-learn's default hyperparameters.
clf = svm.SVR()

In [82]:
# Fit the regressor on the training split only.
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [83]:
# Predict the target for the held-out rows.
# NOTE(review): the predictions are only displayed here; the visible cells do
# not compare them with y_test -- confirm an evaluation step exists.
clf.predict(X_test)

array([3.30550731, 3.98669431, 3.55343772, 3.37893917, 3.78584425,
       3.57192733, 3.70687459, 2.96836683, 3.58735447, 3.91745767,
       3.88673156, 3.83154676, 4.40216406, 3.15067086, 3.17040931,
       3.68634796, 3.53353502, 3.35926744, 3.447714  , 3.68358832,
       3.42867234, 3.53287711, 3.30547294, 3.75270292, 3.10383853,
       3.54143917, 3.90799517, 3.58466044, 3.15844545, 3.10494808,
       3.26837281, 3.3020111 , 3.396255  , 3.84517456, 3.32692888,
       3.13906278, 2.9971561 , 3.63588909, 3.37035771, 3.12678874,
       3.79679192])