In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold
from sklearn.utils import resample

In [None]:
# Read the training features, test features and scored training targets
# (in that order) from CSV files in the working directory.
train_df, test_df, target_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_targets_scored.csv')
)

In [None]:
# Encode the two categorical columns as integers, identically for train and test.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place form
# triggers a pandas warning and, with Copy-on-Write enabled, leaves the parent
# frame unchanged (the columns would silently stay as strings).
cp_type_codes = {'ctl_vehicle': 0, 'trt_cp': 1}
cp_dose_codes = {'D1': 0, 'D2': 1}

for features in (train_df, test_df):
    # replace() keeps this cell safe to re-run (already-encoded values are left
    # alone); infer_objects() turns a resulting object column back into integers.
    features['cp_type'] = features['cp_type'].replace(cp_type_codes).infer_objects()
    features['cp_dose'] = features['cp_dose'].replace(cp_dose_codes).infer_objects()

In [None]:
# Build one wide frame: feature columns first, target columns after them.
# The 'sig_id' identifier column is dropped from both halves before joining.
x = train_df.drop(columns=['sig_id'])
y = target_df.drop(columns=['sig_id'])
df = pd.concat((x, y), axis='columns')

In [None]:
# Display the combined frame (feature columns followed by target columns).
df

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,1,24,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,...,0,0,0,0,0,0,0,0,0,0
1,1,72,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,...,0,0,0,0,0,0,0,0,0,0
2,1,48,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,...,0,0,0,0,0,0,0,0,0,0
3,1,48,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,...,0,0,0,0,0,0,0,0,0,0
4,1,72,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,1,24,1,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,0.5773,...,0,0,0,0,0,0,0,0,0,0
23810,1,24,1,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,-0.2252,...,0,0,0,0,0,0,0,0,0,0
23811,0,48,1,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,-0.2541,...,0,0,0,0,0,0,0,0,0,0
23812,1,24,0,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,0.0323,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Column positions (within y) of the targets whose column sum, i.e. number of
# rows labelled 1, is at most `sampling_threshold`.
sampling_threshold = 10
subj_per_moa = np.array(y.sum())
low_signal_moa = [
    position
    for position, positives in enumerate(subj_per_moa)
    if positives <= sampling_threshold
]

In [None]:
#upsample for classes with low number of samples
# For every low-signal target, draw `upsample_size` rows (with replacement) from
# the rows where that target equals 1 and append them to the original data.
upsample_size = 200
n_feature_cols = len(x.columns)  # target i sits at column i + n_feature_cols of df

# Collect the pieces and concatenate once instead of growing a frame inside the
# loop. The original rows always come first, so sample_df is defined (as a copy
# of df's rows) even when low_signal_moa is empty.
sample_parts = [df]
for i in low_signal_moa:
    yes_moa = df[df.iloc[:, i + n_feature_cols] == 1]
    if yes_moa.empty:
        # resample() cannot draw from zero rows; there is nothing to upsample.
        continue
    sample_parts.append(
        resample(yes_moa, replace=True, n_samples=upsample_size, random_state=0)
    )
sample_df = pd.concat(sample_parts)

In [None]:
#separate features and targets after upsampling
# The split position is the feature-column count of x (the same boundary the
# upsampling step uses) rather than a hard-coded 875, so it stays correct if
# the set of feature columns changes.
col_names = list(sample_df.columns)
n_features = len(x.columns)
x_2 = sample_df.iloc[:, :n_features]
y_2 = sample_df.iloc[:, n_features:]

#find number of additional samples
additional_samples = len(x_2) - len(x)
print(additional_samples)

4400


In [None]:
# Display the feature half of the upsampled frame; rows appended by the
# upsampling keep their original index labels, so labels can repeat.
x_2

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,1,24,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,-1.0220,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,1,72,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,1,48,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,0.1715,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,1,48,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,-1.9590,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,1,72,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,-0.2800,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8596,1,48,1,-0.3301,-0.7767,-0.2608,0.0123,-0.2699,0.5022,-0.3551,...,0.0334,1.0800,0.7250,0.7386,0.5741,-0.0523,0.5849,0.1674,-0.1437,0.6433
6445,1,72,0,-0.1878,0.2326,-0.0411,0.1541,-0.4233,-0.5150,-0.3011,...,0.8659,0.7097,-0.2932,-0.1398,-0.9524,-0.0869,-0.1712,-0.4063,-0.5613,-0.1197
6445,1,72,0,-0.1878,0.2326,-0.0411,0.1541,-0.4233,-0.5150,-0.3011,...,0.8659,0.7097,-0.2932,-0.1398,-0.9524,-0.0869,-0.1712,-0.4063,-0.5613,-0.1197
4111,1,72,1,-0.0188,-0.0872,0.6514,0.7055,0.4569,0.0512,0.4469,...,0.5886,0.4505,0.4846,-0.2238,-1.0310,-0.0158,0.3336,-0.5933,0.0583,-0.0645


In [None]:
def classifier(parameters, x, y, test, n_splits=4, random_state=0):
    """Cross-validate a multi-output classifier and predict on the test features.

    For each K-fold split, a fresh ``MultiOutputClassifier(parameters)`` is
    fitted on the training rows and scored with ``log_loss`` on the held-out
    rows, with all targets flattened into a single vector. The mean of the fold
    losses is printed.

    Parameters
    ----------
    parameters : estimator
        Base estimator wrapped by ``MultiOutputClassifier``; it must support
        ``predict_proba``.
    x, y : pandas.DataFrame
        Features and targets, converted with ``to_numpy()``. Targets are scored
        as 0/1 labels.
    test : pandas.DataFrame
        Features to predict once cross-validation has finished.
    n_splits : int, default 4
        Number of folds.
    random_state : int or None, default 0
        Seed for the fold shuffling. The shuffle used to be unseeded, which made
        the printed loss change from run to run; pass ``None`` to get that
        behaviour back.

    Returns
    -------
    implementation : MultiOutputClassifier
        The model fitted in the LAST fold (trained on that fold's training rows
        only, not on all of ``x``).
    predict_test : numpy.ndarray
        Array of shape (rows of ``test``, number of targets) holding that
        model's probability of label 1 for each target.

    Notes
    -----
    Rows that were duplicated before calling this function (for example by
    upsampling) can land on both sides of a split, which makes the printed
    loss optimistic.
    """
    x_temp, y_temp, test_temp = x.to_numpy(), y.to_numpy(), test.to_numpy()
    xval = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    loss_array = []
    implementation = None
    for train_i, test_i in xval.split(x_temp):
        model = MultiOutputClassifier(parameters)
        implementation = model.fit(x_temp[train_i], y_temp[train_i])
        predict = _positive_class_proba(implementation, x_temp[test_i])
        # labels= keeps the loss well defined even if the held-out targets of a
        # fold happen to contain a single class.
        loss = log_loss(np.ravel(y_temp[test_i]), np.ravel(predict), labels=[0, 1])
        loss_array.append(loss)
    print('average loss: ' + str(np.average(loss_array)))

    # Only the last fold's model is returned, so the test set is predicted once
    # here instead of once per fold with all but the last result thrown away.
    predict_test = _positive_class_proba(implementation, test_temp)
    return implementation, predict_test


def _positive_class_proba(model, features):
    """Return an (n_rows, n_targets) array of P(label == 1) from a fitted multi-output model."""
    columns = []
    for estimator, proba in zip(model.estimators_, model.predict_proba(features)):
        # Look the label up in classes_ instead of assuming it is the last column.
        hit = np.flatnonzero(np.asarray(estimator.classes_) == 1)
        # A target that never showed label 1 during fitting has no column for it.
        columns.append(proba[:, hit[0]] if hit.size else np.zeros(proba.shape[0]))
    return np.column_stack(columns)


In [None]:
# Linear-kernel SVC with probability estimates enabled, used as the per-target
# base estimator. classifier() returns a (fitted model, test predictions) pair,
# which is what `test` holds afterwards.
imp = SVC(
    kernel='linear',
    C=1e-3,
    probability=True,
    decision_function_shape='ovo',
    random_state=0,
)
test = classifier(imp, x_2, y_2, test_df.drop(columns='sig_id'))