In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
descriptors = np.load('descriptors.npy')
targets = np.load('targets.npy')

In [3]:
descriptors.shape

(678, 89, 485)

In [4]:
targets.shape

(678, 89)

In [5]:
i, j = np.where(targets==1)

In [6]:
descriptors_undersampled = descriptors[i, j, :]

In [7]:
descriptors_undersampled.shape

(803, 485)

In [8]:
targets_undersampled = targets[np.where(targets==1)]

In [9]:
targets_undersampled.shape

(803,)

In [10]:
descriptors_undersampled_with_targets = np.zeros((descriptors_undersampled.shape[0],descriptors_undersampled.shape[1]+1))
descriptors_undersampled_with_targets[:,:-1] = descriptors_undersampled 
descriptors_undersampled_with_targets[:,-1] = targets_undersampled

In [11]:
descriptors_undersampled_with_targets.shape

(803, 486)

In [12]:
targets_undersampled = targets[np.where(targets==0)]
targets_undersampled = targets_undersampled[:803]

In [13]:
i, j = np.where(targets==0)
descriptors_undersampled = descriptors[i, j, :]
descriptors_undersampled = descriptors_undersampled[:803,]

In [14]:
descriptors_undersampled.shape

(803, 485)

In [15]:
descriptors_undersampled_nulls = np.zeros((descriptors_undersampled.shape[0],descriptors_undersampled.shape[1]+1))
descriptors_undersampled_nulls[:,:-1] = descriptors_undersampled 
descriptors_undersampled_nulls[:,-1] = targets_undersampled

In [16]:
descriptors_undersampled_nulls.shape, descriptors_undersampled_with_targets.shape

((803, 486), (803, 486))

In [17]:
descriptors_undersampled_with_targets = np.vstack((descriptors_undersampled_with_targets, descriptors_undersampled_nulls))

In [18]:
descriptors_undersampled_with_targets[:,:-1].shape

(1606, 485)

In [19]:
from numpy.random import shuffle

In [20]:
shuffle(descriptors_undersampled_with_targets)

In [21]:
import sklearn
from sklearn.model_selection import train_test_split

In [22]:
x_train, x_test, y_train, y_test = train_test_split(descriptors_undersampled_with_targets[:,:-1], 
                                                    descriptors_undersampled_with_targets[:,-1], 
                                                    test_size=0.20, random_state=42)

In [23]:
print("x train: ",x_train.shape)
print("x test: ",x_test.shape)
print("y train: ",y_train.shape)
print("y test: ",y_test.shape)

x train:  (1284, 485)
x test:  (322, 485)
y train:  (1284,)
y test:  (322,)


In [24]:
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [25]:
print("x train: ",x_train.shape)
print("x test: ",x_test.shape)
print("y train: ",y_train.shape)
print("y test: ",y_test.shape)

x train:  (1284, 485)
x test:  (322, 485)
y train:  (1284, 1)
y test:  (322, 1)


In [26]:
import warnings
warnings.filterwarnings("ignore")

In [27]:
from sklearn.metrics import f1_score, make_scorer, top_k_accuracy_score, precision_recall_fscore_support
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.experimental import enable_halving_search_cv
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_validate as cv
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# Models from Article

## RandomForestClassifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
rfc = RandomForestClassifier(n_estimators=500, n_jobs=-1)
mcc_scorer = make_scorer(mcc)

In [30]:
rfc.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [31]:
y_preds = rfc.predict_proba(x_test)

In [32]:
y_predict = []
for el in range(len(y_preds)):
    y_predict.append(np.argmax(y_preds[el]))

In [33]:
y_real = y_test.reshape(-1, )

In [34]:
mcc(y_predict, y_real)

0.7825292916895139

In [35]:
precision_recall_fscore_support(y_predict, y_real)

(array([0.9047619, 0.88     ]),
 array([0.86363636, 0.91666667]),
 array([0.88372093, 0.89795918]),
 array([154, 168]))

In [36]:
mcc_scorer = make_scorer(mcc)
cv(rfc, x_train, y_train, cv=10, scoring=mcc_scorer)

{'fit_time': array([2.45053315, 1.02388716, 0.97552323, 0.97286415, 0.99793029,
        0.96008706, 0.97013354, 0.96864843, 1.03185272, 0.96079135]),
 'score_time': array([0.30357623, 0.28105879, 0.31673837, 0.29422879, 0.31608415,
        0.28897262, 0.32701302, 0.31202483, 0.30313444, 0.28609633]),
 'test_score': array([0.73647514, 0.62770563, 0.67744376, 0.80357601, 0.62813525,
        0.61023578, 0.78251624, 0.71889548, 0.71916972, 0.65631906])}

## ExtraTreesClassifier

In [94]:
etc = sklearn.ensemble.ExtraTreesClassifier()

In [95]:
etc.fit(x_train, y_train)

ExtraTreesClassifier()

In [96]:
y_preds = etc.predict_proba(x_test)

In [98]:
y_predict = []
for el in range(len(y_preds)):
    y_predict.append(np.argmax(y_preds[el]))

In [99]:
y_real = y_test.reshape(-1, )

In [100]:
mcc(y_predict, y_real)

0.7950106881589029

In [101]:
precision_recall_fscore_support(y_predict, y_real)

(array([0.91156463, 0.88571429]),
 array([0.87012987, 0.92261905]),
 array([0.89036545, 0.90379009]),
 array([154, 168]))

## SVM

In [37]:
x_train_svm = x_train - np.mean(x_train)
descriptors_svm = x_train_svm / np.std(x_train_svm)

In [38]:
from sklearn.svm import SVC

In [39]:
svm = SVC(C=50, gamma=1)

In [40]:
mcc_scorer = make_scorer(mcc)
cv(svm, descriptors_svm, y_train, cv=10, scoring=mcc_scorer)

{'fit_time': array([0.36381435, 0.30833673, 0.3300097 , 0.31035137, 0.29814887,
        0.30661821, 0.29594612, 0.32179713, 0.31151938, 0.29904556]),
 'score_time': array([0.0518229 , 0.0535295 , 0.05416584, 0.05229974, 0.05262065,
        0.05188346, 0.0467124 , 0.05250359, 0.04707026, 0.04869604]),
 'test_score': array([0.72256986, 0.61327561, 0.70687148, 0.7756157 , 0.62813525,
        0.58051117, 0.73446466, 0.68791209, 0.67640108, 0.59413919])}

In [41]:
svm.fit(descriptors_svm, y_train)

SVC(C=50, gamma=1)

In [42]:
x_test_svm = x_test - np.mean(x_test)
descriptors_svm_test = x_test_svm / np.std(x_test_svm)

In [44]:
y_preds = svm.predict(descriptors_svm_test)

In [47]:
y_real = y_test.reshape(-1, )

In [49]:
mcc(y_preds, y_real)

0.7054079378805208

In [50]:
precision_recall_fscore_support(y_preds, y_real)

(array([0.81632653, 0.88571429]),
 array([0.85714286, 0.85164835]),
 array([0.83623693, 0.86834734]),
 array([140, 182]))

## GradientBoostingClassifier

In [55]:
from sklearn.ensemble import GradientBoostingClassifier
gbt = GradientBoostingClassifier(learning_rate=0.2, subsample=0.6, max_depth=6, n_estimators=900)

In [57]:
cv(gbt, x_train, y_train, cv=10, scoring=mcc_scorer)

{'fit_time': array([6.76550865, 6.73514485, 6.50930858, 6.60883713, 6.47080064,
        6.3400867 , 6.5082593 , 6.58210325, 6.13940215, 6.32564116]),
 'score_time': array([0.00478745, 0.00477719, 0.00433016, 0.00470352, 0.00434732,
        0.00512171, 0.00467086, 0.00456381, 0.00475478, 0.0045774 ]),
 'test_score': array([0.78282828, 0.58557701, 0.70873279, 0.83044733, 0.60986423,
        0.57798953, 0.79473925, 0.67345807, 0.71889548, 0.67784088])}

In [58]:
gbt.fit(x_train, y_train)

GradientBoostingClassifier(learning_rate=0.2, max_depth=6, n_estimators=900,
                           subsample=0.6)

In [59]:
y_preds = gbt.predict_proba(x_test)

In [61]:
y_predict = []
for el in range(len(y_preds)):
    y_predict.append(np.argmax(y_preds[el]))

In [62]:
y_real = y_test.reshape(-1, )

In [63]:
mcc(y_predict, y_real)

0.741745275258775

In [64]:
precision_recall_fscore_support(y_predict, y_real)

(array([0.93197279, 0.81142857]),
 array([0.80588235, 0.93421053]),
 array([0.86435331, 0.86850153]),
 array([170, 152]))

# New Models

## LightGBM Classifier

In [74]:
import lightgbm as lgb

In [75]:
lgb_clf = lgb.LGBMClassifier()

In [76]:
lgb_clf.fit(x_train, y_train)

LGBMClassifier()

In [77]:
y_preds = lgb_clf.predict_proba(x_test)

In [79]:
y_predict = []
for el in range(len(y_preds)):
    y_predict.append(np.argmax(y_preds[el]))

In [81]:
y_real = y_test.reshape(-1, )

In [82]:
mcc(y_predict, y_real)

0.7748758134087765

In [83]:
precision_recall_fscore_support(y_predict, y_real)

(array([0.93197279, 0.84571429]),
 array([0.83536585, 0.93670886]),
 array([0.88102894, 0.88888889]),
 array([164, 158]))

## AdaBoostClassifier

In [84]:
from sklearn.ensemble import AdaBoostClassifier

In [92]:
n_estimators_list = [150, 200, 250, 400, 500]
learning_rate_list = [0.2, 1.0, 2.0, 5.0, 10.0]

In [93]:
y_real = y_test.reshape(-1, )

for n_est in n_estimators_list:
    for lr in learning_rate_list:
        ab = AdaBoostClassifier(random_state=42, n_estimators=n_est, learning_rate=lr)
        print('n_estimator = ', n_est)
        print('learning_rate = ', lr)
        ab.fit(x_train, y_train)
        y_pred = ab.predict(x_test)
        print('MCC = ', mcc(y_pred, y_real))
        print('=============================')

n_estimator =  150
learning_rate =  0.2
MCC =  0.7362623883689489
n_estimator =  150
learning_rate =  1.0
MCC =  0.716767030264702
n_estimator =  150
learning_rate =  2.0
MCC =  0.6991040055092435
n_estimator =  150
learning_rate =  5.0
MCC =  -0.4910242904869541
n_estimator =  150
learning_rate =  10.0
MCC =  -0.48795003647426655
n_estimator =  200
learning_rate =  0.2
MCC =  0.673845889867267
n_estimator =  200
learning_rate =  1.0
MCC =  0.697261016728272
n_estimator =  200
learning_rate =  2.0
MCC =  0.6929543849923757
n_estimator =  200
learning_rate =  5.0
MCC =  -0.4910242904869541
n_estimator =  200
learning_rate =  10.0
MCC =  -0.48795003647426655
n_estimator =  250
learning_rate =  0.2
MCC =  0.673845889867267
n_estimator =  250
learning_rate =  1.0
MCC =  0.7138219363193263
n_estimator =  250
learning_rate =  2.0
MCC =  0.6867877808060369
n_estimator =  250
learning_rate =  5.0
MCC =  -0.4910242904869541
n_estimator =  250
learning_rate =  10.0
MCC =  -0.48795003647426655
n_

## XGBoost

In [104]:
#!pip install xgboost
import xgboost as xgb

In [105]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

In [106]:
xgb_model.fit(x_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

In [111]:
y_preds = xgb_model.predict(x_test)

In [112]:
y_real = y_test.reshape(-1, )

In [113]:
mcc(y_preds, y_real)

0.7748758134087765