In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from scipy import stats

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler

from boruta import BorutaPy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.feature_selection import VarianceThreshold

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from CSV; the first column of the file becomes the row index.
DATASET_CSV = "breast_normal_primary_ENSG_2.0.csv"
df_breast = pd.read_csv(DATASET_CSV, index_col=0)

In [3]:
# Print a structural summary of the loaded frame (index range, column count, dtypes, memory).
df_breast.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1271 entries, TCGA-C8-A1HL-01 to GTEX-1117F-2826-SM-5GZXL
Columns: 60499 entries, label to ENSG00000181518.3
dtypes: float64(60498), int64(1)
memory usage: 586.7+ MB


In [4]:
# Feature matrix: every column of the dataset except the target column "label".
X = df_breast.drop(columns="label")

In [5]:
# Target vector: the "label" column on its own.
y = df_breast.loc[:, "label"]

In [None]:
#(df_breast.std() > 2).sum()

In [None]:
# filter X with std > threshold

#X_filter_std2 = X.loc[:, X.std() > 2]
#X_filter_std2.info()

In [6]:
# Keep only the columns whose standard deviation across samples exceeds 3.
column_std = X.std()
high_std_mask = column_std > 3
X_filter_std3 = X.loc[:, high_std_mask]
X_filter_std3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1271 entries, TCGA-C8-A1HL-01 to GTEX-1117F-2826-SM-5GZXL
Columns: 8264 entries, ENSG00000242268.2 to ENSG00000273233.1
dtypes: float64(8264)
memory usage: 80.1+ MB


In [None]:
# filter X by variance

#thresholder = VarianceThreshold(threshold=9)
#X_high_variance = thresholder.fit_transform(X)

#X_high_variance.shape

In [None]:
#type(X_high_variance)

In [7]:
# One seed reused by every randomised step in this notebook.
SEED = 1888

In [8]:
# Hold out 30% of the std-filtered samples as the validation set.
X_train_0, X_val, y_train_0, y_val = train_test_split(
    X_filter_std3,
    y,
    test_size=0.3,
    random_state=SEED,
)

In [None]:
# Structural summary of the held-out validation features.
X_val.info()

In [None]:
# Structural summary of the remaining (pre train/test split) features.
X_train_0.info()

In [9]:
# Split the non-validation part again: 70% for training, 30% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_0,
    y_train_0,
    test_size=0.3,
    random_state=SEED,
)

In [None]:
# scaling

#scaler = MinMaxScaler()

#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)
#X_val_scaled = scaler.transform(X_val)

In [10]:
# Report the shape of each train/test piece, one line per object.
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print("Number transactions {} dataset: ".format(split_name), split_data.shape)

Number transactions X_train dataset:  (622, 8264)
Number transactions y_train dataset:  (622,)
Number transactions X_test dataset:  (267, 8264)
Number transactions y_test dataset:  (267,)


In [11]:
# Upsample the minority class of the training split with SMOTE.
# Only X_train / y_train are resampled; the test and validation splits are left untouched.

print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=SEED)
# `fit_sample` was deprecated in imbalanced-learn 0.4 and removed in later releases;
# `fit_resample` is the supported spelling of the same operation.
# np.asarray() replaces Series.ravel() (deprecated in recent pandas) and still
# hands SMOTE a plain 1-D array of labels.
X_train_res, y_train_res = sm.fit_resample(X_train, np.asarray(y_train))

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

Before OverSampling, counts of label '1': 536
Before OverSampling, counts of label '0': 86 

After OverSampling, the shape of train_X: (1072, 8264)
After OverSampling, the shape of train_y: (1072,) 

After OverSampling, counts of label '1': 536
After OverSampling, counts of label '0': 536


In [12]:
# Remember the training column labels so they can be re-attached to the resampled data.

column_names = X_train.columns

In [13]:
# Wrap the SMOTE output in a DataFrame that carries the original column labels.
X_train_smote = pd.DataFrame(data=X_train_res, columns=column_names)
#X_train_smote

In [16]:
# X_validation to df

#X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=column_names)

In [15]:
# Boruta feature selection followed by a logistic-regression sanity check.
#
# NOTE: the metric result is stored in `val_f1`, not `f1_score`. Assigning the
# result back to `f1_score` replaced the imported function with a float, so the
# cell could only be re-run after importing `f1_score` again.

# Base estimator handed to BorutaPy
rf_boruta = RandomForestClassifier(n_jobs=-1, random_state=SEED)

# Perform Boruta on the oversampled training data
boruta = BorutaPy(rf_boruta, n_estimators='auto', verbose=2, alpha=0.005,
                  max_iter=25, perc=100, random_state=SEED)
boruta.fit(X_train_smote.values, y_train_res)

# Keep the columns flagged in boruta.support_ and fit Logistic Regression on them
cols = X_train_smote.columns[boruta.support_]
est_boruta = LogisticRegression(random_state=SEED)
est_boruta.fit(X_train_smote[cols], y_train_res)

# 5-fold cross-validation on the same (oversampled) training data
scores = cross_val_score(est_boruta, X_train_smote[cols], y_train_res, cv=5)

# F1 on the validation split, using the model fitted above
val_f1 = f1_score(y_val, est_boruta.predict(X_val[cols]))

print('Number of features selected: {}'.format(len(cols)))
print("F1 score {:.5f}".format(val_f1))
print("CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Iteration: 	1 / 25
Confirmed: 	0
Tentative: 	8264
Rejected: 	0


KeyboardInterrupt: 

In [34]:
# Display the boolean mask that the cell above uses to pick the selected columns.
boruta.support_

array([ True, False, False,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True, False, False,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True])

In [37]:
# Display the feature count reported by the fitted Boruta selector.
boruta.n_features_

36

In [19]:
# Oversampled training data restricted to the Boruta-selected columns.
X_filtered = X_train_smote.loc[:, cols]
#X_filtered

In [72]:
def boruta_tree(X_train_smote, y_train_res, X_test=None, random_state=SEED,
                y_test=None, n_rounds=2, top_n=100):
    """Select features with repeated Boruta rounds, then rank them with a random forest.

    Each round fits BorutaPy on the columns that survived the previous round,
    keeps the columns flagged in ``boruta.support_`` and prints the 5-fold
    cross-validated score of a logistic regression on them. A 10-tree random
    forest is then fitted on the surviving columns and its feature importances
    are used to rank them.

    Parameters
    ----------
    X_train_smote : pandas.DataFrame
        Training features (the caller in this notebook passes the oversampled frame).
    y_train_res : array-like
        Training labels aligned with ``X_train_smote``.
    X_test : pandas.DataFrame, optional
        Test features. When given together with test labels, the forest's test
        accuracy is printed. Must contain the selected columns.
    random_state : int
        Seed used for Boruta, the logistic regression and both random forests.
    y_test : array-like, optional
        Test labels. If omitted, the notebook-level ``y_test`` is used when it
        exists (this mirrors how the function previously read that global).
    n_rounds : int
        Number of consecutive Boruta rounds (default 2, as before).
    top_n : int
        Maximum number of ranked features returned (default 100, as before).

    Returns
    -------
    dict
        ``{"boruta_tree": [selected_features, feature_importances]}`` where
        ``selected_features`` is a pandas Index sorted by decreasing importance
        and ``feature_importances`` is the matching list of importances.

    Raises
    ------
    ValueError
        If ``n_rounds`` is smaller than 1 or a Boruta round keeps no feature.
    """
    if n_rounds < 1:
        raise ValueError("n_rounds must be at least 1")

    # Work on a local name so the caller's frame is never rebound or confused
    # with the progressively narrowed selection.
    X_selected = X_train_smote

    for _ in range(n_rounds):
        # Base estimator handed to BorutaPy
        rf_boruta = RandomForestClassifier(n_jobs=-1, random_state=random_state)

        # Perform Boruta on the currently surviving columns
        boruta = BorutaPy(rf_boruta, n_estimators='auto', verbose=2,
                          alpha=0.005, max_iter=30, perc=100, random_state=random_state)
        boruta.fit(X_selected.values, y_train_res)

        cols = X_selected.columns[boruta.support_]
        if len(cols) == 0:
            # Fail with a clear message instead of an obscure estimator error
            # on an empty feature matrix further down.
            raise ValueError("Boruta did not select any feature in this round")
        X_selected = X_selected[cols]

        # cross_val_score fits its own clones, so no separate fit is needed here.
        est_boruta = LogisticRegression(random_state=random_state)
        scores = cross_val_score(est_boruta, X_selected, y_train_res, cv=5)

        print("CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Random Forest for extracting feature importances on the final selection
    rf = RandomForestClassifier(n_estimators=10, criterion='gini', random_state=random_state)
    rf.fit(X_selected, y_train_res)

    if X_test is not None:
        if y_test is None:
            # Backward compatibility: fall back to the notebook-level labels.
            y_test = globals().get("y_test")
        if y_test is not None:
            # Predict with THIS forest on the selected columns; the previous code
            # referenced an `rf_pred` that was never computed inside the function.
            rf_pred = rf.predict(X_test[cols])
            print("Test Accuracy: {:.2f}".format(accuracy_score(y_test, rf_pred)))

    rf_coeff = pd.DataFrame({"feature": X_selected.columns,
                             "coefficient": rf.feature_importances_})
    rf_coeff_top = (rf_coeff.sort_values(by="coefficient", ascending=False)
                    .head(top_n)
                    .set_index("feature"))

    selected_features = rf_coeff_top.index
    feature_importances = rf_coeff_top.coefficient.tolist()

    return {"boruta_tree": [selected_features, feature_importances]}

NameError: name 'rf_coeff_top' is not defined

In [73]:
# Pass the test features explicitly: `X_test` is a parameter of boruta_tree and
# the call fails with a TypeError when it is left out.
boruta_tree(X_train_smote, y_train_res, X_test, random_state=SEED)

Iteration: 	1 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	2 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	3 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	4 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	5 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	6 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	7 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	8 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	9 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	10 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	11 / 30
Confirmed: 	0
Tentative: 	8264
Rejected: 	0
Iteration: 	12 / 30
Confirmed: 	0
Tentative: 	2686
Rejected: 	5578
Iteration: 	13 / 30
Confirmed: 	339
Tentative: 	2347
Rejected: 	5578
Iteration: 	14 / 30
Confirmed: 	339
Tentative: 	2347
Rejected: 	5578
Iteration: 	15 / 30
Confirmed: 	339
Tentative: 	2347
Rejected: 	5578
Iteration: 	16 / 30
Confirmed: 

{'boruta_tree': [Index(['ENSG00000219928.2', 'ENSG00000256508.2', 'ENSG00000224126.2',
         'ENSG00000123500.9', 'ENSG00000260105.6', 'ENSG00000229237.2',
         'ENSG00000144230.16', 'ENSG00000262097.1', 'ENSG00000279473.1',
         'ENSG00000179751.6', 'ENSG00000235734.4', 'ENSG00000029559.6',
         'ENSG00000241431.1', 'ENSG00000224958.5', 'ENSG00000187922.13',
         'ENSG00000235875.3', 'ENSG00000197561.6', 'ENSG00000198099.8',
         'ENSG00000281974.1', 'ENSG00000067048.16', 'ENSG00000277741.4',
         'ENSG00000240509.1', 'ENSG00000279208.1', 'ENSG00000230838.1',
         'ENSG00000092054.12', 'ENSG00000267653.1', 'ENSG00000269124.1',
         'ENSG00000253683.1', 'ENSG00000175206.10', 'ENSG00000236754.5',
         'ENSG00000172023.7', 'ENSG00000237070.1', 'ENSG00000272573.5',
         'ENSG00000235563.1', 'ENSG00000189292.15', 'ENSG00000124440.15',
         'ENSG00000244538.1', 'ENSG00000115386.5', 'ENSG00000275152.4',
         'ENSG00000169248.12', 'ENSG000001

In [17]:
# Number of columns currently held in `cols`.
cols.shape

(117,)

In [47]:
#X_filtered.columns[boruta.support_]

#### Tree after boruta

In [20]:
# Training matrix restricted to the columns in `cols`; show its dimensions.
X_filtered_ab = X_train_smote.loc[:, cols]
X_filtered_ab.shape

(1072, 117)

In [26]:
# Fit a 10-tree random forest on the selected columns and score it on train and test data.
rf = RandomForestClassifier(n_estimators=10, criterion="gini", random_state=SEED)
rf.fit(X_filtered_ab, y_train_res)

rf_pred = rf.predict(X_test[cols])
train_pred = rf.predict(X_filtered_ab)

print(classification_report(y_test, rf_pred))

train_acc = accuracy_score(y_train_res, train_pred)
test_acc = accuracy_score(y_test, rf_pred)
print("Train Accuracy: {:.2f}".format(train_acc))
print("Test Accuracy: {:.2f}".format(test_acc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       1.00      1.00      1.00       230

   micro avg       1.00      1.00      1.00       267
   macro avg       1.00      1.00      1.00       267
weighted avg       1.00      1.00      1.00       267

Train Accuracy: 1.00
Test Accuracy: 1.00


In [61]:
# Pair each selected column with its forest importance and keep the 100 largest.
feature_names = X_filtered_ab.columns
rf_coeff = pd.DataFrame(
    {"feature": feature_names, "coefficient": rf.feature_importances_}
)
rf_coeff_top = (
    rf_coeff
    .sort_values(by="coefficient", ascending=False)
    .head(100)
    .set_index("feature")
)
#rf_coeff_top

In [59]:
# Feature names of the ranked table, in decreasing order of importance.
selected_features = rf_coeff_top.index

In [70]:
# Importances of the ranked features as a plain Python list, in the same order as the index.
feature_importances = rf_coeff_top["coefficient"].tolist()

In [71]:
# Bundle the ranked feature names and their importances under one key, then display the result.
dictionary = {"boruta_tree": [selected_features, feature_importances]}
dictionary

{'boruta_tree': [Index(['ENSG00000219928.2', 'ENSG00000256508.2', 'ENSG00000224126.2',
         'ENSG00000123500.9', 'ENSG00000260105.6', 'ENSG00000229237.2',
         'ENSG00000144230.16', 'ENSG00000262097.1', 'ENSG00000279473.1',
         'ENSG00000179751.6', 'ENSG00000235734.4', 'ENSG00000029559.6',
         'ENSG00000241431.1', 'ENSG00000224958.5', 'ENSG00000187922.13',
         'ENSG00000235875.3', 'ENSG00000197561.6', 'ENSG00000198099.8',
         'ENSG00000281974.1', 'ENSG00000067048.16', 'ENSG00000277741.4',
         'ENSG00000240509.1', 'ENSG00000279208.1', 'ENSG00000230838.1',
         'ENSG00000092054.12', 'ENSG00000267653.1', 'ENSG00000269124.1',
         'ENSG00000253683.1', 'ENSG00000175206.10', 'ENSG00000236754.5',
         'ENSG00000172023.7', 'ENSG00000237070.1', 'ENSG00000272573.5',
         'ENSG00000235563.1', 'ENSG00000189292.15', 'ENSG00000124440.15',
         'ENSG00000244538.1', 'ENSG00000115386.5', 'ENSG00000275152.4',
         'ENSG00000169248.12', 'ENSG000001

{'boruta_tree': Index(['ENSG00000219928.2', 'ENSG00000256508.2', 'ENSG00000224126.2',
        'ENSG00000123500.9', 'ENSG00000260105.6', 'ENSG00000229237.2',
        'ENSG00000144230.16', 'ENSG00000262097.1', 'ENSG00000279473.1',
        'ENSG00000179751.6', 'ENSG00000235734.4', 'ENSG00000029559.6',
        'ENSG00000241431.1', 'ENSG00000224958.5', 'ENSG00000187922.13',
        'ENSG00000235875.3', 'ENSG00000197561.6', 'ENSG00000198099.8',
        'ENSG00000281974.1', 'ENSG00000067048.16', 'ENSG00000277741.4',
        'ENSG00000240509.1', 'ENSG00000279208.1', 'ENSG00000230838.1',
        'ENSG00000092054.12', 'ENSG00000267653.1', 'ENSG00000269124.1',
        'ENSG00000253683.1', 'ENSG00000175206.10', 'ENSG00000236754.5',
        'ENSG00000172023.7', 'ENSG00000237070.1', 'ENSG00000272573.5',
        'ENSG00000235563.1', 'ENSG00000189292.15', 'ENSG00000124440.15',
        'ENSG00000244538.1', 'ENSG00000115386.5', 'ENSG00000275152.4',
        'ENSG00000169248.12', 'ENSG00000188626.6', 'ENS