In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_recall_curve
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFdr
from sklearn.model_selection import LeavePOut, StratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.svm import SVR
from ReliefF import ReliefF
from sklearn.model_selection import KFold
import time
from sklearn.feature_selection import SelectKBest

In [None]:
!pip install numpy Cython
!pip install -U pymrmr


!CC=gcc-10 CXX=g++-10 pip install -U pymrmr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pymrmr

In [None]:
def get_cv(df):
    """Choose a cross-validation splitter from the number of rows in ``df``.

    Parameters
    ----------
    df : object exposing ``shape`` (only ``shape[0]`` is read).

    Returns
    -------
    tuple
        ``(splitter, label)`` where the pair depends on the row count:

        * fewer than 50 rows   -> ``LeavePOut(2)``, ``'LeavePairOut'``
        * 50 to 99 rows        -> ``LeaveOneOut()``, ``'LeaveOneOut'``
        * 100 to 999 rows      -> ``StratifiedKFold(n_splits=10)``, ``'10Folds'``
        * 1000 rows or more    -> ``StratifiedKFold(n_splits=5)``, ``'5Folds'``
    """
    sample_count = df.shape[0]
    if sample_count >= 1000:
        splitter, label = StratifiedKFold(n_splits=5), '5Folds'
    elif sample_count >= 100:
        splitter, label = StratifiedKFold(n_splits=10), '10Folds'
    elif sample_count >= 50:
        splitter, label = LeaveOneOut(), 'LeaveOneOut'
    else:
        splitter, label = LeavePOut(2), 'LeavePairOut'
    return splitter, label

In [None]:
def _fold_roc_auc(y_test, probs):
    """Return the ROC-AUC of one fold, or ``None`` when it cannot be computed.

    With exactly two probability columns the second column is scored as a
    binary problem; otherwise the whole matrix is scored one-vs-one.
    """
    try:
        if probs.ndim == 2 and probs.shape[1] == 2:
            return roc_auc_score(y_test, probs[:, 1])
        return roc_auc_score(y_test, probs, multi_class='ovo')
    except Exception:
        # Best effort, as before: e.g. a fold holding a single class has no
        # defined AUC. Unlike a bare ``except`` this lets Ctrl-C through.
        return None


def _fold_pr_auc(y_test, probs, classes):
    """Return the area under the precision-recall curve of one fold, or ``None``.

    ``probs[:, 1]`` is scored against ``classes[1]`` as the positive label, so
    the label always matches the probability column being evaluated.
    """
    # Function-scope import: ``auc`` is not among the notebook's top-level imports.
    from sklearn.metrics import auc

    if probs.ndim != 2 or probs.shape[1] < 2 or len(classes) < 2:
        return None
    pos_label = classes[1]
    # The curve is only meaningful when the fold holds at least two classes
    # and the positive one is among them.
    if len(y_test.unique()) < 2 or not (y_test == pos_label).any():
        return None
    try:
        precision, recall, _ = precision_recall_curve(y_test, probs[:, 1], pos_label=pos_label)
        return auc(recall, precision)
    except Exception:
        return None


def run_classification(model, X, y):
    """Cross-validate ``model`` on ``X``/``y`` and collect per-fold measurements.

    The splitter comes from ``get_cv(X)``; its label is printed once.

    Parameters
    ----------
    model : estimator providing ``fit``, ``predict`` and ``predict_proba``.
    X : pandas.DataFrame
        Feature table; rows are selected with ``.iloc``.
    y : pandas.Series
        Target labels; rows are selected with ``.iloc``.

    Returns
    -------
    tuple
        ``(fit_times, pred_times, aucs, accs, mccs, praucs, total_folds, cv_name)``.
        The first six items are lists with one entry per fold. ``aucs`` and
        ``praucs`` hold ``None`` for folds where the metric is undefined.
        ``praucs`` holds scalar areas under the precision-recall curve.
        ``pred_times`` measures the ``predict_proba`` call only.
    """
    cv, cv_name = get_cv(X)
    print(cv_name)
    fit_times, pred_times = [], []
    aucs, accs, mccs, praucs = [], [], [], []
    total_folds = cv.get_n_splits(X, y)
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        started = time.time()
        model.fit(X_train, y_train)
        fit_times.append(time.time() - started)

        started = time.time()
        probs = model.predict_proba(X_test)
        pred_times.append(time.time() - started)
        preds = model.predict(X_test)

        # Column order of ``probs`` follows the fitted model's class order;
        # fall back to the sorted training labels if the model does not expose it.
        classes = getattr(model, 'classes_', None)
        if classes is None:
            classes = np.unique(y_train)

        aucs.append(_fold_roc_auc(y_test, probs))
        accs.append(accuracy_score(y_test, preds))
        mccs.append(matthews_corrcoef(y_test, preds))
        praucs.append(_fold_pr_auc(y_test, probs, classes))
    return fit_times, pred_times, aucs, accs, mccs, praucs, total_folds, cv_name

In [None]:
def ReliefF_selector(x, y, k):
    """Select the ``k`` best columns of ``x`` according to ReliefF.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table.
    y : array-like
        Target labels, one per row of ``x``.
    k : int
        Number of features to keep.

    Returns
    -------
    tuple
        ``(x_selected, scores, f_names, feature_names_in_)`` where
        ``x_selected`` holds the chosen columns (best first), ``scores`` are
        the ReliefF scores of exactly those columns in the same order,
        ``f_names`` are their column names and ``feature_names_in_`` is
        ``x.columns``.
    """
    fs = ReliefF(n_neighbors=1, n_features_to_keep=k)
    # Hand over plain arrays so positional indexing inside the package does
    # not depend on the pandas index of ``y``.
    fs.fit(x.to_numpy(), np.asarray(y))
    # NOTE(review): relies on ``top_features`` holding column positions sorted
    # best-first and ``feature_scores`` being indexed by column position, as
    # in the ReliefF package - confirm against the installed version.
    top_positions = np.asarray(fs.top_features[:k])
    scores = np.asarray(fs.feature_scores)[top_positions]
    feature_names_in_ = x.columns
    # Return names (not positions) so the result matches the other selectors.
    f_names = feature_names_in_[top_positions].to_numpy()
    x_to = x.iloc[:, top_positions]
    return x_to, scores, f_names, feature_names_in_

In [None]:
def rfe_selector(x, y, k):
    """Select ``k`` columns of ``x`` by recursive feature elimination.

    A linear-kernel ``SVR`` is refitted while one feature is removed per step
    until ``k`` remain.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table.
    y : array-like
        Target values, one per row of ``x``.
    k : int
        Number of features to keep.

    Returns
    -------
    tuple
        ``(x_selected, scores, f_names, feature_names_in_)`` where ``scores``
        are the absolute coefficients of the final estimator, one per selected
        feature in the order of ``f_names``, and ``feature_names_in_`` is
        ``x.columns``.
    """
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, n_features_to_select=k, step=1)
    selector.fit(x, y)
    f_names = selector.get_feature_names_out(x.columns)
    # RFE exposes no ``scores_`` attribute; the final estimator, fitted on the
    # surviving features only, supplies one coefficient per selected feature.
    scores = np.abs(np.ravel(selector.estimator_.coef_))
    return x[f_names], scores, f_names, x.columns

In [None]:
def fdr_selector(x, y, k):
    """Filter columns with ``SelectFdr`` and keep the ``k`` best of the survivors.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table.
    y : array-like
        Target labels, one per row of ``x``.
    k : int
        Number of features ``SelectKBest`` keeps from the filtered table.

    Returns
    -------
    tuple
        ``(x_selected, scores, f_names, feature_names_in_)`` where ``scores``
        is ``SelectKBest.scores_`` computed on the columns that passed the
        ``alpha=0.01`` filter, ``f_names`` are the finally kept column names
        and ``feature_names_in_`` are the names that passed the filter.
    """
    # Stage 1: keep only the columns that pass the false-discovery-rate filter.
    fdr_stage = SelectFdr(alpha=0.01).fit(x, y)
    surviving_names = fdr_stage.get_feature_names_out(x.columns)
    x_surviving = x[surviving_names]

    # Stage 2: rank the survivors and keep the k best.
    best_stage = SelectKBest(k=k).fit(x_surviving, y)
    kept_names = best_stage.get_feature_names_out(x_surviving.columns)
    return x_surviving[kept_names], best_stage.scores_, kept_names, surviving_names

In [None]:
def mRMR_selector(df, x, y, k):
  """Select ``k`` features with pymrmr's mRMR (``'MIQ'`` scheme).

  Parameters
  ----------
  df : pandas.DataFrame
      Full table containing the target column ``'y'`` and every feature.
  x : pandas.DataFrame
      Feature table (``df`` without ``'y'``).
  y : array-like
      Target labels, one per row of ``x``.
  k : int
      Number of features to select.

  Returns
  -------
  tuple
      ``(x_selected, scores, names_f, feature_names_in_)`` where ``names_f``
      is the list returned by ``pymrmr.mRMR``, ``scores`` is
      ``SelectKBest.scores_`` computed on exactly those columns, and
      ``feature_names_in_`` are all feature names of ``df`` (``'y'`` excluded).
  """
  # Move 'y' to the front without assuming where it sits in ``df``; the
  # earlier duplicate-then-drop-last trick only worked when 'y' was last.
  # NOTE(review): pymrmr is understood to read the first column as the class
  # label - confirm against the package documentation.
  feature_cols = [col for col in df.columns.to_list() if col != 'y']
  mrmr_input = df[['y'] + feature_cols]
  names_f = pymrmr.mRMR(mrmr_input, 'MIQ', k)
  fs_f = SelectKBest(k=k)
  fs_f.fit(x[names_f], y)
  feature_names_in_ = mrmr_input.columns[1:]
  return x[names_f], fs_f.scores_, names_f, feature_names_in_


In [None]:
def get_model(model):
    """Return a freshly constructed classifier for the short name ``model``.

    Recognised names are ``'RandomForest'``, ``'LogisticRegression'``,
    ``'svm'`` (an ``SVC`` built with ``probability=True``), ``'NB'`` and
    ``'knn'``; every other estimator uses its default arguments.

    Raises
    ------
    KeyError
        If ``model`` is not one of the recognised names.
    """
    factories = {
        'RandomForest': RandomForestClassifier,
        'LogisticRegression': LogisticRegression,
        'svm': lambda: SVC(probability=True),
        'NB': GaussianNB,
        'knn': KNeighborsClassifier,
    }
    build = factories[model]
    return build()

In [None]:
def main():
    """Benchmark feature selection plus classification on every dataset.

    For each dataset ``datasets/dataset_{i}.csv`` (``i`` is the position of
    the dataset in ``names``) and each ``k`` in ``ks``, features are selected
    with ``mRMR_selector`` and every classifier in ``models`` is evaluated
    with ``run_classification``. Seven rows are recorded per
    (k, classifier) pair - one per measure - and the rows of one dataset are
    written to ``fre_selector_{i}.csv``.

    A dataset that raises is reported on stdout and skipped, so one failure
    does not stop the remaining datasets.
    """
    ks = [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 50, 100]
    names = ['CLL-SUB-111', 'ALLAML', 'BASEHOCK', 'COIL20', 'Carcinom', 'pone.0202167.s016', 'pone.0202167.s017',
             'bladderbatch', 'ayeastCC', 'breastCancerVDX', 'curatedOvarianData', 'leukemiasEset', 'Lung', 'Lymphoma',
             'MLL', 'SRBCT', 'CNS', 'pone.0202167.s011', 'pone.0202167.s012', 'pone.0202167.s015']
    columns = ['Dataset Name', 'Number of samples', 'Original Number of features', 'Filtering Algorithm',
               'Learning algorithm', 'Number of features selected (K)', 'CV Method', 'Fold', 'Measure Type',
               'Measure Value', 'List of Selected Features Names', 'Selected Features scores']
    models = ['svm', 'knn', 'RandomForest', 'NB', 'LogisticRegression']
    filter_name = 'mRMR'

    for i, dataset_name in enumerate(names):
        try:
            rows = []
            df = pd.read_csv(f'datasets/dataset_{i}.csv').drop(columns=['Unnamed: 0'])
            x = df.drop(columns=['y'])
            y = df['y']
            # Taken from the unfiltered table so the 'Original Number of
            # features' column is not confused with the selected count.
            n_samples, n_original_features = x.shape
            print(n_original_features)
            for k in ks:
                started = time.time()
                # Alternative selectors with the same return layout:
                # fdr_selector(x, y, k), rfe_selector(x, y, k), ReliefF_selector(x, y, k)
                X, f_scores, names_f, feature_names_in_ = mRMR_selector(df, x, y, k)
                feature_selection_time = time.time() - started
                for m in models:
                    model = get_model(m)
                    (fit_times, pred_times, aucs, accs, mccs, praucs,
                     total_folds, cv_name) = run_classification(model, pd.DataFrame(X), y)
                    measures = [
                        ('AUC', aucs),
                        ('ACC', accs),
                        ('MCC', mccs),
                        ('PR-AU', praucs),
                        ('prediction time', pred_times),
                        ('fit time', fit_times),
                        ('feature selection time', feature_selection_time),
                    ]
                    for measure_name, measure_value in measures:
                        rows.append([dataset_name, n_samples, n_original_features, filter_name, m, k,
                                     cv_name, total_folds, measure_name, measure_value,
                                     str(names_f), f_scores])
            print('-----------------------------------------')
            print(i)
            # Build the table once instead of growing a DataFrame row by row.
            pd.DataFrame(rows, columns=columns).to_csv(f'fre_selector_{i}.csv')
        except Exception as e:
            print(e)
            print(f'WORNING - {i} failed')

In [None]:
main()