In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read files from it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Tuned hyper-parameter presets, used when a method's entry in `parameters`
# is one of the preset keywords below.
_RECOMMENDED_PARAMS = {
  "RF": {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 16, 'n_estimators': 200},
  "LGBM": {'n_estimators': 200},
  "XGB": {'colsample_bytree': 0.5, 'n_estimators': 200, 'subsample': 0.75},
  "EXtra": {'criterion': 'gini', 'max_features': 'log2', 'min_samples_split': 4, 'n_estimators': 500},
  "Ada": {'algorithm': 'SAMME', 'learning_rate': 1, 'n_estimators': 200},
}

# Method name -> estimator factory. The constructors are wrapped in lambdas so
# an estimator class is only looked up when that method is actually requested.
_MODEL_FACTORIES = {
  "RF": lambda params: RandomForestClassifier(**params),
  "LGBM": lambda params: LGBMClassifier(**params),
  "XGB": lambda params: XGBClassifier(**params),
  "EXtra": lambda params: ExtraTreesClassifier(**params),
  "Ada": lambda params: AdaBoostClassifier(**params),
}

# "recommanded" is the historical (misspelled) keyword that existing callers
# pass, so it must keep working; the correct spelling is accepted as an alias.
_PRESET_KEYWORDS = ("recommanded", "recommended")


def _resolve_parameters(method, parameters):
  """Return the constructor keyword arguments to use for `method`."""
  if method not in parameters:
    return {}
  spec = parameters[method]
  if isinstance(spec, str):
    if spec == "default":
      return {}
    if spec in _PRESET_KEYWORDS:
      # Copy so the shared preset table can never be mutated through the result.
      return dict(_RECOMMENDED_PARAMS[method])
    raise TypeError(
        f"parameters[{method!r}] must be a dict of keyword arguments, "
        f"'default' or 'recommanded'; got {spec!r}")
  return spec


def model_rank(data, methods, parameters=None):
  """Fit tree-based classifiers and rank the features by importance.

  The last column of `data` is used as the target and all other columns as
  features. Each requested model is fitted on the training part of a
  stratified 80/20 split (random_state=97); the held-out part is not used.
  The estimators themselves are not seeded unless a seed is passed through
  `parameters`.

  Args:
    data: pandas DataFrame whose last column is the class label.
    methods: iterable of method names; each must be one of
      "RF", "LGBM", "XGB", "EXtra", "Ada".
    parameters: optional dict mapping a method name to either a dict of
      constructor keyword arguments, the string "default" (library defaults),
      or the string "recommanded" / "recommended" (built-in preset).
      Methods missing from the dict use library defaults.

  Returns:
    A 3-tuple `(ranking_df, separator, importance_df)` where `ranking_df` has
    one integer `ranking_<method>` column per method (1 = most important,
    ties share the lowest rank), `separator` is a string of 100 dashes, and
    `importance_df` has one `importances_<method>` column per method. Both
    frames are indexed by feature name.

  Raises:
    NameError: if a method name is not supported.
    TypeError: if a parameter entry is a string other than the keywords above.
  """
  if parameters is None:
    parameters = {}
  # Materialise once: `methods` is iterated twice below, which would silently
  # yield nothing the second time for a one-shot iterable.
  methods = list(methods)

  models = []
  for method in methods:
    if method not in _MODEL_FACTORIES:
      raise NameError(
          f"Unknown method {method!r}; expected one of {list(_MODEL_FACTORIES)}")
    models.append(_MODEL_FACTORIES[method](_resolve_parameters(method, parameters)))

  X_features = data.iloc[:, :-1]
  y_target = data.iloc[:, -1]
  # Only the training part is used; the split is kept so the fitted models
  # (and therefore the rankings) match what this function always produced.
  X_train, _, Y_train, _ = train_test_split(
      X_features, y_target, test_size=0.2, random_state=97, stratify=y_target)

  importance_columns = []
  ranking_columns = []
  for model, method in zip(models, methods):
    fitted = model.fit(X_train, Y_train)
    importance = pd.Series(fitted.feature_importances_,
                           index=X_train.columns,
                           name=f'importances_{method}')
    # Highest importance gets rank 1; ties share the lowest rank of the group.
    ranking = (importance.rank(method='min', ascending=False)
               .astype('int')
               .rename(f'ranking_{method}'))
    importance_columns.append(importance)
    ranking_columns.append(ranking)

  # Build each frame once instead of re-concatenating inside the loop.
  importance_df = pd.concat(importance_columns, axis=1) if importance_columns else pd.DataFrame()
  ranking_df = pd.concat(ranking_columns, axis=1) if ranking_columns else pd.DataFrame()

  return ranking_df, '-'*100, importance_df

In [None]:
# Load the dataset from the mounted Drive. model_rank treats the last column
# as the class label and every other column as a feature.
data = pd.read_csv("/content/drive/MyDrive/암마커추출/data.csv", sep=',')
# Bare last expression: display the loaded frame as the cell output.
data

Unnamed: 0,?|100130426,?|100133144,?|100134869,?|10357,?|10431,?|136542,?|155060,?|26823,?|280660,?|317712,...,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590,tAKR|389932,Cancer_info
0,0.0,10.0113,11.2820,49.5994,848.9397,0.0,345.2308,1.0472,0.0000,0.0,...,292.5212,959.2460,0.6981,1088.0531,2837.9440,871.2802,575.2683,6.6323,0.0000,0
1,0.0,7.1957,12.4436,90.5117,924.0158,0.0,145.2025,1.6098,0.0000,0.0,...,493.5597,1179.3275,20.2833,787.5061,2351.2500,1138.1170,690.2752,179.9738,0.0000,0
2,0.0,7.2453,6.0184,49.5366,1140.6781,0.0,51.7284,0.0000,0.0000,0.0,...,365.4149,843.9028,26.5274,475.1720,5437.4534,1170.5214,532.8691,6.3003,0.0000,0
3,0.0,11.3311,7.5740,82.8303,807.1729,0.0,240.0221,0.4786,0.2393,0.0,...,346.7517,946.6872,64.8514,908.1593,6770.1537,1169.2401,663.8297,35.1777,0.0000,0
4,0.0,3.2254,3.4942,72.5351,562.0037,0.0,274.2822,0.6109,0.0000,0.0,...,563.2254,1320.7086,7.9414,778.8638,3341.4783,1737.3244,723.2743,378.1307,0.0000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,0.0,19.9503,47.1026,176.7177,1188.3278,0.0,226.8212,1.6556,0.0000,0.0,...,507.4503,1392.7980,126.2417,580.2980,2681.7053,1069.5364,783.1126,5.3808,1.2417,1
1006,0.0,30.0872,15.2957,188.7215,1248.0303,0.0,147.4945,0.4202,0.0000,0.0,...,366.8453,1326.6099,6.3032,845.8872,6094.3376,663.9353,778.6532,2.1011,0.0000,1
1007,0.0,53.6593,33.3907,260.3332,789.3606,0.0,854.3794,0.0000,0.0000,0.0,...,336.9156,2002.6867,101.5583,613.6486,2560.9887,730.2526,761.9559,12.3589,0.0000,1
1008,0.0,72.5666,42.1832,160.1624,460.8626,0.0,569.2226,1.3312,0.0000,0.0,...,584.1321,2074.5474,150.4260,1155.2183,1185.0373,833.3333,1383.1203,1.0650,0.0000,1


In [None]:
# Rank features with a random forest (explicit hyper-parameters) and XGBoost
# (the built-in 'recommanded' preset). The cell output is the returned
# (ranking_df, separator, importance_df) tuple.
model_rank(data, ['RF','XGB'], {'RF':{'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 16, 'n_estimators': 200}, 'XGB':'recommanded'})

(                  ranking_RF  ranking_XGB
 ?|100130426             2174          379
 ?|100133144             2174          379
 ?|100134869             1189          379
 ?|10357                  805          379
 ?|10431                 2174          379
 ...                      ...          ...
 ZYX|7791                2174          379
 ZZEF1|23140             2174          379
 ZZZ3|26009              2174          379
 psiTPTE22|387590        2174          379
 tAKR|389932             2174          379
 
 [20531 rows x 2 columns],
 '----------------------------------------------------------------------------------------------------',
                   importances_RF  importances_XGB
 ?|100130426             0.000000              0.0
 ?|100133144             0.000000              0.0
 ?|100134869             0.000056              0.0
 ?|10357                 0.000120              0.0
 ?|10431                 0.000000              0.0
 ...                          ...           