
Thư viện



In [277]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [278]:
import numpy as np
import pandas as pd
import scipy.io
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, roc_auc_score, f1_score
from scipy.sparse import *
from sklearn.metrics.pairwise import pairwise_distances
import time
import math

from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.model_selection import train_test_split

In [279]:
def _unit_norm_rows(X):
    """Return a float copy of X in which every row is divided by its L2 norm.

    Norms smaller than 1e-12 are replaced by 1e-12, so an all-zero row stays
    all-zero instead of triggering a division by zero.
    """
    X = np.asarray(X, dtype=float)
    row_norms = np.power(np.sum(X*X, axis=1), 0.5)
    return X/np.maximum(row_norms, 1e-12)[:, np.newaxis]


def _symmetrize_max(W):
    """Return W with every pair (i, j) / (j, i) replaced by the larger of the two entries."""
    W_t = W.transpose()
    bigger = W_t > W
    return W - W.multiply(bigger) + W_t.multiply(bigger)


def _knn_affinity(X, groups, k, metric, weight_mode, t, n_samples):
    """Build the symmetric k-nearest-neighbour affinity matrix.

    `groups` is a list of index arrays; neighbours are searched only among the
    members of the same group.  An entry of None stands for "all samples".
    Each sample is linked to the first k+1 entries of its own ranking row
    (the row also contains the sample itself, hence k+1 rather than k).
    `metric` must be 'euclidean' or 'cosine'; for 'cosine' the rows of X are
    expected to be unit-normalised already.
    """
    rows, cols, vals = [], [], []
    for members in groups:
        if members is None:
            members = np.arange(n_samples)
            X_group = X
        else:
            X_group = X[members, :]

        # `ranking` is built so that ascending order always means "closest first":
        # squared euclidean distance, or negated cosine similarity.
        if metric == 'euclidean':
            ranking = pairwise_distances(X_group)
            ranking **= 2
        else:
            ranking = -np.dot(X_group, np.transpose(X_group))

        nearest = np.argsort(ranking, axis=1)[:, 0:k+1]
        if nearest.shape[1] != k+1:
            raise ValueError('k=%s requires at least k+1 samples to choose neighbours from, '
                             'but only %d are available' % (k, nearest.shape[1]))

        if weight_mode == 'binary':
            weights = np.ones(nearest.size)
        else:
            picked = np.take_along_axis(ranking, nearest, axis=1)
            if weight_mode == 'heat_kernel':
                picked = np.exp(-picked/(2*t*t))
            else:
                # undo the negation used for ranking: the weight is the cosine similarity
                picked = -picked
            weights = np.ravel(picked, order='F')

        # column-major flattening: first the nearest entry of every sample, then the second, ...
        rows.append(np.tile(members, (k+1, 1)).reshape(-1))
        cols.append(np.ravel(members[nearest], order='F'))
        vals.append(weights)

    W = csc_matrix((np.concatenate(vals), (np.concatenate(rows), np.concatenate(cols))),
                   shape=(n_samples, n_samples))
    return _symmetrize_max(W)


def _fisher_affinity(y, label, n_samples):
    """Fisher-score weights: W_ij = 1/n_l if y_i = y_j = l, otherwise W_ij = 0."""
    W = lil_matrix((n_samples, n_samples))
    for i in range(label.size):
        class_idx = (y == label[i])
        class_idx_all = (class_idx[:, np.newaxis] & class_idx[np.newaxis, :])
        W[class_idx_all] = 1.0/np.sum(np.sum(class_idx))
    return W


def _relieff_affinity(X, y, label, k, n_samples):
    """ReliefF weights.

    NH(x) and NM(x, y) denote the k nearest points to x with the same class as x
    and with a different class y, respectively.  W_ij = 1 if i = j;
    W_ij = 1/k if x_j in NH(x_i); W_ij = -1/((c-1)k) if x_j in NM(x_i, y).

    NOTE(review): the statements below are kept in their original order on purpose.
    `k` is lowered permanently as soon as a class with at most k members is met,
    which also affects the classes processed afterwards and the miss part.
    Confirm whether that is intended before changing it.
    """
    n_classes = label.size
    # when xj in NH(xi)
    G = np.zeros((n_samples*(k+1), 3))
    id_now = 0
    for i in range(n_classes):
        class_idx = np.column_stack(np.where(y == label[i]))[:, 0]
        D = pairwise_distances(X[class_idx, :])
        D **= 2
        idx = np.argsort(D, axis=1)
        idx_new = idx[:, 0:k+1]
        n_smp_class = (class_idx[idx_new[:]]).size
        if len(class_idx) <= k:
            k = len(class_idx) - 1
        G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1)
        G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F')
        G[id_now:n_smp_class+id_now, 2] = 1.0/k
        id_now += n_smp_class
    W1 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples))
    # when i = j, W_ij = 1
    for i in range(n_samples):
        W1[i, i] = 1
    # when x_j in NM(x_i, y)
    G = np.zeros((n_samples*k*(n_classes - 1), 3))
    id_now = 0
    for i in range(n_classes):
        class_idx1 = np.column_stack(np.where(y == label[i]))[:, 0]
        X1 = X[class_idx1, :]
        for j in range(n_classes):
            if label[j] != label[i]:
                class_idx2 = np.column_stack(np.where(y == label[j]))[:, 0]
                X2 = X[class_idx2, :]
                D = pairwise_distances(X1, X2)
                idx = np.argsort(D, axis=1)
                idx_new = idx[:, 0:k]
                n_smp_class = len(class_idx1)*k
                G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx1, (k, 1)).reshape(-1)
                G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx2[idx_new[:]], order='F')
                G[id_now:n_smp_class+id_now, 2] = -1.0/((n_classes-1)*k)
                id_now += n_smp_class
    W2 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples))
    return W1 + _symmetrize_max(W2)


def construct_W(X, **kwargs):
    """
    Construct the sample-by-sample affinity matrix W of a data set.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data; it is never modified (cosine modes work on a normalised float copy)
    kwargs: {dictionary}
        metric: {'euclidean', 'cosine'}
            how neighbours are ranked (default 'cosine'); overridden to 'euclidean' by
            weight_mode='heat_kernel' and to 'cosine' by weight_mode='cosine'
        neighbor_mode: {'knn', 'supervised'}
            'knn' searches neighbours among all samples, 'supervised' only among samples
            with the same label (default 'knn')
        k: {int}
            number of neighbours (default 5)
        y: {numpy array}, shape (n_samples,)
            class labels, required when neighbor_mode='supervised'
        weight_mode: {'binary', 'heat_kernel', 'cosine'}
            value stored for a neighbour pair: 1, exp(-d^2/(2 t^2)) or the cosine
            similarity (default 'binary')
        t: {float}
            width of the heat kernel (default 1)
        fisher_score: {boolean}
            supervised mode only; if True, return fisher-score weights instead (default False)
        reliefF: {boolean}
            supervised mode only; if True, return reliefF weights instead (default False)

    Output
    ------
    W: {sparse matrix}, shape (n_samples, n_samples)
        affinity matrix, or None when the requested neighbor_mode / weight_mode /
        metric combination is not one of those listed above

    Raises
    ------
    ValueError
        if neighbor_mode='supervised' is requested without y, or if k is too large
        for the number of samples available as neighbours
    """
    kwargs.setdefault('metric', 'cosine')
    kwargs.setdefault('neighbor_mode', 'knn')
    neighbor_mode = kwargs['neighbor_mode']
    if neighbor_mode == 'knn' or neighbor_mode == 'supervised':
        kwargs.setdefault('k', 5)
    if neighbor_mode == 'supervised' and 'y' not in kwargs:
        # raise instead of print + exit(0): terminating the interpreter from a
        # library function hides the problem and cannot be handled by the caller
        raise ValueError('label y is required in the supervised neighbor_mode')

    kwargs.setdefault('weight_mode', 'binary')
    weight_mode = kwargs['weight_mode']
    if weight_mode == 'heat_kernel':
        # the heat kernel is defined on euclidean distances
        kwargs['metric'] = 'euclidean'
        kwargs.setdefault('t', 1)
    elif weight_mode == 'cosine':
        kwargs['metric'] = 'cosine'
    metric = kwargs['metric']

    n_samples, _ = np.shape(X)

    if neighbor_mode == 'knn':
        groups = [None]
    elif neighbor_mode == 'supervised':
        y = kwargs['y']
        label = np.unique(y)
        if kwargs.get('fisher_score', False) is True:
            return _fisher_affinity(y, label, n_samples)
        if kwargs.get('reliefF', False) is True:
            return _relieff_affinity(X, y, label, kwargs['k'], n_samples)
        groups = [np.where(y == value)[0] for value in label]
    else:
        return None

    # combinations that have no implementation yield None
    if weight_mode == 'binary':
        if metric != 'euclidean' and metric != 'cosine':
            return None
    elif weight_mode != 'heat_kernel' and weight_mode != 'cosine':
        return None

    if metric == 'cosine':
        # cosine similarity is the dot product of unit-length rows
        X = _unit_norm_rows(X)
    return _knn_affinity(X, groups, kwargs['k'], metric, weight_mode, kwargs.get('t'), n_samples)

In [280]:
def reliefF(X, y, **kwargs):
    """
    This function implements the reliefF feature selection

    Every instance is used once as the reference instance.  For a reference x of
    class c, the k nearest instances (manhattan distance, x itself excluded) are
    collected from every class.  Per feature, the summed absolute differences to
    the neighbours of class c (hits) are subtracted from the score, and the summed
    differences to the neighbours of every other class l (misses) are added,
    weighted by P(l) / (1 - P(c)).  Both terms are divided by k.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data; must contain only finite numbers
    y: {numpy array}, shape (n_samples,)
        input class labels
    kwargs: {dictionary}
        parameters of reliefF:
        k: {int}
            choices for the number of neighbors (default k = 8)

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score for each feature

    Raises
    ------
    ValueError
        if X contains NaN or infinite values, or if X and y disagree on n_samples

    Reference
    ---------
    Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003.
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """
    k = kwargs.get("k", 8)

    X = np.asarray(X, dtype=float)
    y = np.asarray(y).reshape(-1)
    n_samples, n_features = X.shape
    if y.shape[0] != n_samples:
        raise ValueError("X and y must have the same number of samples")
    if not np.all(np.isfinite(X)):
        raise ValueError("X must not contain NaN or infinite values")

    # class bookkeeping is the same for every reference instance, so do it once:
    # codes[i] is the position of y[i] in labels, prior[c] the relative class size
    labels, codes, counts = np.unique(y, return_inverse=True, return_counts=True)
    codes = codes.reshape(-1)
    prior = counts/float(n_samples)

    score = np.zeros(n_features)

    # the number of sampled instances is equal to the number of total instances
    for idx in range(n_samples):
        # per-feature absolute differences to every instance; their row sums are the
        # manhattan distances, so no n_samples x n_samples matrix is ever stored
        diffs = np.abs(X - X[idx, :])
        distance = diffs.sum(axis=1)

        # stable sort: equally distant instances keep their index order;
        # the reference instance itself is never its own neighbour
        order = np.argsort(distance, kind='stable')
        order = order[order != idx]
        order_codes = codes[order]

        own = codes[idx]
        for c in range(labels.size):
            # first k instances of class c in distance order (fewer if the class is small)
            nearest = order[order_codes == c][:k]
            term = diffs[nearest].sum(axis=0)
            if c == own:
                # near hits pull the score down
                score -= term/k
            else:
                # near misses push the score up, weighted by the prior of their class
                # relative to all classes other than the reference's own
                score += term*prior[c]/((1 - prior[own])*k)
    return score


def feature_ranking(score):
    """
    Return the feature indices ordered from the highest reliefF score to the lowest;
    a higher score marks a more important feature.
    """
    ascending = np.argsort(score, 0)
    # argsort orders from lowest to highest, so flip the result along the first axis
    return np.flip(ascending, axis=0)



In [281]:
# Mount Google Drive at /content/drive; the data and result files used by the
# following cells are addressed through paths under that mount point.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [282]:
# load data

dataset = 'CreditRisk.csv'
link = '/content/drive/MyDrive/FeatureSelection/dataset/' + dataset
# NOTE(review): the recorded output of this cell looks like the tail of a pandas
# warning, presumably about columns with mixed types - confirm on the next run.
# low_memory=False lets pandas infer each column's dtype from the whole file, so a
# column cannot end up holding a mix of numbers and strings that the later
# factorize step would encode as different categories.
df = pd.read_csv(link, low_memory=False)
nRow, nCol = df.shape


  exec(code_obj, self.user_global_ns, self.user_ns)


In [283]:
# Display the column labels of the loaded frame.
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code',
       'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'acc_now_delinq', 'tot_col

In [284]:
# Down-sample to roughly n_samples rows while keeping the class ratio of 'y':
# the same fraction is drawn from every class.  A fixed random_state makes the
# drawn rows, and therefore every later score and metric, reproducible.
# NOTE: the fraction is relative to nRow (the size of the full file), so this cell
# must be run exactly once after loading; re-running it shrinks df again.
n_samples = 37000
df = df.groupby('y', group_keys=False).apply(
    lambda x: x.sample(frac=n_samples/nRow, random_state=5)
)
df.shape

(37000, 73)

In [285]:
# Missing values are replaced by 0 (rows are kept rather than dropped), then every
# text (object) column is replaced by the integer codes produced by pd.factorize.
df = df.fillna(0)
cat_columns = df.select_dtypes(include=['object']).columns
if len(cat_columns) > 0:
  df[cat_columns] = df[cat_columns].apply(lambda column: pd.factorize(column)[0])
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,y
710659,50315232,53714004,7000,7000,7000.0,0,13.33,236.98,0,0,...,0.0,0.0,0.0,0.0,0.0,10150.0,0.0,0.0,0.0,0
268216,31457503,34050696,15000,15000,15000.0,1,20.99,405.72,1,1,...,0.0,0.0,0.0,0.0,0.0,39500.0,0.0,0.0,0.0,0
627665,57364991,61067729,15000,15000,15000.0,1,8.18,305.44,2,2,...,0.0,0.0,0.0,0.0,0.0,25600.0,0.0,0.0,0.0,0
410215,12968021,15000209,19500,19500,19500.0,1,16.59,480.34,3,3,...,0.0,0.0,0.0,0.0,0.0,29200.0,0.0,0.0,0.0,0
102221,6580987,8132991,7200,7200,7200.0,0,7.62,224.37,4,4,...,0.0,0.0,0.0,0.0,0.0,19700.0,0.0,0.0,0.0,0


In [286]:
# Feature matrix: every column except the row identifier 'id' and the target 'y'.
feature_columns = df.columns.drop(['id', 'y'])
X = df[feature_columns].to_numpy()

# Take the labels from the very same column selection, in the same order, so that
# features_label[j] is the name of column j of X (Index.difference would sort the
# names and keep 'id', which breaks that correspondence).
features_label = feature_columns.to_numpy()
# NOTE(review): 'member_id' is still part of X; it looks like an identifier rather
# than a predictor - confirm whether it should be dropped as well.
y = df['y'].to_numpy()
n_samples, n_features = X.shape
del df

In [287]:
# Start the clock for the ReliefF stage; the elapsed time is printed further down.
start_time = time.time()

# Score every feature on the full sample, then order the feature indices
# from the highest score to the lowest.
score = reliefF(X, y)
idx = feature_ranking(score)

print("Ranking: ", idx, sep="\n")



Ranking: 
[ 0 55 12 67 29 33 34  1  2  3  9 35 36 37 40 38 61 16 43 18  6 41 65 42
 54 19 23 14 45 31 47 26 30 21 25  8 20  5 39 27 66 17 50 62 44 24 11  4
  7 32 10 57 64 13 59 49 51 48 15 52 63 53 56 58 28 68 70 46 69 22 60]


In [288]:
# "top 10" means the best-ranked 10% of the features, not ten features.
n_top10 = n_features // 10
idx_top10 = idx[:n_top10]
print("--------------------------------------")
print("index of top  10 : %s" % (idx_top10,))

--------------------------------------
index of top  10 : [ 0 55 12 67 29 33 34]


In [289]:
# "top 15" means the best-ranked 15% of the features, not fifteen features.
n_top15 = (15 * n_features) // 100
idx_top15 = idx[:n_top15]
print("--------------------------------------")
print("index of top  15 : %s" % (idx_top15,))

--------------------------------------
index of top  15 : [ 0 55 12 67 29 33 34  1  2  3]


In [290]:
# "top 25" means the best-ranked quarter of the features.
n_top25 = n_features // 4
idx_top25 = idx[:n_top25]
print("--------------------------------------")
print("index of top  25: %s" % (idx_top25,))

# Time elapsed since start_time was taken, i.e. scoring, ranking and the selections.
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--------------------------------------
index of top  25: [ 0 55 12 67 29 33 34  1  2  3  9 35 36 37 40 38 61]
--- 4343.245657205582 seconds ---


In [291]:
# Keep only the columns of the top-10% subset, then hold out 20% of the rows
# for testing (fixed random_state, so the split is the same on every run).
X_selected = X[:, idx_top10]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=5,
)


In [292]:
# fit and predict

model = CatBoostClassifier(loss_function='Logloss', eval_metric='Accuracy')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# ROC AUC ranks samples by a continuous score; feeding it hard 0/1 predictions
# collapses the curve to a single operating point.  Use the predicted probability
# of the second class instead (assumes a binary target whose positive class is
# the larger label - TODO confirm).
y_score = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_predict)
roc_auc = roc_auc_score(y_test, y_score)
f1_value = f1_score(y_test, y_predict)

# two-decimal strings are what gets written to the results file
acc = "{:.2f}".format(accuracy)
auc = "{:.2f}".format(roc_auc)
f1 = "{:.2f}".format(f1_value)
print('Accuracy =', accuracy)
print('ROC AUC =', roc_auc)
print('F1 =', f1_value)

Learning rate set to 0.04377
0:	learn: 0.9455405	total: 30.6ms	remaining: 30.6s
1:	learn: 0.9455405	total: 47.3ms	remaining: 23.6s
2:	learn: 0.9455405	total: 63.5ms	remaining: 21.1s
3:	learn: 0.9455405	total: 74.6ms	remaining: 18.6s
4:	learn: 0.9455405	total: 85.4ms	remaining: 17s
5:	learn: 0.9455405	total: 96.4ms	remaining: 16s
6:	learn: 0.9455405	total: 108ms	remaining: 15.3s
7:	learn: 0.9455405	total: 119ms	remaining: 14.7s
8:	learn: 0.9455405	total: 130ms	remaining: 14.3s
9:	learn: 0.9455405	total: 141ms	remaining: 14s
10:	learn: 0.9455405	total: 152ms	remaining: 13.7s
11:	learn: 0.9455405	total: 164ms	remaining: 13.5s
12:	learn: 0.9455405	total: 176ms	remaining: 13.3s
13:	learn: 0.9455405	total: 188ms	remaining: 13.2s
14:	learn: 0.9455405	total: 203ms	remaining: 13.3s
15:	learn: 0.9455405	total: 214ms	remaining: 13.2s
16:	learn: 0.9455405	total: 225ms	remaining: 13s
17:	learn: 0.9455405	total: 243ms	remaining: 13.2s
18:	learn: 0.9455405	total: 257ms	remaining: 13.3s
19:	learn: 0.9

In [293]:
# Append the metrics of the top-10% run as one new row to the results file on Drive.
df = pd.read_csv('/content/drive/MyDrive/FeatureSelection/dataset/result.csv')
df2 = pd.DataFrame({
    'score': 'ReliefF',
    'name': [dataset],
    'samples': [n_samples],
    'top': '10',
    'acc': [acc],
    'auc': [auc],
    'f1': [f1],
})
df = pd.concat([df, df2], ignore_index=True)
df.to_csv('/content/drive/MyDrive/FeatureSelection/dataset/result.csv', index=False)


In [294]:
# Keep only the columns of the top-15% subset, then hold out 20% of the rows
# for testing (fixed random_state, so the split is the same on every run).
X_selected = X[:, idx_top15]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=5,
)


In [295]:
# fit and predict

model = CatBoostClassifier(loss_function='Logloss', eval_metric='Accuracy')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# ROC AUC ranks samples by a continuous score; feeding it hard 0/1 predictions
# collapses the curve to a single operating point.  Use the predicted probability
# of the second class instead (assumes a binary target whose positive class is
# the larger label - TODO confirm).
y_score = model.predict_proba(X_test)[:, 1]

# two-decimal strings are what gets written to the results file
acc = "{:.2f}".format(accuracy_score(y_test, y_predict))
auc = "{:.2f}".format(roc_auc_score(y_test, y_score))
f1 = "{:.2f}".format(f1_score(y_test, y_predict))
print('Accuracy =', acc)
print('ROC AUC =', auc)
print('F1 =', f1)

Learning rate set to 0.04377
0:	learn: 0.9455405	total: 16.6ms	remaining: 16.5s
1:	learn: 0.9455405	total: 28.7ms	remaining: 14.3s
2:	learn: 0.9455405	total: 41.1ms	remaining: 13.7s
3:	learn: 0.9455405	total: 53.1ms	remaining: 13.2s
4:	learn: 0.9455405	total: 64.9ms	remaining: 12.9s
5:	learn: 0.9455405	total: 76.7ms	remaining: 12.7s
6:	learn: 0.9455405	total: 88.1ms	remaining: 12.5s
7:	learn: 0.9455405	total: 99.9ms	remaining: 12.4s
8:	learn: 0.9455405	total: 112ms	remaining: 12.3s
9:	learn: 0.9455405	total: 127ms	remaining: 12.6s
10:	learn: 0.9455405	total: 143ms	remaining: 12.9s
11:	learn: 0.9455405	total: 155ms	remaining: 12.8s
12:	learn: 0.9455405	total: 168ms	remaining: 12.7s
13:	learn: 0.9455405	total: 181ms	remaining: 12.8s
14:	learn: 0.9455405	total: 195ms	remaining: 12.8s
15:	learn: 0.9455405	total: 212ms	remaining: 13s
16:	learn: 0.9455405	total: 224ms	remaining: 12.9s
17:	learn: 0.9455405	total: 261ms	remaining: 14.2s
18:	learn: 0.9455405	total: 273ms	remaining: 14.1s
19:	le

In [296]:
# Append the metrics of the top-15% run as one new row to the results file on Drive.
df = pd.read_csv('/content/drive/MyDrive/FeatureSelection/dataset/result.csv')
df2 = pd.DataFrame({
    'score': 'ReliefF',
    'name': [dataset],
    'samples': [n_samples],
    'top': '15',
    'acc': [acc],
    'auc': [auc],
    'f1': [f1],
})
df = pd.concat([df, df2], ignore_index=True)
df.to_csv('/content/drive/MyDrive/FeatureSelection/dataset/result.csv', index=False)


In [297]:
# Keep only the columns of the top-25% subset, then hold out 20% of the rows
# for testing (fixed random_state, so the split is the same on every run).
X_selected = X[:, idx_top25]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=5,
)


In [298]:
# fit and predict

model = CatBoostClassifier(loss_function='Logloss', eval_metric='Accuracy')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# ROC AUC ranks samples by a continuous score; feeding it hard 0/1 predictions
# collapses the curve to a single operating point.  Use the predicted probability
# of the second class instead (assumes a binary target whose positive class is
# the larger label - TODO confirm).
y_score = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_predict)
roc_auc = roc_auc_score(y_test, y_score)
f1_value = f1_score(y_test, y_predict)

# two-decimal strings are what gets written to the results file
acc = "{:.2f}".format(accuracy)
auc = "{:.2f}".format(roc_auc)
f1 = "{:.2f}".format(f1_value)
print('Accuracy =', accuracy)
print('ROC AUC =', roc_auc)
print('F1 =', f1_value)

Learning rate set to 0.04377
0:	learn: 0.9921284	total: 18.3ms	remaining: 18.3s
1:	learn: 0.9862500	total: 31ms	remaining: 15.5s
2:	learn: 0.9863176	total: 44.2ms	remaining: 14.7s
3:	learn: 0.9869257	total: 57.6ms	remaining: 14.3s
4:	learn: 0.9874662	total: 71.2ms	remaining: 14.2s
5:	learn: 0.9875000	total: 88.8ms	remaining: 14.7s
6:	learn: 0.9874662	total: 103ms	remaining: 14.6s
7:	learn: 0.9876351	total: 117ms	remaining: 14.4s
8:	learn: 0.9875676	total: 130ms	remaining: 14.4s
9:	learn: 0.9876351	total: 144ms	remaining: 14.3s
10:	learn: 0.9877027	total: 163ms	remaining: 14.7s
11:	learn: 0.9884459	total: 177ms	remaining: 14.6s
12:	learn: 0.9901351	total: 191ms	remaining: 14.5s
13:	learn: 0.9916554	total: 210ms	remaining: 14.8s
14:	learn: 0.9925000	total: 224ms	remaining: 14.7s
15:	learn: 0.9919932	total: 237ms	remaining: 14.6s
16:	learn: 0.9933784	total: 251ms	remaining: 14.5s
17:	learn: 0.9933446	total: 264ms	remaining: 14.4s
18:	learn: 0.9935473	total: 278ms	remaining: 14.3s
19:	lear

In [299]:
# Append the metrics of the top-25% run as one new row to the results file on Drive.
df = pd.read_csv('/content/drive/MyDrive/FeatureSelection/dataset/result.csv')
df2 = pd.DataFrame({
    'score': 'ReliefF',
    'name': [dataset],
    'samples': [n_samples],
    'top': '25',
    'acc': [acc],
    'auc': [auc],
    'f1': [f1],
})
df = pd.concat([df, df2], ignore_index=True)
df.to_csv('/content/drive/MyDrive/FeatureSelection/dataset/result.csv', index=False)
