In [1]:
import os
import sys
import time
import math
import warnings
import multiprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import GenericUnivariateSelect, chi2, f_classif, mutual_info_classif, SelectFromModel, SequentialFeatureSelector, RFE, RFECV
from sklearn.feature_selection import SelectKBest, SelectFpr, SelectFdr, SelectFwe
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from Feature_Selector5 import Feature_Statistics, Feature_Selector, Feature_SelectorX

In [2]:
def MP(Model, X, y):
    """Compute binary-classification performance metrics for a fitted model.

    Parameters
    ----------
    Model : object exposing ``predict`` and ``predict_proba``.
    X : feature matrix passed unchanged to both model methods.
    y : true labels; the confusion matrix is built for the labels 0 and 1.

    Returns
    -------
    tuple
        ``(AUC, GINI, accuracy, sensitivity, specificity)`` where
        ``GINI = 2 * AUC - 1``.
    """
    class_scores = Model.predict_proba(X)
    predicted_labels = Model.predict(X)

    # Fixing labels=[0, 1] guarantees a 2x2 matrix, so ravel() always yields
    # the four cells in the order TN, FP, FN, TP.
    TN, FP, FN, TP = confusion_matrix(y, predicted_labels, labels=[0, 1]).ravel()
    n_total = TP + TN + FP + FN
    accuracy = (TP + TN) / n_total
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)

    # With two probability columns the second one is used as the ranking
    # score; any other shape is handed to roc_curve unchanged.
    ranking_scores = class_scores[:, 1] if class_scores.shape[1] == 2 else class_scores
    fpr, tpr, _ = roc_curve(y, ranking_scores)

    AUC = auc(fpr, tpr)
    return AUC, 2 * AUC - 1, accuracy, sensitivity, specificity

In [3]:
# Build the path with os.path.join so the notebook also runs on POSIX systems;
# a hard-coded backslash is only a directory separator on Windows.
df = pd.read_csv(os.path.join('csv', 'scene.csv'))
df.shape

(2407, 300)

In [4]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,attr1,attr2,attr3,attr4,attr5,attr6,attr7,attr8,attr9,attr10,...,attr291,attr292,attr293,attr294,Beach,Sunset,FallFoliage,Field,Mountain,y
0,0.646467,0.666435,0.685047,0.699053,0.652746,0.407864,0.150309,0.535193,0.555689,0.580782,...,0.157332,0.247298,0.014025,0.029709,1,0,0,0,1,0
1,0.770156,0.767255,0.761053,0.74563,0.742231,0.688086,0.708416,0.757351,0.760633,0.740314,...,0.251454,0.137833,0.082672,0.03632,1,0,0,0,0,1
2,0.793984,0.772096,0.76182,0.762213,0.740569,0.734361,0.722677,0.849128,0.839607,0.812746,...,0.017166,0.051125,0.112506,0.083924,1,0,0,0,0,0
3,0.938563,0.94926,0.955621,0.966743,0.968649,0.869619,0.696925,0.95346,0.959631,0.96632,...,0.019267,0.03129,0.04978,0.090959,1,0,0,0,0,0
4,0.51213,0.524684,0.52002,0.504467,0.471209,0.417654,0.364292,0.562266,0.588592,0.584449,...,0.198151,0.238796,0.16427,0.18429,1,0,0,0,0,0


In [5]:
# Separate the target column 'y' from the predictors.
y = df['y']
X = df.drop(columns='y')

# One-hot encode any non-numeric columns, then replace missing values with 0.
X = pd.get_dummies(X).fillna(0)

# Count how often each column name occurs; a name seen more than once is
# ambiguous, so every column carrying such a name is dropped.
tmp = pd.Series(X.columns).value_counts().to_frame()
tmp.columns = ['count']
bad_columns = list(tmp[tmp['count'] > 1].index)
print('bad columns=',bad_columns)
X = X.drop(columns=bad_columns)

bad columns= []


In [6]:
# 50/50 train/validation split, stratified on y and seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=42,
    stratify=y,
)
print(X.shape, X_train.shape)

(2407, 299) (1203, 299)


In [7]:
%%time
# Feature_Statistics comes from the project module Feature_Selector5. It is
# given the training split only, and its result is passed to the
# Feature_Selector / Feature_SelectorX calls in later cells.
# NOTE(review): what the returned object contains is not visible here — confirm.
my_feat_stat_n = Feature_Statistics(X_train,y_train)

CPU times: total: 15 s
Wall time: 6.56 s


In [None]:
%%time
# Feature_SelectorX comes from the project module Feature_Selector5.
# Presumably the positional 10 is the number of features to keep — TODO confirm.
# NOTE(review): the meaning of rm_mult_thrs=0.8 and p12=1 is not visible here.
# NOTE: `selected10` is also assigned by the other selector cells in this
# notebook, so the value used later depends on which of them ran last.
selected10 = Feature_SelectorX(X_train,y_train,10,my_feat_stat_n,rm_mult_thrs=0.8,p12=1)
selected10

In [None]:
%%time
# Same call as the other Feature_Selector cells but without the precomputed
# statistics object; presumably the helper then derives what it needs itself —
# TODO confirm against Feature_Selector5.
# NOTE: rebinds `selected10`, which other selector cells also assign.
selected10 = Feature_Selector(X_train,y_train,10)
selected10

In [None]:
%%time
# Passes the statistics object built earlier (my_feat_stat_n) as the fourth
# positional argument; no method name is given, so the helper's default applies.
# NOTE: rebinds `selected10`, which other selector cells also assign.
selected10 = Feature_Selector(X_train,y_train,10,my_feat_stat_n)
selected10

In [None]:
%%time
# The fifth positional argument 'Best_Any_Inv' presumably names the selection
# method — TODO confirm against Feature_Selector5.
# NOTE: this is the last cell in the visible notebook that assigns
# `selected10`, so on a top-to-bottom run this is the list used by the
# LightGBM cell that trains on X_train[selected10].
selected10 = Feature_Selector(X_train,y_train,10,my_feat_stat_n,'Best_Any_Inv')
selected10

# LightGBM Classifier

In [None]:
%%time
My_model = LGBMClassifier(random_state=42,max_leaf_nodes=200,min_samples_leaf=50,verbosity=-1)
My_model.fit(X_train,y_train)
print('TRA:',MP(My_model,X_train,y_train))
print('VAL:',MP(My_model,X_val,y_val))

In [None]:
%%time
My_model = LGBMClassifier(random_state=42,max_leaf_nodes=200,min_samples_leaf=50,verbosity=-1)
My_model.fit(X_train[selected10],y_train)
print('TRA:',MP(My_model,X_train[selected10],y_train))
print('VAL:',MP(My_model,X_val[selected10],y_val))

In [None]:
%%time
# The method name 'FI_LGBM' presumably means ranking by LightGBM feature
# importance — TODO confirm against Feature_Selector5.
# Stored under its own name, so `selected10` is left untouched.
selected10_FI_LGBM = Feature_Selector(X_train,y_train,10,my_feat_stat_n,'FI_LGBM')

In [None]:
%%time
My_model = LGBMClassifier(random_state=42,max_leaf_nodes=200,min_samples_leaf=50,verbosity=-1)
My_model.fit(X_train[selected10_FI_LGBM],y_train)
print('TRA:',MP(My_model,X_train[selected10_FI_LGBM],y_train))
print('VAL:',MP(My_model,X_val[selected10_FI_LGBM],y_val))