### Import Modules

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier
from hpsklearn import HyperoptEstimator, pca, min_max_scaler, standard_scaler
from hpsklearn import xgboost_classification, random_forest, ada_boost, gradient_boosting, extra_trees
from hpsklearn import svc, svc_linear, svc_rbf, svc_poly, svc_sigmoid, liblinear_svc
from hpsklearn import any_classifier
from hpsklearn import any_preprocessing
from hyperopt import tpe, hp
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report,confusion_matrix,plot_confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import RFE, SelectFromModel, RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif
from matplotlib import pyplot

import warnings
warnings.filterwarnings('ignore')



WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


### Load Data

In [2]:
df_train = pd.read_csv('Data/otto_group_Product_train.csv')
#df_test = pd.read_csv('Data/otto_group_Product_test.csv')

In [3]:
df_train.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [4]:
df_train.shape

(61878, 95)

In [5]:
df_train["target"].values

array(['Class_1', 'Class_1', 'Class_1', ..., 'Class_9', 'Class_9',
       'Class_9'], dtype=object)

In [6]:
df_train["target"].value_counts()

Class_2    16122
Class_6    14135
Class_8     8464
Class_3     8004
Class_9     4955
Class_7     2839
Class_5     2739
Class_4     2691
Class_1     1929
Name: target, dtype: int64

In [7]:
X = df_train.drop(['id', 'target'],axis=1)
y = df_train['target']

In [8]:
y = LabelEncoder().fit_transform(y)

### Train Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [10]:
X_train.shape

(43314, 93)

### Define Base Model

In [11]:
# evaluate a model
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    # evaluate model  'accuracy'  'f1_micro'
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    #print('F-measure: %.3f' % score)
    return scores

In [12]:
model = XGBClassifier(base_score=0.5, booster='gbtree',
              colsample_bylevel=0.50029110018247, colsample_bynode=1,
              colsample_bytree=0.9371080486138555, gamma=0.00014999673883672553,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.15017743323882513, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=4,
              reg_alpha=6.402887715271464e-05, reg_lambda=1.0516334360090362,
              scale_pos_weight=1, seed=4, subsample=0.9995865341563067,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Find Optimal number of features (RFE)

In [13]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
#rfc = RandomForestClassifier(random_state=101)
#rfecv = RFECV(estimator=rfc, step=1, cv=RepeatedStratifiedKFold(10), scoring='accuracy')
rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=1000, random_state=101))
rfecv.fit(X_train, y_train)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

Job Started at: 2020-11-15 01:59:41.876361
Execution time :  0:39:38.489188


In [16]:
n_features = rfecv.n_features_
print(f'Optimal number of features: {n_features}')

Optimal number of features: 87


In [15]:
rfecv.estimator_.feature_importances_

array([0.00731202, 0.0040518 , 0.01241899, 0.01116241, 0.00197651,
       0.00330553, 0.01498064, 0.01337249, 0.00412989, 0.04171915,
       0.00248947, 0.0059908 , 0.03253525, 0.02614734, 0.01194113,
       0.00925984, 0.00594922, 0.00469889, 0.00772869, 0.00381849,
       0.00629115, 0.00332659, 0.01883343, 0.02754061, 0.02580626,
       0.01598969, 0.00385833, 0.00465135, 0.00967903, 0.01316532,
       0.01175661, 0.04620874, 0.00775855, 0.01825531, 0.00611078,
       0.00740583, 0.01170817, 0.02991659, 0.00719551, 0.02333728,
       0.01314149, 0.00517112, 0.00404715, 0.00531686, 0.00815644,
       0.01802885, 0.00361922, 0.00787561, 0.00279535, 0.00997745,
       0.01146715, 0.00530752, 0.00665557, 0.01167063, 0.00703373,
       0.00859509, 0.03971245, 0.00792015, 0.01938818, 0.00288398,
       0.01474889, 0.00359994, 0.00708055, 0.02383189, 0.0126068 ,
       0.01605346, 0.01037368, 0.00818162, 0.0142885 , 0.00509412,
       0.00526011, 0.01662553, 0.01149159, 0.00294511, 0.00854

In [17]:
print(np.where(rfecv.support_ == False)[0])
X_train_best_features = X_train.drop(X_train.columns[np.where(rfecv.support_ == False)[0]], axis=1)
X_test_best_features = X_test.drop(X_test.columns[np.where(rfecv.support_ == False)[0]], axis=1)

[ 5 30 50 80 81 83]


In [18]:
X_train_best_features.shape

(43314, 87)

In [19]:
X_test_best_features.shape

(18564, 87)

### Pipeline (Scaling + Transforming)

In [20]:
# get modeling pipelines to evaluate
def get_pipelines(model):
    pipelines = list()
    # xgb
    p = Pipeline([('m', model)])
    pipelines.append(('xgb', p))
    # power
    p = Pipeline([('p', PowerTransformer()), ('m', model)])
    pipelines.append(('power', p))
    # Quantile
    p = Pipeline([('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), ('m', model)])
    pipelines.append(('quantile', p))
    # normalize
    p = Pipeline([('n', MinMaxScaler()), ('m', model)])
    pipelines.append(('norm', p))
    # normalize and power
    p = Pipeline([('n', MinMaxScaler()), ('p', PowerTransformer()), ('m', model)])
    pipelines.append(('norm_power', p))
    # normalize and Quantile
    p = Pipeline([('n', MinMaxScaler()), 
                  ('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), ('m', model)])
    pipelines.append(('norm_quantile', p))
    # standardize
    p = Pipeline([('s', StandardScaler()), ('m', model)])
    pipelines.append(('std', p))
    # standardize and power
    p = Pipeline([('s', StandardScaler()), ('p', PowerTransformer()), ('m', model)])
    pipelines.append(('std_power', p))
    # standardize and Quantile
    p = Pipeline([('s', StandardScaler()), 
                  ('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), ('m', model)])
    pipelines.append(('std_quantile', p))
    # Robust
    p = Pipeline([('r', RobustScaler(with_centering=False, with_scaling=True)), ('m',model)])
    pipelines.append(('robust', p))
    # Robust and power
    p = Pipeline([('r', RobustScaler(with_centering=False, with_scaling=True)), ('p', PowerTransformer()), 
                  ('m', model)])
    pipelines.append(('robust_power', p))
    # Robust and Quantile
    p = Pipeline([('r', RobustScaler(with_centering=False, with_scaling=True)), 
                  ('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), ('m', model)])
    pipelines.append(('robust_quantile', p))
    return pipelines

### RFE Pipeline 

In [21]:
# get modeling pipelines to evaluate
def get_ref_pipelines(model, n_features):
    pipelines = list()
    # RFE
    p = Pipeline([('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('rfe', p))
    # power and RFE
    p = Pipeline([('p', PowerTransformer()), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)),
                  ('m', model)])
    pipelines.append(('power_rfe', p))
    # Quantile and RFE
    p = Pipeline([('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('quantile_rfe', p))
    # normalize and RFE
    p = Pipeline([('n', MinMaxScaler()), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('norm_rfe', p))
    # normalize and power and RFE
    p = Pipeline([('n', MinMaxScaler()), ('p', PowerTransformer()), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('norm_power_rfe', p))
    # normalize and Quantile and RFE
    p = Pipeline([('n', MinMaxScaler()), 
                  ('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('norm_quantile_rfe', p))
    # standardize and RFE
    p = Pipeline([('s', StandardScaler()), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('std_rfe', p))
    # standardize and power and RFE
    p = Pipeline([('s', StandardScaler()), ('p', PowerTransformer()), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('std_power_rfe', p))
    # standardize and Quantile and RFE
    p = Pipeline([('s', StandardScaler()), 
                  ('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('std_quantile_rfe', p))
    # Robust and RFE
    p = Pipeline([('r', RobustScaler(with_centering=False, with_scaling=True)), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('robust_rfe', p))
    # Robust and power and RFE
    p = Pipeline([('r', RobustScaler(with_centering=False, with_scaling=True)), ('p', PowerTransformer()), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('robust_power_rfe', p))
    # Robust and Quantile and RFE
    p = Pipeline([('r', RobustScaler(with_centering=False, with_scaling=True)), 
                  ('q', QuantileTransformer(n_quantiles=100, output_distribution='normal')), 
                  ('rfe', RFE(estimator=RandomForestClassifier(random_state=101),n_features_to_select=n_features)), 
                  ('m', model)])
    pipelines.append(('robust_quantile_rfe', p))
    return pipelines

### Evaluate Pipeline (Scaling + Transforming + RFE)

In [22]:
def fit_pipeline(name, pipeline, X_train, y_train, X_test, y_test):
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    print('> %s: %.4f' % (name, accuracy_score(y_test, predictions)))

In [21]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
# get the modeling pipelines
pipelines = get_ref_pipelines(model, n_features)
print (f"Total Models: {len(pipelines)}")
# evaluate each pipeline
for name, pipeline in pipelines:
    fit_pipeline(name, pipeline, X_train, y_train, X_test, y_test)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

Job Started at: 2020-11-14 23:01:59.601354
Total Models: 12
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


> rfe: 0.824
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


> power_rfe: 0.827
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


> quantile_rfe: 0.826
Parameters

In [23]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
# get the modeling pipelines
pipelines = get_pipelines(model)
print (f"Total Models: {len(pipelines)}")
# evaluate each pipeline
for name, pipeline in pipelines:
    fit_pipeline(name, pipeline, X_train_best_features, y_train, X_test_best_features, y_test)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))

Job Started at: 2020-11-15 02:39:20.488495
Total Models: 12
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


> xgb: 0.824
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


> power: 0.826
Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


> quantile: 0.826
Parameters: { scal

### Remove correlated features

#### Identify Highly Correlated Features

In [20]:
# Create correlation matrix
corr_matrix = X.corr().abs()
print(); print(corr_matrix)


           feat_1    feat_2    feat_3    feat_4    feat_5    feat_6    feat_7  \
feat_1   1.000000  0.031332  0.027807  0.027529  0.042973  0.043603  0.298952   
feat_2   0.031332  1.000000  0.082573  0.134987  0.020926  0.041343  0.222386   
feat_3   0.027807  0.082573  1.000000  0.583523  0.010880  0.004288  0.001294   
feat_4   0.027529  0.134987  0.583523  1.000000  0.017290  0.014059  0.014490   
feat_5   0.042973  0.020926  0.010880  0.017290  1.000000  0.145355  0.075047   
...           ...       ...       ...       ...       ...       ...       ...   
feat_89  0.096851  0.105527  0.174781  0.183715  0.119951  0.035042  0.063511   
feat_90  0.010310  0.515022  0.015068  0.009454  0.004842  0.054034  0.129578   
feat_91  0.037264  0.026383  0.012417  0.010312  0.012012  0.012465  0.068506   
feat_92  0.054777  0.008219  0.066921  0.087631  0.065331  0.015479  0.032261   
feat_93  0.081783  0.054593  0.006814  0.015746  0.002038  0.008521  0.034912   

           feat_8    feat_

In [21]:
# Select upper triangle of correlation matrix
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
print(); print(upper_tri)


         feat_1    feat_2    feat_3    feat_4    feat_5    feat_6    feat_7  \
feat_1      NaN  0.031332  0.027807  0.027529  0.042973  0.043603  0.298952   
feat_2      NaN       NaN  0.082573  0.134987  0.020926  0.041343  0.222386   
feat_3      NaN       NaN       NaN  0.583523  0.010880  0.004288  0.001294   
feat_4      NaN       NaN       NaN       NaN  0.017290  0.014059  0.014490   
feat_5      NaN       NaN       NaN       NaN       NaN  0.145355  0.075047   
...         ...       ...       ...       ...       ...       ...       ...   
feat_89     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
feat_90     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
feat_91     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
feat_92     NaN       NaN       NaN       NaN       NaN       NaN       NaN   
feat_93     NaN       NaN       NaN       NaN       NaN       NaN       NaN   

           feat_8    feat_9   feat_10  ...   feat_

In [22]:
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.80)]

In [23]:
print(to_drop)

['feat_45']


In [24]:
len(to_drop)

1

#### Drop Marked Features

In [25]:
# Drop features 
if len(to_drop) > 0:
    X = X.drop(X[to_drop], axis=1)
    print (X.head())

   feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0       1       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       1       0   
2       0       0       0       0       0       0       0       1       0   
3       1       0       0       1       6       1       5       0       0   
4       0       0       0       0       0       0       0       0       0   

   feat_10  ...  feat_84  feat_85  feat_86  feat_87  feat_88  feat_89  \
0        0  ...        0        1        0        0        0        0   
1        0  ...        0        0        0        0        0        0   
2        0  ...        0        0        0        0        0        0   
3        1  ...       22        0        1        2        0        0   
4        0  ...        0        1        0        0        0        0   

   feat_90  feat_91  feat_92  feat_93  
0        0        0        0        0  
1        0        

#### Train Test Split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

#### Evaluate Pipeline (Scaling + Transforming)

In [None]:
init_time = datetime.now()
print (f"Job Started at: {init_time}")
# get the modeling pipelines
pipelines = get_pipelines(model)
print (len(pipelines))
# evaluate each pipeline
for name, pipeline in pipelines:
    fit_pipeline(name, pipeline, X_train, y_train, X_test, y_test)
fin_time = datetime.now()
print("Execution time : ", (fin_time-init_time))