## Feature selection for categorical variables
- `f_classif`: Used only for categorical targets and based on the Analysis of Variance (ANOVA) statistical test.
- `chi2`: Computes the chi-square statistic for categorical targets, which is less sensitive to nonlinear relationships between a predictive variable and its target. Note that `chi2` requires non-negative feature values.

In [38]:
import os
import pandas  as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib   import Path
from sklearn.preprocessing     import StandardScaler
from sklearn.decomposition     import PCA
from sklearn.pipeline          import Pipeline
from sklearn.model_selection   import train_test_split 
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.model_selection   import GridSearchCV
from sklearn.ensemble          import RandomForestClassifier
from sklearn.metrics           import accuracy_score

In [39]:
# Set the project root directory and make it the working directory.
# The location can be overridden with the DATA_SCIENCE_CORE_ROOT environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original hard-coded location is used.
path_root = Path(os.environ.get("DATA_SCIENCE_CORE_ROOT", "C:/Users/giann/data-science-core"))
os.chdir(path_root)
print(f'- Root directory = {os.getcwd()}')

- Root directory = C:\Users\giann\data-science-core


In [40]:
# Load the dataset CSV located under the project root, then report its
# dimensions and preview the first three rows.
path_dataset = path_root / 'dataset/arrh.csv'
data = pd.read_csv(filepath_or_buffer = path_dataset)
print(data.shape)
data.head(n = 3)

(452, 280)


Unnamed: 0,age,sex,height,weight,QRSduration,PRinterval,Q-Tinterval,Tinterval,Pinterval,QRS,...,chV6_QwaveAmp,chV6_RwaveAmp,chV6_SwaveAmp,chV6_RPwaveAmp,chV6_SPwaveAmp,chV6_PwaveAmp,chV6_TwaveAmp,chV6_QRSA,chV6_QRSTA,class
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,0
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,0
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,0


In [41]:
# Separate the predictors from the target ('class').
# The CSV carries an 'Unnamed: 0' column (values 0, 1, 2, ... in the preview),
# which appears to be a row index saved alongside the data rather than a
# measurement. Columns of that kind are excluded so the row position is not
# fed to the models and the feature selector as a predictor.
index_artifacts = [col for col in data.columns if str(col).startswith('Unnamed:')]
X, y = data.drop(columns = ['class', *index_artifacts]), data['class']

# Hold out 30% of the rows for testing; stratify on the target so both
# partitions keep the same class proportions, and fix the seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 42)

In [42]:
# Tune max_depth over the candidates 2, 5 and 10 using 5-fold cross-validation
# on the training split, with a small (10-tree) random forest as the estimator.
grid_search = GridSearchCV(
    estimator = RandomForestClassifier(n_estimators = 10, random_state = 1),
    param_grid = {'max_depth': [2, 5, 10]},
    cv = 5,
)
grid_search.fit(X_train, y_train)

# Keep the winning depth for the models trained later on.
best_value = grid_search.best_params_['max_depth']
print(f"o best max depth = {best_value}")

o best max depth = 10


In [43]:
# Baseline model: a random forest trained on every feature of the training
# split, using the depth selected by the grid search.
clf = RandomForestClassifier(
    n_estimators = 10,
    max_depth = best_value,
    random_state = 1,
).fit(X_train, y_train)

In [45]:
# Univariate feature selection: score each feature against the target with the
# ANOVA F-test (f_classif) and keep the 100 highest-scoring ones. The selector
# is fitted on the training split only.
# Note: the recorded output shows scikit-learn warning that some features are
# constant, for which the F statistic cannot be computed.
vt = SelectKBest(score_func = f_classif, k = 100).fit(X_train, y_train)

 157 164 194 204 264 274] are constant.
  f = msb / msw


In [46]:
# Train a second forest with the same hyper-parameters (depth = best_value),
# this time on the training data restricted to the selected features.
clf_vt = RandomForestClassifier(
    n_estimators = 10,
    max_depth = best_value,
    random_state = 1,
).fit(vt.transform(X_train), y_train)

In [47]:
# Evaluate the reduced-feature model: apply the fitted selector to the test
# split, predict, and report the accuracy as a percentage.
y_pred = clf_vt.predict(vt.transform(X_test))
acc = accuracy_score(y_true = y_test, y_pred = y_pred)
print("{0:.1%} accuracy on test set.".format(acc))

78.7% accuracy on test set.
