#### Univariate Feature Selection

In [None]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import pandas as pd
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Read the Pima Indians diabetes CSV from GitHub, labelling columns via `names`.
filename = 'https://raw.githubusercontent.com/mchandak/DS_Repo10/main/Data/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
dataframe = pd.read_csv(filename, names=names)
# Raw array form of the table; later cells slice it by column position.
array = dataframe.values
# Preview the first rows as the cell output.
dataframe.head()

In [None]:
# Columns 0-7 are the candidate features; column 8 is the label passed to fit().
X, Y = array[:, :8], array[:, 8]

# Score every feature against Y with the chi-squared test, keeping the best 5.
test = SelectKBest(chi2, k=5)
fit = test.fit(X, Y)

# Print the per-feature scores with 4 digits of precision, then the column names.
set_printoptions(precision=4)
print(fit.scores_)
print(names)

# Score functions for regression: f_regression, mutual_info_regression
# Score functions for classification: chi2, f_classif, mutual_info_classif

In [None]:
# Reduce X to the columns chosen by the fitted selector (it was built with k=5).
features = fit.transform(X)
# Show the reduced array as the cell output.
features

In [None]:
# Label the selected columns from the selector's own support mask instead of a
# hand-typed list, so the labels stay correct if k or the data ever changes.
# zip() stops at the end of the mask (one entry per column of X), so the
# trailing 'class' entry in `names` is never picked up.
selected_names = [name for name, keep in zip(names, fit.get_support()) if keep]
df1 = pd.DataFrame(features, columns=selected_names)
df1

# Feature Scaling (standardization, normalization, etc.)

In [None]:
# https://www.analyticsvidhya.com/blog/2021/05/feature-scaling-techniques-in-python-a-complete-guide/
# https://www.analyticsvidhya.com/blog/2020/07/types-of-feature-transformation-and-scaling/
# https://medium.datadriveninvestor.com/feature-scaling-in-data-science-5b1e82492727
# https://www.kaggle.com/code/aimack/complete-guide-to-feature-scaling

#### Recursive Feature Elimination

In [None]:
# Recursive Feature Elimination (RFE) wrapped around a logistic-regression model.
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Read the CSV again so this cell can run on its own.
filename = 'https://raw.githubusercontent.com/mchandak/DS_Repo10/main/Data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)

# Columns 0-7 are the inputs; column 8 is the target passed to fit().
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

# Have RFE keep 5 features, using LogisticRegression as the underlying estimator.
model = LogisticRegression(max_iter=400)
rfe = RFE(estimator=model, n_features_to_select=5)
fit = rfe.fit(X, Y)


In [None]:
# Number of features the fitted RFE selector kept.
fit.n_features_

In [None]:
# Selected features: boolean mask with one entry per column of X (True = kept by RFE).
fit.support_

In [None]:
# Column labels for reading fit.support_ / fit.ranking_; the last entry, 'class',
# is the target column and has no counterpart in those arrays.
names

In [None]:
# Feature ranking: per scikit-learn's RFE, selected features are assigned rank 1.
fit.ranking_

#### Feature Importance using Decision Tree

In [None]:
# Feature importance with a single decision tree (DecisionTreeClassifier).
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
# load data
filename = 'https://raw.githubusercontent.com/mchandak/DS_Repo10/main/Data/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# Columns 0-7 are the inputs; column 8 is the target passed to fit().
X = array[:,0:8]
Y = array[:,8]
# feature extraction
# Fix the seed: without random_state the fitted tree, and therefore the printed
# importances, can vary from one run to the next.
model = DecisionTreeClassifier(random_state=0)
model.fit(X, Y)
# One importance value per column of X, followed by the column names for reference.
print(model.feature_importances_)
print(names)