In [1]:
import pandas as pd
import sklearn

In [2]:
sklearn.__version__

'1.4.2'

# Preprocessing

## Ordinal Encoding

แปลงตัวเลขที่มีลำดับ อย่าไปใช้กับแยก LQBTQ, หมูหมาแมว

In [3]:
from sklearn.preprocessing import OrdinalEncoder

In [4]:
encoder = OrdinalEncoder()

In [5]:
X = [['A'],['B'], ['C'], ['D']]

In [6]:
encoder.fit(X)

In [7]:
x_ = encoder.transform(X)

In [8]:
x_

array([[0.],
       [1.],
       [2.],
       [3.]])

In [9]:
encoder.categories_

[array(['A', 'B', 'C', 'D'], dtype=object)]

In [10]:
X = [['A',10],['B',9.5], ['C',8], ['D',16]]

In [11]:
encoder.fit(X)

In [12]:
x_ = encoder.transform(X)

In [13]:
x_

array([[0., 2.],
       [1., 1.],
       [2., 0.],
       [3., 3.]])

In [14]:
encoder.categories_

[array(['A', 'B', 'C', 'D'], dtype=object),
 array([8, 9.5, 10, 16], dtype=object)]

In [15]:
encoder.inverse_transform(x_)

array([['A', 10],
       ['B', 9.5],
       ['C', 8],
       ['D', 16]], dtype=object)

# Label Encoder

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
encoder = LabelEncoder()

In [18]:
Y = ['ant','bat','cat','dog']

In [19]:
encoder.fit(Y)

In [20]:
y_ = encoder.transform(Y)

In [21]:
y_

array([0, 1, 2, 3])

In [22]:
encoder.classes_

array(['ant', 'bat', 'cat', 'dog'], dtype='<U3')

In [23]:
encoder.inverse_transform(y_)

array(['ant', 'bat', 'cat', 'dog'], dtype='<U3')

# One-hot Encoding

ใช้เมื่อข้อมูลมีมากกว่า 2 ค่า คือ ตั้งแต่ 3 ค่าขึ้นไป

ข้อมูลไม่อยู่ในลำดับ ไม่สนใจลำดับ

In [24]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
encoder = OneHotEncoder()

In [26]:
X = [['ant'],['bat'], ['cat'], ['bat'],['ant'], ['cat']]

In [27]:
encoder.fit(X)

In [28]:
X_ = encoder.transform(X)

In [29]:
X_

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

# Sparse Matrix ออกแบบมาเพื่อประหยัดหน่วยความจำ

In [30]:
X_.todense()

matrix([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.]])

In [31]:
encoder.categories_

[array(['ant', 'bat', 'cat'], dtype=object)]

In [32]:
encoder.inverse_transform(x_)

ValueError: Shape of the passed X data is not correct. Expected 3 columns, got 2.

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(X, columns=['animal'])

In [None]:
df

In [None]:
pd.get_dummies(df)

In [33]:
pd.get_dummies(df, dtype=int)

NameError: name 'df' is not defined

In [34]:
df = pd.DataFrame([['male'], ['female'], ['female'], ['male']], columns=['gender'])

In [35]:
df_ = pd.get_dummies(df, dtype=int)

In [36]:
df_.columns

Index(['gender_female', 'gender_male'], dtype='object')

In [37]:
df_.drop(columns=df_.columns[0])

Unnamed: 0,gender_male
0,1
1,0
2,0
3,1


# MultiLabel Binarizer

## ใช้เมื่อข้อมูลมีหลายค่า และไม่มีลำดับ และไม่สนใจลำดับ และไม่สนใจจำนวนค่า

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer

In [39]:
encoder = MultiLabelBinarizer()

In [40]:
X = [['sci-fi','comedy'],
     ['comedy'],
     ['drama','romance'],
     ['sci-fi','drama','action']]

In [41]:
encoder.fit(X)

In [42]:
X_  = encoder.transform(X)

In [43]:
X_

array([[0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 1, 0],
       [1, 0, 1, 0, 1]])

In [44]:
encoder.inverse_transform(X_)

[('comedy', 'sci-fi'),
 ('comedy',),
 ('drama', 'romance'),
 ('action', 'drama', 'sci-fi')]

In [45]:
encoder.classes_

array(['action', 'comedy', 'drama', 'romance', 'sci-fi'], dtype=object)

# K Bins Discretizer

## แบ่งข้อมูล continue ให้เป็นกลุ่ม , กำหนดกลุ่มด้วย n_bins เช่น กลุ่มของเงินเดือน เป็นช่วง

In [46]:
from sklearn.preprocessing import KBinsDiscretizer

In [47]:
X = [[10],[11],[12],[16],[21],[22],[35]]

In [48]:
encoder = KBinsDiscretizer(n_bins=3)

In [49]:
encoder.fit(X)

In [50]:
X_ = encoder.transform(X)

In [51]:
X_

<7x3 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [52]:
X_.todense()

matrix([[1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]])

In [53]:
encoder.bin_edges_

array([array([10., 12., 21., 35.])], dtype=object)

In [54]:
encoder.inverse_transform(X_)

array([[11. ],
       [11. ],
       [16.5],
       [16.5],
       [28. ],
       [28. ],
       [28. ]])

# กำหนด strategy เป็น quantile สนใจลำดับ

In [55]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

In [56]:
encoder.fit_transform(X)

array([[0.],
       [0.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.]])

# กำหนด strategy เป็น uniform สนใจ distance ไม่สนใจค่า

In [57]:
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

In [58]:
encoder.fit_transform(X)



array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [2.]])

# MinMaxScaler

## เหมาะกับการคงอัตราส่วนเดิม

## ปรับข้อมูลให้อยู่ในช่วง 0-1 หรือ -1 ถึง 1 โดยใช้สูตร (x - min) / (max - min) หรือ (x - mean) / std ใช้กับข้อมูลที่มีการกระจายตัวเป็นแบบ normal distribution หรือ uniform distribution และไม่มี outlier

In [59]:
from sklearn.preprocessing import MinMaxScaler

In [60]:
X = [[1],[2],[3.9],[4],[5]]

In [61]:
scaler = MinMaxScaler()

In [62]:
scaler.fit(X)

In [63]:
X_ = scaler.transform(X)

In [64]:
X_

array([[0.   ],
       [0.25 ],
       [0.725],
       [0.75 ],
       [1.   ]])

In [65]:
scaler.inverse_transform(X_)

array([[1. ],
       [2. ],
       [3.9],
       [4. ],
       [5. ]])

# Normalization

In [66]:
from sklearn.preprocessing import Normalizer

In [67]:
normalizer = Normalizer()

In [68]:
X = [[4,1,2,2],
     [1,3,9,3],
     [5,7,5,1]]

In [69]:
normalizer.fit(X)

In [70]:
X_ = normalizer.transform(X)

In [71]:
X_

array([[0.8, 0.2, 0.4, 0.4],
       [0.1, 0.3, 0.9, 0.3],
       [0.5, 0.7, 0.5, 0.1]])

In [72]:
normalizer.inverse_transform(X_)

AttributeError: 'Normalizer' object has no attribute 'inverse_transform'

In [73]:
normalizer = Normalizer(norm='l1')

In [74]:
normalizer.fit(X)

In [75]:
x_ = normalizer.transform(X)

In [76]:
X_

array([[0.8, 0.2, 0.4, 0.4],
       [0.1, 0.3, 0.9, 0.3],
       [0.5, 0.7, 0.5, 0.1]])

# Standard Scaler

## ปรับข้อมูลให้อยู่ในช่วง -1 ถึง 1 โดยใช้สูตร (x - mean) / std ใช้กับข้อมูลที่มีการกระจายตัวเป็นแบบ normal distribution หรือ uniform distribution และไม่มี outlier

In [77]:
from sklearn.preprocessing import StandardScaler

In [78]:
scaler = StandardScaler()

In [79]:
X = [[1,2],[2,3],[3,4],[4,5]]

In [80]:
scaler.fit(X)

In [81]:
X_ = scaler.transform(X)

In [82]:
X_

array([[-1.34164079, -1.34164079],
       [-0.4472136 , -0.4472136 ],
       [ 0.4472136 ,  0.4472136 ],
       [ 1.34164079,  1.34164079]])

In [83]:
scaler.mean_, scaler.var_

(array([2.5, 3.5]), array([1.25, 1.25]))

In [84]:
scaler.inverse_transform(X_)

array([[1., 2.],
       [2., 3.],
       [3., 4.],
       [4., 5.]])

# Imutation

In [85]:
import numpy as np

In [86]:
X = [[np.nan, np.nan, 3],[4,1,6],[10,2,9],[10,2,9]]

In [87]:
from sklearn.impute import SimpleImputer

In [88]:
imputer = SimpleImputer(strategy='mean')

In [89]:
imputer.fit(X)

In [90]:
X_ = imputer.transform(X)

In [91]:
X_

array([[ 8.        ,  1.66666667,  3.        ],
       [ 4.        ,  1.        ,  6.        ],
       [10.        ,  2.        ,  9.        ],
       [10.        ,  2.        ,  9.        ]])

In [92]:
imputer.inverse_transform(X_)

ValueError: 'inverse_transform' works only when 'SimpleImputer' is instantiated with 'add_indicator=True'. Got 'add_indicator=False' instead.

In [93]:
imputer = SimpleImputer(strategy='mean', add_indicator=True)

In [94]:
imputer.fit(X)

In [95]:
X_ = imputer.transform(X)

In [96]:
X_

array([[ 8.        ,  1.66666667,  3.        ,  1.        ,  1.        ],
       [ 4.        ,  1.        ,  6.        ,  0.        ,  0.        ],
       [10.        ,  2.        ,  9.        ,  0.        ,  0.        ],
       [10.        ,  2.        ,  9.        ,  0.        ,  0.        ]])

In [97]:
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

array([[10.,  2.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [98]:
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(X)

array([[10.,  2.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [99]:
imputer = SimpleImputer(strategy='constant', fill_value=-1)
imputer.fit_transform(X)

array([[-1., -1.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

In [100]:
X = [['dog'],['dog'],['cat'],[np.nan]]

In [101]:
imputer = SimpleImputer(strategy='most_frequent') # ใช้กับของที่นับได้
imputer.fit_transform(X)

array([['dog'],
       ['dog'],
       ['cat'],
       ['dog']], dtype=object)

In [102]:
imputer = SimpleImputer(strategy='constant')
imputer.fit_transform(X)

array([['dog'],
       ['dog'],
       ['cat'],
       ['missing_value']], dtype=object)

In [103]:
imputer = SimpleImputer(strategy='constant', fill_value='cat')
imputer.fit_transform(X)

array([['dog'],
       ['dog'],
       ['cat'],
       ['cat']], dtype=object)

# KNN Imputer

In [104]:
X = [[np.nan, np.nan,3],[4,1,6],[10,2,9],[10,2,9]]

In [105]:
from sklearn.impute import KNNImputer

In [106]:
imputer = KNNImputer(n_neighbors=1)

In [107]:
imputer.fit(X)

In [108]:
X_ = imputer.transform(X)

In [109]:
X_

array([[ 4.,  1.,  3.],
       [ 4.,  1.,  6.],
       [10.,  2.,  9.],
       [10.,  2.,  9.]])

# Feature Selection

# แบบ (X)

## feature selection คือการเลือก feature ที่สำคัญ ลด dimension ของข้อมูล ลด overfitting และเวลาทำงาน โดยใช้วิธีการต่างๆ เช่น filter, wrapper, embedded

## drop variance ต่ำ คือการลบ feature ที่มีค่า variance ต่ำ ไม่มีข้อมูลที่สำคัญ
## nan มาก คือการลบ feature ที่มีค่า nan มาก หรือ correlation กับ target ต่ำ
## duplicate column คือการลบ feature ที่มีค่าซ้ำกับ feature อื่น

# แบบ (X,Y)

## ถ้าทำ regression ใช้ correlation pearson, ใช้ F-stat

### Entropy คือ ความสามารถในการแยกข้อมูล ค่า 0 คือ ไม่สามารถแยกข้อมูลได้ ค่า 1 คือ สามารถแยกข้อมูลได้

## ถ้าทำ Classification ใช้ chi-square, mutual information, Gini index

# แบบ (X,Y,ML)

## Machine Learning ใช้ feature importance ของ model ที่ใช้ โดยใช้วิธีการต่างๆ เช่น permutation importance, drop column importance, shap value

## forward strategy คือการเริ่มจาก feature ว่าง แล้วเพิ่ม feature ทีละ 1 ตัว จนกว่าจะไม่มีการเพิ่ม feature ได้อีก ใช้กับข้อมูลที่มี feature น้อย และไม่มี noise

## backward strategy คือการเริ่มจาก feature ทั้งหมด แล้วลบ feature ทีละ 1 ตัว จนกว่าจะไม่มีการลบ feature ได้อีก ใช้กับข้อมูลที่มี feature มาก และมี noise ทำแบบ recursive ได้

## Exhaustive search คือการทดลองทุกๆ กรณี ใช้กับข้อมูลที่มี feature น้อย และไม่มี noise (ดีที่สุด แต่ใช้ resource เยอะมาก)

## Variance Threshold

In [110]:
from sklearn.feature_selection import VarianceThreshold

In [111]:
X = [[1,2,3],[5,2,4],[10,2.2,5]]

In [112]:
X = pd.DataFrame(X, columns=['A','B','C'])

In [113]:
X

Unnamed: 0,A,B,C
0,1,2.0,3
1,5,2.0,4
2,10,2.2,5


In [114]:
p = 0.8
th = p*(1-p)
selector = VarianceThreshold(threshold=th)

In [115]:
selector.fit(X)

In [116]:
selector.variances_

array([1.35555556e+01, 8.88888889e-03, 6.66666667e-01])

In [117]:
X_ = selector.transform(X)

In [118]:
X_

array([[ 1.,  3.],
       [ 5.,  4.],
       [10.,  5.]])

## .get_feature_names_out() คือ ชื่อ column ที่เหลือ หลังจากทำการลบ feature ที่มี variance ต่ำ

In [119]:
selector.get_feature_names_out()

array(['A', 'C'], dtype=object)

# Classification

## SelectKBest คือการเลือก feature ที่ดีที่สุด k ตัว
## SelectPercentile คือการเลือก feature ที่ดีที่สุด % ตัว

คือเงื่อนไขการตัดออก

In [135]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile, GenericUnivariateSelect,\
                                        chi2, f_classif, mutual_info_classif

In [136]:
X = load_iris(as_frame=True)

In [137]:
X.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [125]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

selector = SelectKBest(score_func=score_func, k=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
['petal length (cm)' 'petal width (cm)']


In [126]:
selector.transform(X.data)

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1],
       [1.5, 0.2],
       [1.6, 0.2],
       [1.4, 0.1],
       [1.1, 0.1],
       [1.2, 0.2],
       [1.5, 0.4],
       [1.3, 0.4],
       [1.4, 0.3],
       [1.7, 0.3],
       [1.5, 0.3],
       [1.7, 0.2],
       [1.5, 0.4],
       [1. , 0.2],
       [1.7, 0.5],
       [1.9, 0.2],
       [1.6, 0.2],
       [1.6, 0.4],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.6, 0.2],
       [1.6, 0.2],
       [1.5, 0.4],
       [1.5, 0.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.2, 0.2],
       [1.3, 0.2],
       [1.4, 0.1],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.3, 0.3],
       [1.3, 0.3],
       [1.3, 0.2],
       [1.6, 0.6],
       [1.9, 0.4],
       [1.4, 0.3],
       [1.6, 0.2],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [4.5, 1.5],
       [4.9,

In [129]:
# score_func = chi2
score_func = f_classif
# score_func = mutual_info_classif

selector = SelectKBest(score_func=score_func, k=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[ 119.26450218   49.16004009 1180.16118225  960.0071468 ]
['petal length (cm)' 'petal width (cm)']


In [130]:
selector.transform(X.data)

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1],
       [1.5, 0.2],
       [1.6, 0.2],
       [1.4, 0.1],
       [1.1, 0.1],
       [1.2, 0.2],
       [1.5, 0.4],
       [1.3, 0.4],
       [1.4, 0.3],
       [1.7, 0.3],
       [1.5, 0.3],
       [1.7, 0.2],
       [1.5, 0.4],
       [1. , 0.2],
       [1.7, 0.5],
       [1.9, 0.2],
       [1.6, 0.2],
       [1.6, 0.4],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.6, 0.2],
       [1.6, 0.2],
       [1.5, 0.4],
       [1.5, 0.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.2, 0.2],
       [1.3, 0.2],
       [1.4, 0.1],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.3, 0.3],
       [1.3, 0.3],
       [1.3, 0.2],
       [1.6, 0.6],
       [1.9, 0.4],
       [1.4, 0.3],
       [1.6, 0.2],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [4.5, 1.5],
       [4.9,

In [131]:
# score_func = chi2
# score_func = f_classif
score_func = mutual_info_classif

selector = SelectKBest(score_func=score_func, k=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[0.51259163 0.27261057 0.99058928 0.98835034]
['petal length (cm)' 'petal width (cm)']


In [132]:
selector.transform(X.data)

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1],
       [1.5, 0.2],
       [1.6, 0.2],
       [1.4, 0.1],
       [1.1, 0.1],
       [1.2, 0.2],
       [1.5, 0.4],
       [1.3, 0.4],
       [1.4, 0.3],
       [1.7, 0.3],
       [1.5, 0.3],
       [1.7, 0.2],
       [1.5, 0.4],
       [1. , 0.2],
       [1.7, 0.5],
       [1.9, 0.2],
       [1.6, 0.2],
       [1.6, 0.4],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.6, 0.2],
       [1.6, 0.2],
       [1.5, 0.4],
       [1.5, 0.1],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.2, 0.2],
       [1.3, 0.2],
       [1.4, 0.1],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.3, 0.3],
       [1.3, 0.3],
       [1.3, 0.2],
       [1.6, 0.6],
       [1.9, 0.4],
       [1.4, 0.3],
       [1.6, 0.2],
       [1.4, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [4.7, 1.4],
       [4.5, 1.5],
       [4.9,

In [139]:
# score_func = chi2
# score_func = f_classif
score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=2)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[0.49552719 0.2577079  0.99994632 0.98035882]
['petal length (cm)' 'petal width (cm)']


In [148]:
score_func = chi2
# score_func = f_classif
#score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='percentile', param=30)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
['petal length (cm)']


In [145]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='fpr', param=0.05)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.pvalues_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
[4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
['sepal length (cm)' 'petal length (cm)' 'petal width (cm)']


In [146]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='fdr', param=0.05)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.pvalues_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
[4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
['sepal length (cm)' 'petal length (cm)' 'petal width (cm)']


In [147]:
score_func = chi2
# score_func = f_classif
# score_func = mutual_info_classif

selector = GenericUnivariateSelect(score_func=score_func, mode='fwe', param=0.05)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.pvalues_)
print(selector.get_feature_names_out())

[ 10.81782088   3.7107283  116.31261309  67.0483602 ]
[4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
['sepal length (cm)' 'petal length (cm)' 'petal width (cm)']


# Regression

In [149]:
from sklearn.feature_selection import r_regression, f_regression, mutual_info_regression

In [150]:
from sklearn.datasets import load_diabetes

In [151]:
X = load_diabetes(as_frame=True)

In [152]:
X.data

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [153]:
X.target

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [155]:
score_func = r_regression
# score_func = f_regression
# score_func = mutual_info_regression

selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=5)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[ 0.18788875  0.043062    0.58645013  0.44148176  0.21202248  0.17405359
 -0.39478925  0.43045288  0.56588259  0.38248348]
['bmi' 'bp' 's4' 's5' 's6']


In [156]:
# score_func = r_regression
score_func = f_regression
# score_func = mutual_info_regression

selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=5)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[ 16.10137401   0.81742349 230.65376449 106.52013086  20.71056745
  13.74607917  81.23965868 100.06926441 207.27119362  75.3996832 ]
['bmi' 'bp' 's3' 's4' 's5']


In [158]:
# score_func = r_regression
# score_func = f_regression
score_func = mutual_info_regression

selector = GenericUnivariateSelect(score_func=score_func, mode='k_best', param=5)
selector.fit(X.data, X.target)
print(selector.scores_)
print(selector.get_feature_names_out())

[0.         0.03158618 0.17153955 0.05627651 0.0652929  0.0103619
 0.06902771 0.09106715 0.14566843 0.11852245]
['bmi' 's3' 's4' 's5' 's6']


# export model

In [159]:
import joblib

In [160]:
joblib.dump(selector, 'feature_eng.pkl')

['feature_eng.pkl']

# Feature Importance

In [166]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [167]:
X = load_iris(as_frame=True)

In [168]:
X.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [169]:
clf = RandomForestClassifier()

In [170]:
clf.fit(X.data, X.target)

In [171]:
clf.feature_importances_

array([0.10269758, 0.02256448, 0.44971872, 0.42501921])

In [172]:
selector = SelectFromModel(clf, prefit=True)

In [173]:
selector.get_support()

array([False, False,  True,  True])

In [174]:
X.data.columns[selector.get_support()]

Index(['petal length (cm)', 'petal width (cm)'], dtype='object')

# Foward Selection

In [176]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

In [177]:
clf = KNeighborsClassifier(n_neighbors=3)

In [178]:
selector = SequentialFeatureSelector(clf, n_features_to_select='auto', direction='forward')
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

['sepal length (cm)' 'petal width (cm)']


# Backward Selection

In [180]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

In [181]:
clf = KNeighborsClassifier(n_neighbors=3)

In [182]:
selector = SequentialFeatureSelector(clf, n_features_to_select='auto', direction='backward')
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

['petal length (cm)' 'petal width (cm)']


# Recursive Feature Elimination

In [183]:
from sklearn.feature_selection import RFE

In [185]:
clf = RandomForestClassifier()

In [186]:
selector = RFE(clf)

In [187]:
selector.fit(X.data, X.target)

In [188]:
selector.get_feature_names_out()

array(['petal length (cm)', 'petal width (cm)'], dtype=object)

# Cross Validation

https://www.google.co.th/url?sa=i&url=https%3A%2F%2Fscikit-learn.org%2Fstable%2Fmodules%2Fcross_validation.html&psig=AOvVaw1JmsksqQmhxpcf77R-6Plo&ust=1716711955342000&source=images&cd=vfe&opi=89978449&ved=0CBIQjRxqFwoTCKCn78OwqIYDFQAAAAAdAAAAABAE

## Cross Validation คือการแบ่งข้อมูลเป็นส่วนๆ แล้วทำการ train และ test หลายรอบ โดยใช้วิธีการต่างๆ เช่น KFold, StratifiedKFold, LeaveOneOut, LeavePOut, ShuffleSplit, StratifiedShuffleSplit, TimeSeriesSplit

In [191]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [192]:
clf = RandomForestClassifier()

In [193]:
slector = RFECV(clf, cv=StratifiedKFold(5), scoring='accuracy', min_features_to_select=1, n_jobs=2)
selector.fit(X.data, X.target)
print(selector.get_feature_names_out())

['petal length (cm)' 'petal width (cm)']


# Exhaustive (All combinations)

****ดีที่สุด แต่ใช้เวลานาน*****

## Exhaustive คือ การทดลองทุกๆ กรณี ใช้กับข้อมูลที่มี feature น้อย และไม่มี noise โดยใช้วิธีการต่างๆ เช่น ExhaustiveFeatureSelector และ ExhaustiveFeatureSelectorCV โดยใช้เวลานาน

In [215]:
from itertools import combinations, chain

In [216]:
combinations([1,2,3], 2)

<itertools.combinations at 0x1a958774c70>

In [217]:
for i in range(1,5):
     for x in combinations([1,2,3,4], i):
          print(x)

(1,)
(2,)
(3,)
(4,)
(1, 2)
(1, 3)
(1, 4)
(2, 3)
(2, 4)
(3, 4)
(1, 2, 3)
(1, 2, 4)
(1, 3, 4)
(2, 3, 4)
(1, 2, 3, 4)


In [226]:
n_features = X.data.shape[1]
chain.from_iterable(combinations(range(n_features), i) for i in range(1, 5))

<itertools.chain at 0x1a9589508b0>

In [227]:
from sklearn.model_selection import cross_val_score

In [228]:
def EFS(estimator, X, Y, cv=5, verbose=False):
    n_features = X.shape[1]
    subsets = chain.from_iterable(combinations(range(n_features), i) for i in range(1, 5))
    best_score = -np.inf
    best_subset = None
    for i, subset in enumerate(subsets):
        subset = list(subset)
        score = cross_val_score(estimator, X.iloc[:, subset], Y, cv=cv).mean()
        if score > best_score:
            best_score = score
            best_subset = subset
        if verbose:
            print(i, score, subset)
    return X.columns[best_subset]

In [230]:
clf = KNeighborsClassifier(n_neighbors=3)
selected = EFS(clf, X.data, X.target, cv=StratifiedKFold(5), verbose=True)
print(selected)

0 0.6199999999999999 [0]
1 0.5066666666666666 [1]
2 0.9400000000000001 [2]
3 0.96 [3]
4 0.74 [0, 1]
5 0.9466666666666667 [0, 2]
6 0.9533333333333334 [0, 3]
7 0.9333333333333333 [1, 2]
8 0.9400000000000001 [1, 3]
9 0.9533333333333334 [2, 3]
10 0.9400000000000001 [0, 1, 2]
11 0.9466666666666667 [0, 1, 3]
12 0.9733333333333334 [0, 2, 3]
13 0.96 [1, 2, 3]
14 0.9666666666666668 [0, 1, 2, 3]
Index(['sepal length (cm)', 'petal length (cm)', 'petal width (cm)'], dtype='object')
