<a href="https://colab.research.google.com/github/mehdiabbasidev/darsman-machine-learning/blob/main/FilterMethods_FeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset download links
https://drive.google.com/file/d/1HBcBD5CQpTQxVcJtmri29n6bT6awRTxm/view?usp=sharing

https://drive.google.com/file/d/16oFcV45yrdmAIpt5UaoLZsJjchxAOMnL/view?usp=sharing

https://drive.google.com/file/d/1_O0zWQsAh9luNFqz5DgUQ7pc_FDsMf1z/view?usp=sharing

https://drive.google.com/file/d/1_O0zWQsAh9luNFqz5DgUQ7pc_FDsMf1z/view?usp=sharing

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
# The read_csv calls in the cells below load their datasets from
# /content/drive/MyDrive/datasets/, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Correlation Coefficient

In [None]:
# Correlation filter: flag features that are almost collinear with another
# feature so that only one member of each highly-correlated pair is kept.
data = pd.read_csv('/content/drive/MyDrive/datasets/out_initial_feature_selection.csv')
data.head()

# Separate the predictors from the label; correlations are computed on predictors only.
X = data.drop(['target'], axis=1)
y = data['target']

# Absolute Pearson correlation above which a feature is treated as redundant.
CORR_THRESHOLD = 0.95

corrmat = X.corr(method='pearson')
cmap = sns.diverging_palette(220, 20, as_cmap=True)
fig, ax = plt.subplots()
fig.set_size_inches(8,8)
# Draw explicitly on the axes created above instead of relying on pyplot's current axes.
sns.heatmap(corrmat, cmap=cmap, ax=ax)

# Pearson is DataFrame.corr's default method, so reuse the matrix computed for
# the heatmap rather than running the pairwise computation a second time.
corr_matrix = corrmat.abs()
corr_matrix

# Keep only the strictly-upper triangle (k=1 drops the diagonal) so every pair is
# inspected once; everything else becomes NaN, which never exceeds the threshold.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# A column is flagged when it is too correlated with any column that precedes it.
corr_columns = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]
corr_columns

# Build the survivor list from the predictors (X), not from `data`: iterating
# over data.columns would always put the 'target' label into the feature list.
remaining_features = [feature for feature in X.columns if feature not in corr_columns]
remaining_features

### Mutual Information

#### Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

from functools import partial

# Mutual-information feature selection for a classification target ('Wine').
df = pd.read_csv('/content/drive/MyDrive/datasets/wine.csv')
df.head()
df.shape
X = df.drop(labels=['Wine'], axis=1)
y = df['Wine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Impute once and reuse the same frame for scoring, fitting and transforming.
# Previously the selector was fitted on the raw frame but transformed a
# fillna(0) copy, so the two steps could see different inputs.
X_train_filled = X_train.fillna(0)

# Per scikit-learn's API, the mutual-information estimators take a random_state
# that controls the small noise they add to continuous features. Pinning it
# makes the scores and the selected columns reproducible across runs.
mi_scorer = partial(mutual_info_classif, random_state=0)

# One MI score per feature, labelled with the feature name.
mutual_info = pd.Series(mi_scorer(X_train_filled, y_train), index=X_train.columns)
mutual_info.sort_values(ascending=False)

mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))


# Keep the 5 features with the highest mutual information.
k_best_features = SelectKBest(mi_scorer, k=5)
k_best_features.fit(X_train_filled, y_train)
X_train.columns[k_best_features.get_support()]


# As in the original cell, X_train is replaced by the reduced array returned by transform().
X_train = k_best_features.transform(X_train_filled)
X_train.shape

#### Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectPercentile


from functools import partial

# Mutual-information feature selection for a continuous target ('SalePrice').
df = pd.read_csv('/content/drive/MyDrive/datasets/housing_data.csv')
df.head()

# Restrict to integer/float columns; the remaining columns are not used in this section.
numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = list(df.select_dtypes(include=numerics).columns)

data = df[numerical_features]
data.head()

X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Missing values are replaced with 0 once; the same frame feeds every step below.
X_train_filled = X_train.fillna(0)

# Per scikit-learn's API, the mutual-information estimators take a random_state
# that controls the small noise they add to continuous features. Pinning it
# makes the scores and the selected columns reproducible across runs.
mi_scorer = partial(mutual_info_regression, random_state=101)

mutual_info = mi_scorer(X_train_filled, y_train)
mutual_info

# One MI score per feature, labelled with the feature name.
mi_series = pd.Series(mutual_info, index=X_train.columns)
mi_series.sort_values(ascending=False)

mi_series.sort_values(ascending=False).plot.bar(figsize=(20,8))

# The target is continuous, so the selector must score with the *regression*
# estimator. The classification variant would treat each distinct SalePrice
# value as a class label, and it is not imported in this cell.
# fit() returns the fitted selector, so a single call is enough.
k_percentile_features = SelectPercentile(mi_scorer, percentile=10).fit(X_train_filled, y_train)
X_train.columns[k_percentile_features.get_support()]

# As in the original cell, X_train is replaced by the reduced array returned by transform().
X_train = k_percentile_features.transform(X_train_filled)
X_train.shape


### Univariate Feature Selection

#### Classification


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

# Univariate (ANOVA F-test) feature selection for the Paribas classification data.
df = pd.read_csv('/content/drive/MyDrive/datasets/paribas.csv')
df.head()

# Only integer/float columns are carried forward into the test.
numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = df.select_dtypes(include=numerics).columns.tolist()


data = df[numerical_features]

# 'ID' and 'target' are removed from the predictors; 'target' is the label.
X = data.drop(columns=['ID', 'target'])
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Missing values are replaced with 0 before scoring.
univariate = f_classif(X_train.fillna(0), y_train)
univariate

# Element 1 of the f_classif result is the per-feature p-value array (per
# scikit-learn's API); label it with the feature names and order it from the
# largest p-value to the smallest.
univariate = pd.Series(univariate[1], index=X_train.columns)
univariate = univariate.sort_values(ascending=False)
univariate

univariate.sort_values(ascending=False).plot.bar(figsize=(20,8))

# Keep the 10 features ranked best by f_classif.
k_best_features = SelectKBest(score_func=f_classif, k=10)
k_best_features = k_best_features.fit(X_train.fillna(0), y_train)
X_train.columns[k_best_features.get_support()]

# X_train is replaced by the reduced array returned by transform().
X_train = k_best_features.transform(X_train.fillna(0))
X_train.shape

#### Regression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile

# Univariate (F-test) feature selection for a continuous target ('SalePrice').
df = pd.read_csv('/content/drive/MyDrive/datasets/housing_data.csv')
df.head()

# Restrict to integer/float columns; the remaining columns are not used in this section.
numerics = ['int16', 'int32','int64', 'float16', 'float32', 'float64']
numerical_features = list(df.select_dtypes(include=numerics).columns)

data = df[numerical_features]
data.head()

X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Missing values are replaced with 0 once; the same frame feeds every step
# below instead of being re-imputed on each call.
X_train_filled = X_train.fillna(0)

univariate = f_regression(X_train_filled, y_train)
univariate

# Element 1 of the f_regression result is the per-feature p-value array (per
# scikit-learn's API). A smaller p-value means a stronger association, so
# after this descending sort the *least* significant features come first.
univariate = pd.Series(univariate[1], index=X_train.columns).sort_values(ascending=False)
univariate

# Already sorted above, so plot directly.
univariate.plot.bar(figsize=(20,8))

# Keep the top 40% of features ranked by f_regression. fit() returns the
# fitted selector, so one call is enough; the second fit was redundant work.
k_percentile_features = SelectPercentile(f_regression, percentile=40).fit(X_train_filled, y_train)
X_train.columns[k_percentile_features.get_support()]


# As in the original cell, X_train is replaced by the reduced array returned by transform().
X_train = k_percentile_features.transform(X_train_filled)
X_train.shape


### Chi Square

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
%matplotlib inline

In [None]:
# Read the Titanic CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv(
    '/content/drive/MyDrive/datasets/titanic.csv',
)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is re-fitted for each column in turn, so after this cell
# `label_encoder` holds the fit for 'Embarked' (the last column processed).
label_encoder = LabelEncoder()
for _column in ('Sex', 'Embarked'):
    df[_column] = label_encoder.fit_transform(df[_column])
df.head()

In [None]:
# Predictors for the chi-square test and the label they are tested against.
X = df.loc[:, ['Pclass', 'Sex', 'Embarked']]
y = df.loc[:, 'Survived']

In [None]:
# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [None]:
# chi_scores[0] => chi-squared statistic per feature
# chi_scores[1] => p-value per feature
# Missing values are replaced with 0 before the test is run.
chi_scores = chi2(
    X_train.fillna(0),
    y_train,
)
chi_scores

In [None]:
# Label each chi-squared statistic with its feature name and order from largest to smallest.
chi_squared = pd.Series(chi_scores[0], index=X.columns).sort_values(ascending=False)
chi_squared.plot.bar()

In [None]:
# Label each p-value with its feature name and order from largest to smallest.
p_value = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)
p_value.plot.bar()