<a href="https://colab.research.google.com/github/mehdiabbasidev/darsman-machine-learning/blob/main/InitialOperation_FeatureSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Dataset download link:
https://drive.google.com/file/d/1wMqhiBq_GGF_5nYKbbMC2njSSjcLU8YD/view?usp=sharing

In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the feature-selection dataset from the mounted Drive and preview the first rows.
dataset_path = '/content/drive/MyDrive/datasets/FeatureSelectionDatasets/feature_selection_dataset1.csv'
data = pd.read_csv(dataset_path)
data.head()

In [None]:
# (rows, columns) of the dataset as loaded.
data.shape

In [None]:
# Show the value frequencies of both columns.
# A notebook cell only renders its final bare expression, so each result is
# printed explicitly; otherwise the first column's counts are silently dropped.
for column in ('feature_16', 'feature_17'):
    print(data[column].value_counts(), end='\n\n')

In [None]:
# Integer-encode the listed columns in place.
# A separate LabelEncoder is fitted for each column and kept in
# `label_encoders`, so every column's value <-> code mapping (`classes_`)
# remains available for `inverse_transform`. Re-fitting one shared encoder
# would overwrite the mapping of every column except the last.
features_to_encode = ['feature_16', 'feature_17']
label_encoders = {}
for feature in features_to_encode:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le
data.head()

### Removing constant features

In [None]:
# (rows, columns) before any constant feature is dropped.
data.shape

In [None]:
# Number of distinct values in each column (missing values are not counted by default).
data.nunique()

In [None]:
# Feature matrix: every column except the 'target' label.
X = data.drop(columns=['target'])
X.shape

In [None]:
# A feature is treated as constant when it holds at most one distinct value.
distinct_counts = X.nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()
constant_features

In [None]:
# Remove the constant columns from the working DataFrame and preview the result.
data = data.drop(columns=constant_features)
data.head()

### Remove quasi-constant features


In [None]:
# (rows, columns) before any quasi-constant feature is dropped.
data.shape

In [None]:
# Data type of each column currently in the DataFrame.
data.dtypes

In [None]:
# Rebuild the feature matrix (all columns but 'target') from the current DataFrame.
X = data.drop(columns=['target'])
X.shape

In [None]:
# Flag quasi-constant features: columns whose most frequent value accounts for
# more than QUASI_CONSTANT_THRESHOLD of the non-missing rows.
QUASI_CONSTANT_THRESHOLD = 0.990

# `.max()` of the normalized counts is the dominant value's share. Unlike
# taking position 0 it needs no sorting, and for a column with no non-missing
# values it yields NaN (which never exceeds the threshold) instead of raising
# an IndexError.
quasi_constants = [
    col
    for col in X.columns
    if X[col].value_counts(normalize=True).max() > QUASI_CONSTANT_THRESHOLD
]
quasi_constants

In [None]:
# Remove the quasi-constant columns from the working DataFrame and preview the result.
data = data.drop(columns=quasi_constants)
data.head()

### Remove duplicated features

In [None]:
# (rows, columns) before any duplicated feature is dropped.
data.shape

In [None]:
# Rebuild the feature matrix (all columns but 'target') from the current DataFrame.
X = data.drop(columns=['target'])
X.shape

In [None]:
# Find columns whose contents exactly duplicate an earlier column.
# Within each group of identical columns the first one is kept and every other
# member is listed exactly once. Columns already flagged are skipped, which
# prevents repeated entries (with A == B == C, C would otherwise be reported
# for both A and B) and avoids needless comparisons.
duplicate_columns = []
already_flagged = set()
column_names = list(X.columns)
for position, col1 in enumerate(column_names):
    if col1 in already_flagged:
        continue
    reference = X[col1]
    for col2 in column_names[position + 1:]:
        if col2 in already_flagged:
            continue
        # Series.equals treats NaNs in the same positions as equal.
        if reference.equals(X[col2]):
            duplicate_columns.append(col2)
            already_flagged.add(col2)
duplicate_columns

In [None]:
# Remove the duplicated columns from the working DataFrame and preview the result.
data = data.drop(columns=duplicate_columns)
data.head()

In [None]:
# Write the DataFrame in its current state to 'out.csv' (relative path), omitting the row index.
data.to_csv('out.csv', index=False)

### Remove features with high correlation

In [None]:
# (rows, columns) before any highly correlated feature is dropped.
data.shape

In [None]:
# Rebuild the feature matrix (all columns but 'target') from the current DataFrame.
X = data.drop(columns=['target'])
X.shape

In [None]:
# Draw an 8x8 inch heatmap of the pairwise Pearson correlations between features.
corrmat = X.corr(method='pearson')
fig, ax = plt.subplots(figsize=(8, 8))
cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(corrmat, cmap=cmap, ax=ax)

In [None]:
# Absolute pairwise correlations: only the strength of a relationship matters
# here, not its sign.
signed_corr = X.corr()
corr_matrix = signed_corr.abs()
print(corr_matrix)

In [None]:
# Keep only the strict upper triangle so every feature pair is inspected once
# and the diagonal (self-correlation) is ignored; masked cells become NaN.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is flagged when it correlates above 0.9 with any column before it.
is_highly_correlated = (upper > 0.9).any(axis=0)
corr_columns = is_highly_correlated[is_highly_correlated].index.tolist()
corr_columns

In [None]:
# Remove the highly correlated columns from the working DataFrame and preview the result.
data = data.drop(columns=corr_columns)
data.head()

### Select features with high Mutual Information

In [None]:
# (rows, columns) before the mutual-information based selection.
data.shape

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Rank the features by their mutual information with the target and keep the
# N_FEATURES_TO_SELECT highest-scoring ones.
N_FEATURES_TO_SELECT = 5
MI_RANDOM_STATE = 0

X = data.drop('target', axis=1)
y = data['target']

# X is deliberately used as-is: mutual_info_classif accepts negative values
# (non-negative input is a requirement of chi2, not of mutual information),
# and shifting X here would put altered feature values into X_selected.

# The estimator adds small random noise to continuous features, so the seed is
# pinned to make the scores, and therefore the selection, reproducible.
score_func = partial(mutual_info_classif, random_state=MI_RANDOM_STATE)

# Earlier filtering steps may leave fewer columns than requested; never ask
# for more features than exist.
k = min(N_FEATURES_TO_SELECT, X.shape[1])

selector = SelectKBest(score_func=score_func, k=k)
selector.fit(X, y)
selected_features = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_features]

print("Selected features:", selected_feature_names)

X_selected = X[selected_feature_names]
X_selected