<a href="https://colab.research.google.com/github/majidiali1/machine-learning/blob/main/FeatureImportance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Feature Selection**

# **Principal Component Analysis**

In [84]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Small demo dataset: feature2 duplicates feature1, feature3 is feature1
# squared, and feature5 is feature4 multiplied by 10.
data = dict(
    feature1=[1, 4, 6, 8, 10],
    feature2=[1, 4, 6, 8, 10],
    feature3=[1, 16, 36, 64, 100],
    feature4=[10, 40, 60, 80, 10],
    feature5=[100, 400, 600, 800, 100],
)
df = pd.DataFrame(data)
print(df)

# Optional standardization step, currently disabled:
# scaler = StandardScaler()
# df = pd.DataFrame(scaler.fit_transform(df))
# print(df)

   feature1  feature2  feature3  feature4  feature5
0         1         1         1        10       100
1         4         4        16        40       400
2         6         6        36        60       600
3         8         8        64        80       800
4        10        10       100        10       100


In [67]:


# Applying PCA
# Keep only the first two principal components.
pca = PCA(n_components=2)

# One fit_transform call both fits the model and projects the data.
# A separate `pca.fit(df_scaled)` is intentionally not issued: `df_scaled` is
# not defined in any visible cell (NameError), and that fit would be replaced
# by this call anyway.
# NOTE(review): PCA runs on the raw, unscaled columns here, so features with
# large magnitudes are likely to dominate -- standardize `df` first if that is
# not intended.
X_pca = pca.fit_transform(df)

explained_variance_ratio = pca.explained_variance_ratio_

# One row per principal component (PC1, PC2, ...), one column per input feature.
feature_importance_matrix = pd.DataFrame(
    pca.components_,
    columns=df.columns,
    index=[f'PC{i}' for i in range(1, len(pca.components_) + 1)],
)


In [None]:
# Two panels side by side: scree plot on the left, component makeup on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(22, 7))
scree_ax, makeup_ax = axes

# Scree plot: share of variance explained by each retained component.
component_numbers = range(1, pca.n_components_ + 1)
scree_ax.bar(component_numbers, pca.explained_variance_ratio_)
scree_ax.set(
    title='Scree Plot',
    xlabel='Principal Component',
    ylabel='Variance Explained',
)
scree_ax.set_xticks(component_numbers)

# Component makeup: absolute coefficient of every feature, grouped by component.
absolute_coefficients = feature_importance_matrix.abs()
absolute_coefficients.plot.bar(ax=makeup_ax)
makeup_ax.set_title('PCA Component Makeup')
makeup_ax.set_ylabel('Absolute Coefficient Value')
makeup_ax.set_xlabel('Principal Components')

# Tighten the layout so the two panels do not overlap, then render.
plt.tight_layout()
plt.show()

# **Feature Correlation Matrix**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of `df` (pandas' default method).
corr_matrix = df.corr()
print(corr_matrix)

# Annotated heatmap of the correlation matrix, values shown with two decimals.
heatmap_options = {"annot": True, "fmt": ".2f", "cmap": 'coolwarm'}
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Feature Correlation Matrix")
plt.show()

# **Mutual Information Regression**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
import seaborn as sns
import matplotlib.pyplot as plt

# Function to calculate MI between each pair of features
def calculate_mutual_information(df, random_state=None):
    """Estimate mutual information for every ordered pair of columns.

    Each off-diagonal cell ``[col, other_col]`` holds the value returned by
    ``mutual_info_regression`` when ``col`` is the single (continuous) input
    feature and ``other_col`` is the target. Each direction is estimated with
    its own call, so the matrix is not guaranteed to be symmetric.

    Args:
        df: DataFrame whose columns are compared pairwise.
        random_state: Forwarded to ``mutual_info_regression``. The estimator
            is randomized, so pass an int to get reproducible values. The
            default ``None`` keeps the original, non-seeded behaviour.

    Returns:
        A square DataFrame indexed and labelled by ``df.columns``. Diagonal
        cells are never computed and stay at their initial value of 0.0.
    """
    mi_matrix = pd.DataFrame(index=df.columns, columns=df.columns, data=0.0)

    for col in df.columns:
        for other_col in df.columns:
            if col == other_col:
                # A feature is not scored against itself; leave the 0.0 placeholder.
                continue
            mi = mutual_info_regression(
                df[[col]],
                df[other_col],
                discrete_features=False,
                random_state=random_state,
            )
            # A single input column yields a one-element result array.
            mi_matrix.loc[col, other_col] = mi[0]

    return mi_matrix

# Pairwise mutual-information estimates for the columns of `df`.
mi_matrix = calculate_mutual_information(df)

# Annotated heatmap of the estimates, values shown with two decimals.
mi_heatmap_options = {"annot": True, "cmap": 'coolwarm', "fmt": ".2f"}
plt.figure(figsize=(10, 8))
sns.heatmap(mi_matrix, **mi_heatmap_options)
plt.title("Mutual Information between Feature Pairs")
plt.show()


# **Detect relationship formula between features using PolynomialFeatures**

In [91]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Demo dataset with known relationships: feature2 equals feature1, feature3 is
# feature1 squared, and feature5 is ten times feature4.
data = dict(
    feature1=[1, 4, 6, 8, 10],
    feature2=[1, 4, 6, 8, 10],
    feature3=[1, 16, 36, 64, 100],
    feature4=[10, 40, 60, 80, 10],
    feature5=[100, 400, 600, 800, 100],
)
df = pd.DataFrame(data)

# Fit a polynomial regression of feature_y on feature_x and build a readable formula, skipping near-zero coefficients
def fit_and_describe_relationship(df, feature_x, feature_y, degree=2,
                                  threshold=1e-4, precision=2):
    """Fit a polynomial regression of one column on another and describe it.

    ``feature_x`` is expanded into polynomial terms up to ``degree`` (without
    a bias column), a ``LinearRegression`` is fitted against ``feature_y``,
    and the fitted model is rendered as a human-readable formula string.

    Args:
        df: DataFrame containing both columns.
        feature_x: Name of the predictor column.
        feature_y: Name of the target column.
        degree: Highest power of ``feature_x`` included in the fit.
        threshold: The intercept and any coefficient whose absolute value is
            not greater than this are left out of the formula.
        precision: Number of decimals used when printing the intercept and
            coefficients. With the default of 2, a kept coefficient smaller
            than 0.005 in magnitude is displayed as ``0.00`` / ``-0.00``;
            raise ``precision`` or ``threshold`` to avoid that.

    Returns:
        A string of the form ``"<feature_y> = <term> + <term> ..."``, or
        ``"<feature_y> = 0"`` when every term was below the threshold.
    """
    x = df[[feature_x]]
    y = df[feature_y]

    poly = PolynomialFeatures(degree=degree, include_bias=False)
    x_poly = poly.fit_transform(x)

    model = LinearRegression().fit(x_poly, y)

    # Constructing formula, ignoring near-zero coefficients
    terms = []
    if abs(model.intercept_) > threshold:
        terms.append(f"{model.intercept_:.{precision}f}")

    # With one input column and no bias column, the k-th coefficient
    # (counting from 1) belongs to feature_x raised to the k-th power.
    for power, coef in enumerate(model.coef_, start=1):
        if abs(coef) > threshold:
            variable = feature_x if power == 1 else f"{feature_x}^{power}"
            terms.append(f"({coef:.{precision}f}) * {variable}")

    formula = " + ".join(terms) if terms else "0"
    return f"{feature_y} = {formula}"

# Fit every unordered pair of columns in both directions (x -> y, then y -> x)
# and collect (predictor, target, formula) tuples in that order.
relationships = []
for first, second in combinations(df.columns, 2):
    for predictor, target in ((first, second), (second, first)):
        description = fit_and_describe_relationship(df, predictor, target, degree=2)
        relationships.append((predictor, target, description))

# Print one line per fitted direction.
for predictor, target, description in relationships:
    print(f"{predictor} vs {target} => {description}")


feature1 vs feature2 => feature2 = (1.00) * feature1
feature2 vs feature1 => feature1 = (1.00) * feature2
feature1 vs feature3 => feature3 = (1.00) * feature1^2
feature3 vs feature1 => feature1 = 1.18 + (0.16) * feature3 + (-0.00) * feature3^2
feature1 vs feature4 => feature4 = -23.73 + (29.83) * feature1 + (-2.52) * feature1^2
feature4 vs feature1 => feature1 = 6.55 + (-0.13) * feature4 + (0.00) * feature4^2
feature1 vs feature5 => feature5 = -237.26 + (298.30) * feature1 + (-25.18) * feature1^2
feature5 vs feature1 => feature1 = 6.55 + (-0.01) * feature5
feature2 vs feature3 => feature3 = (1.00) * feature2^2
feature3 vs feature2 => feature2 = 1.18 + (0.16) * feature3 + (-0.00) * feature3^2
feature2 vs feature4 => feature4 = -23.73 + (29.83) * feature2 + (-2.52) * feature2^2
feature4 vs feature2 => feature2 = 6.55 + (-0.13) * feature4 + (0.00) * feature4^2
feature2 vs feature5 => feature5 = -237.26 + (298.30) * feature2 + (-25.18) * feature2^2
feature5 vs feature2 => feature2 = 6.55 +