### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, chi2

### Loading the dataset

In [None]:
# Read the CSV into the working DataFrame. `pd` comes from the imports cell at
# the top of the notebook, so pandas is not re-imported here.
df = pd.read_csv("dataset.csv")  # Change to your actual file name

# Preview the first rows; as the last expression this is the cell's output
df.head()

### Dataset Transformation/Feature Selection

Check for missing values and column data types:

In [None]:
# Number of missing entries in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Data type pandas inferred for each column
column_dtypes = df.dtypes
print(column_dtypes)

Check for pairs of highly correlated features:

In [None]:
# Compute the correlation matrix over numeric (and boolean) columns only.
# Calling df.corr() on a frame that still contains string/object columns
# raises in recent pandas versions, so the columns are filtered explicitly.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Set correlation threshold
threshold = 0.9  # Change to 0.8 if needed

# Find highly correlated feature pairs.
# Only the upper triangle of the (symmetric) matrix is scanned, so each pair is
# reported once instead of twice as (A, B) and (B, A). The diagonal is skipped
# because every feature correlates perfectly with itself. NaN correlations
# (e.g. constant columns) never pass the comparison and are left out.
corr_columns = corr_matrix.columns
n_columns = len(corr_columns)
high_corr_features = [
    (corr_columns[i], corr_columns[j], corr_matrix.iat[i, j])
    for i in range(n_columns)
    for j in range(i + 1, n_columns)
    if abs(corr_matrix.iat[i, j]) > threshold
]

# Convert to DataFrame for better readability
high_corr_df = pd.DataFrame(high_corr_features, columns=["Feature 1", "Feature 2", "Correlation"])

print(high_corr_df)

We have to decide whether to commit to correlation-based feature selection, Random Forest feature-importance selection, or some other form of feature selection.

Chi-Square Test (for categorical data)

In [None]:
# Split the frame into the feature matrix and the target column
X = df.drop(columns=["target"])  # Features
y = df["target"]  # Target variable

# Select the top 5 features based on the chi-square test.
# k is clamped to the number of available feature columns so the cell still
# runs on datasets with fewer than 5 features.
# NOTE(review): chi2 expects non-negative feature values (counts, one-hot
# encodings, ...) — confirm the columns of X satisfy this before relying on it.
n_top_features = min(5, X.shape[1])
chi_selector = SelectKBest(score_func=chi2, k=n_top_features)
X_selected = chi_selector.fit_transform(X, y)

# Get selected feature names from the selector's boolean support mask
selected_features = X.columns[chi_selector.get_support()]
print("Selected Features:", selected_features)


Random Forest Feature Selection:

In [None]:
# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y) #For the feature selection, we can use the whole dataset, dont need train-test split

# Get feature importance scores
importance = model.feature_importances_

# Convert to DataFrame
feature_importance = pd.DataFrame({"Feature": X.columns, "Importance": importance})
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

In [None]:
# Calculate cumulative importance
feature_importance["Cumulative Importance"] = feature_importance["Importance"].cumsum()

# Select features contributing to top 90% importance
threshold = 0.90  # Change to 0.95 for 95%
selected_features = feature_importance[feature_importance["Cumulative Importance"] <= threshold]["Feature"]

Visualization of the Feature Importance:

In [None]:
# Horizontal bar chart of the random-forest importances. `plt` comes from the
# imports cell at the top of the notebook.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance["Feature"], feature_importance["Importance"], color="skyblue")
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("Feature Importance - Random Forest")
ax.invert_yaxis()  # Reverse order to show most important features on top
plt.show()