# 1- Read the dataset

In [None]:
# Import necessary libraries for data manipulation and preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import pandas as pd

# --- Load -------------------------------------------------------------------
# Features come from "feature.csv"; labels are the first column of
# "class_labels.csv", flattened to a 1-D numpy array so scikit-learn
# estimators accept them as a target vector.
x_train = pd.read_csv("feature.csv")
y_train = pd.read_csv("class_labels.csv").iloc[:, 0].values.ravel()

# --- Feature selection --------------------------------------------------------
# VarianceThreshold is used with its default threshold. The fitted selector is
# kept in `selector` so the same column mask can be applied to other data.
selector = VarianceThreshold().fit(x_train)
x_train = selector.transform(x_train)

# --- Feature scaling ----------------------------------------------------------
# Standardize the surviving columns; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

# --- Feature normalization ----------------------------------------------------
# Rescale every sample (row) to unit L2 norm.
x_train = preprocessing.normalize(x_train, norm="l2")

# Report the shapes of the processed labels and features, one per line.
print(y_train.shape, x_train.shape, sep="\n")


# 2- Random Forest algorithm + GridSearch

In [None]:
# Import necessary libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    KFold,
    cross_val_score,
    cross_val_predict,
    cross_validate,
)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, make_scorer, precision_score, f1_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Create a pipeline with three steps: scaling, feature selection, and classification
pipeline = Pipeline([
    # Standardize the features before scoring/selecting them.
    ('scaler', StandardScaler()),
    # Select the best features by ANOVA F-test. `k` is not set here, so
    # SelectKBest falls back to its library default.
    ('feature_selection', SelectKBest(score_func=f_classif)),
    # n_estimators=100 is only a starting value; the grid below overrides it.
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=1)),
])

# Define a grid of parameters for the grid search
param_grid = {
    'classifier__n_estimators': [100, 200],  # Number of trees in the random forest
    'classifier__max_depth': [5, 10, 20],  # Maximum depth of the trees
}

# Configure a KFold cross-validation strategy. The fixed random_state makes
# the splitter produce the same folds every time it is reused below.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Set up GridSearchCV to find the best parameters within the defined grid
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=cv, verbose=2, n_jobs=-1)

# Fit the grid search object to find the best model
grid_search.fit(x_train, y_train)

# Print the best parameters found by the grid search
print(f"Best parameters found: {grid_search.best_params_}")
best_clf = grid_search.best_estimator_  # Extract the best estimator

# Evaluate the best model using cross-validation for different metrics.
#
# All metrics are computed in ONE cross_validate pass (5 fits) instead of one
# cross_val_score call per metric (4 x 5 fits of the same pipeline on the same
# folds). Because `cv` and the classifier are both seeded, the per-fold scores
# are the same as those produced by separate calls.
#
# NOTE(review): these scores reuse the very folds that were used to pick the
# hyper-parameters, so they are optimistically biased; use nested
# cross-validation or a held-out test set for an unbiased estimate.
scoring_metric = 'accuracy'
recall_scorer = make_scorer(recall_score, average='macro')
precision_scorer = make_scorer(precision_score, average='macro')
f1_scorer = make_scorer(f1_score, average='macro')

# Keys double as the metric names shown in the printed report.
scoring = {
    'accuracy': scoring_metric,
    'recall': recall_scorer,
    'precision_score': precision_scorer,
    'f1_score': f1_scorer,
}
cv_results = cross_validate(best_clf, x_train, y_train, cv=cv, scoring=scoring)

# (banner title, metric name) in the order the report is printed.
report_sections = (
    ('Accuracy', 'accuracy'),
    ('Recall', 'recall'),
    ('Precision Score', 'precision_score'),
    ('F1 Score', 'f1_score'),
)
for banner, metric_name in report_sections:
    # cross_validate stores each metric's per-fold scores under "test_<name>".
    cv_scores = cv_results[f"test_{metric_name}"]
    print(f"\n\n************** {banner} **************\n")
    print(f"CV Scores ({metric_name}): {cv_scores}")
    print(f"Mean CV Score ({metric_name}): {np.mean(cv_scores)}")
    print(f"Standard Deviation of CV Score ({metric_name}): {np.std(cv_scores)}")

# Visualize the confusion matrix
print("\n\n************** Confusion Matrix **************\n")
# Out-of-fold predictions: every sample is predicted by a model that did not
# see it during fitting.
y_pred = cross_val_predict(best_clf, x_train, y_train, cv=cv)
# Pin the row/column order to the sorted class labels so the heatmap ticks can
# show the real class labels rather than positional indices 0..n-1.
class_labels = np.unique(y_train)
cm = confusion_matrix(y_train, y_pred, labels=class_labels)  # Compute the confusion matrix
plt.figure(figsize=(5, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)  # Plot the confusion matrix
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()
