In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Step 1: Read the raw data from disk
df = pd.read_csv('your_dataset.csv')  # Replace with your dataset file name

# Preview the first five rows
print("Dataset Head:", df.head(), sep="\n")

# Step 2: Report how many values are missing in each column
print("Missing Values:", df.isna().sum(), sep="\n")

# Clean-up: remove exact duplicate rows first, then every row that still
# contains a missing value (use df.fillna() instead to impute rather than drop)
df = df.drop_duplicates().dropna()

# Step 3: Encode categorical variables if any
# The target column is kept apart from the feature columns throughout this
# section so that it is never standardised: scaling would replace the class
# labels with standardised floats, which scikit-learn classifiers generally
# reject as a continuous target.
target_col = 'target'  # Replace 'target' with your actual target column name
feature_cols = df.columns.drop(target_col)

# One encoder per categorical feature, stored by column name so that every
# mapping stays available (a single shared encoder only remembers the last fit).
feature_encoders = {}
categorical_cols = df[feature_cols].select_dtypes(include=['object']).columns
for col in categorical_cols:
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# `label_encoder` is dedicated to the target; it is only fitted when the
# target is stored as strings/objects, and can then decode predictions via
# label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
if df[target_col].dtype == object:
    df[target_col] = label_encoder.fit_transform(df[target_col])

# Step 4: Feature scaling (if necessary)
# Only feature columns are scaled. All numeric dtypes are selected (not just
# int64/float64) so integer columns of another width are not silently skipped.
# NOTE: the scaler is fitted on every row, i.e. before the train/test split
# performed later in this script.
scaler = StandardScaler()
numerical_cols = df[feature_cols].select_dtypes(include=['number']).columns
if len(numerical_cols) > 0:  # StandardScaler cannot be fitted on zero columns
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Step 5: Separate features and target variable
X = df.drop(target_col, axis=1)
y = df[target_col]

# Step 6: Split the dataset into training and testing sets
# The split is done BEFORE feature selection so that the selector never sees
# the test labels; selecting features on the full dataset would leak test
# information and make the reported test metrics optimistic.
# The row assignment depends only on y, test_size and random_state, so it does
# not matter which columns are present at split time.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Step 7: Feature selection using SelectKBest (fitted on the training rows only)
# Number of top features to select; clamped so that it never exceeds the number
# of available feature columns.
k = min(10, X.shape[1])
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_train_full, y_train)

# Get selected feature indices and names
cols = selector.get_support(indices=True)
selected_features = X.columns[cols]
print("Selected Features:", selected_features)

# Restrict the full feature matrix and both splits to the selected features
X_selected = X[selected_features]
X_train = X_train_full[selected_features]
X_test = X_test_full[selected_features]

print(f'Training set size: {X_train.shape}')
print(f'Testing set size: {X_test.shape}')

# Step 8: Build the baseline Random Forest (100 trees, fixed seed)
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Steps 9-10: Fit on the training split, then predict the held-out test split
# (fit() returns the estimator itself, so the two calls can be chained)
y_pred = rf_classifier.fit(X_train, y_train).predict(X_test)

# Step 11: Score the baseline predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Draw the confusion matrix as an annotated heatmap
# (rows = actual classes, columns = predicted classes)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Step 12: Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    # 'auto' is intentionally not listed: it was deprecated in scikit-learn 1.1
    # and removed in 1.3 (every candidate using it fails to fit), and on older
    # versions it was merely an alias of 'sqrt' for classifiers.
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5],
    'criterion': ['gini', 'entropy']
}

# 5-fold cross-validated exhaustive search over param_grid, scored by
# accuracy and run on all available CPU cores. GridSearchCV clones the
# estimator, so the already-fitted baseline model is not modified.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=2
)

# Fit the grid search to the training data only
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Retrieve the best model. No extra training is needed here: with the default
# refit=True, GridSearchCV has already refitted it on the whole training set.
best_rf_classifier = grid_search.best_estimator_

# Make predictions with the optimized model
y_pred_best = best_rf_classifier.predict(X_test)

# Calculate accuracy of the optimized model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Optimized Accuracy: {accuracy_best:.4f}')
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred_best))

# Confusion matrix for the optimized model
cm_best = confusion_matrix(y_test, y_pred_best)

# Visualize the confusion matrix for the optimized model
# (rows = actual classes, columns = predicted classes)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_best, annot=True, fmt='d', cmap='Greens')
plt.title('Optimized Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Step 13: Persist the tuned model to disk so it can be reloaded with joblib.load()
model_path = 'best_random_forest_model.pkl'
joblib.dump(best_rf_classifier, model_path)

print(f"Model saved as {model_path}")
