# In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Train, evaluate, and persist the multi-class Random Forest classifier.
#
# Flow:
#   1. Load the CSV and integer-encode the 'Category' label column.
#   2. Estimate generalisation with 5-fold stratified cross-validation and a
#      20% hold-out split. Both use a pipeline so the imputer, scaler and
#      model are fitted on training rows only (no leakage from held-out rows).
#   3. Fit the final imputer, scaler and model on the full dataset and save
#      the model, scaler and label encoder with joblib.

# Load your dataset
file_path = r"C:\Users\offic\Desktop\NextPractice\trojan-detection\backend\Obfuscated-MalMem2022.csv"
data = pd.read_csv(file_path)

# Preprocess the dataset for multi-class classification
# Encoding the 'Category' column to represent multiple classes (Ransomware, Trojan, etc.)
label_encoder = LabelEncoder()
data['Category'] = label_encoder.fit_transform(data['Category'])  # Encode categories to integers

# Save the label encoder to map predictions back to category names
label_encoder_path = r"C:\Users\offic\Desktop\NextPractice\trojan-detection\backend\label_encoder.pkl"
joblib.dump(label_encoder, label_encoder_path)

# Separate features and labels
X = data.drop(columns=['Category', 'Class'])  # Drop non-feature columns
y = data['Category']  # Use 'Category' for multi-class classification

# Best hyperparameters found during tuning
best_params = {
    'n_estimators': 100,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': 50,
    'bootstrap': True,
}

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
# Preprocessing and model are bundled so that every fit (each CV fold and the
# hold-out fit below) learns the imputation medians and scaling statistics
# from its own training rows only.
evaluation_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(**best_params, random_state=42)),
])

# Perform cross-validation (cross_val_score clones the pipeline for each fold)
cv = StratifiedKFold(n_splits=5)
cross_val_scores = cross_val_score(evaluation_pipeline, X, y, cv=cv, scoring='accuracy')
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {np.mean(cross_val_scores)}")

# Split into train and test for further evaluation.
# The pipeline is fitted on the training rows only, so the report below
# reflects performance on rows the evaluated model has never seen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
evaluation_pipeline.fit(X_train, y_train)
y_pred = evaluation_pipeline.predict(X_test)

# Pass the full list of encoded labels explicitly: without it,
# classification_report raises when a class is absent from the hold-out rows
# (label count would not match target_names), and the confusion matrix shape
# would depend on which classes happened to be sampled.
class_labels = np.arange(len(label_encoder.classes_))

# Evaluate the model
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=label_encoder.classes_,
    zero_division=0,  # same values as the default, without per-class warnings
))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix)

# ---------------------------------------------------------------------------
# Final model
# ---------------------------------------------------------------------------
# Handle missing values and scale the data, this time using every row, so the
# persisted scaler and model are trained on the complete dataset.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Initialize and train the Random Forest model for multi-class classification
rf_best_model = RandomForestClassifier(**best_params, random_state=42)
rf_best_model.fit(X_scaled, y)

# Save the retrained model
model_path = r"C:\Users\offic\Desktop\NextPractice\trojan-detection\backend\tuned_trojan_detection_rf_model.pkl"
joblib.dump(rf_best_model, model_path)

# Save the scaler for consistent preprocessing during predictions
# NOTE(review): the fitted imputer is not persisted by this script; if
# prediction-time inputs can contain missing values, the consumer needs the
# same medians -- confirm against the code that loads these artifacts.
scaler_path = r'C:\Users\offic\Desktop\NextPractice\trojan-detection\backend\scaler.pkl'
joblib.dump(scaler, scaler_path)

print(f"Model retrained with best hyperparameters and saved to {model_path}")
print(f"Scaler saved to {scaler_path}")
print(f"Label encoder saved to {label_encoder_path}")
