# Model Comparison

In [23]:
# Modelling
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn. preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import pandas as pd

In [24]:
data =pd.read_csv('preprocessed_data.csv')

In [25]:
# Create an Evaluate Function to give all metrics after model Training
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')  # or 'micro', 'macro', etc.
    recall = recall_score(true, predicted, average='weighted')  # or 'micro', 'macro', etc.
    f1 = f1_score(true, predicted, average='weighted')  # or 'micro', 'macro', etc.
    return accuracy, precision, recall, f1

In [26]:
# Define your classification models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Support Vector Classifier": SVC()
}


In [27]:
from sklearn.model_selection import train_test_split

# Define features and target
x = data.drop(columns=['PerformanceRating'], axis=1)  # Replace with your actual target column
y = data['PerformanceRating']

# Now split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [30]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

results = []

# Iterate through the models and train/evaluate them
for model_name, model in models.items():
    # Use scaled data for models that require it
    if name in ["Logistic Regression", "Support Vector Classifier"]:
        model.fit(x_train_scaled, y_train)
        y_train_pred = model.predict(x_train_scaled)
        y_test_pred = model.predict(x_test_scaled)
    else:
        model.fit(x_train, y_train)
        y_train_pred = model.predict(x_train)
        y_test_pred = model.predict(x_test)

    # Evaluate Train and Test dataset
    accuracy_train, precision_train, recall_train, f1_train = evaluate_model(y_train, y_train_pred)
    accuracy_test, precision_test, recall_test, f1_test = evaluate_model(y_test, y_test_pred)



    print(model_name)
    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(accuracy_train))
    print("- Precision: {:.4f}".format(precision_train))
    print("- Recall: {:.4f}".format(recall_train))
    print("- F1 Score: {:.4f}".format(f1_train))
    print('----------------------------------')
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(accuracy_test))
    print("- Precision: {:.4f}".format(precision_test))
    print("- Recall: {:.4f}".format(recall_test))
    print("- F1 Score: {:.4f}".format(f1_test))
    print('=' * 35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 0.7240
- Precision: 0.8153
- Recall: 0.7240
- F1 Score: 0.7416
----------------------------------
Model performance for Test set
- Accuracy: 0.7042
- Precision: 0.8141
- Recall: 0.7042
- F1 Score: 0.7303


Decision Tree Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.8958
- Precision: 0.8996
- Recall: 0.8958
- F1 Score: 0.8969


Random Forest Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.9375
- Precision: 0.9402
- Recall: 0.9375
- F1 Score: 0.9367


CatBoosting Classifier
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- F1 Score: 1.0000
----------------------------------
Mod

Based on the test set performance metrics, both the Random Forest Classifier and the CatBoosting Classifier seem to perform very well. They have high accuracy and F1 scores, indicating good balance between precision and recall. The Decision Tree Classifier also performs well but not as strongly as the other two.

# Random Forest Classifier

In [31]:
# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  

# Fit the model on the training data
rf_classifier.fit(x_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(x_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the performance metrics
print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))

Accuracy: 0.9333
Precision: 0.9366
Recall: 0.9333
F1 Score: 0.9324
