# In [None]:
# 1_data_preprocessing.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def load_and_preprocess_data(filepath, target_column="Performance Index",
                             test_size=0.2, random_state=42):
    """Load a CSV dataset, label-encode its text columns and split it.

    Args:
        filepath: Location of the CSV file, handed to ``pandas.read_csv``.
        target_column: Name of the column used as the prediction target.
            Defaults to ``"Performance Index"``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split``; defaults to 42
            so repeated calls produce the same split.

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns, where ``X`` is every
        column except the target and ``y`` is the target column (per the
        scikit-learn docs: X_train, X_test, y_train, y_test).

    Raises:
        KeyError: If ``target_column`` is not a column of the loaded data.
    """
    df = pd.read_csv(filepath)

    # Fail fast with a descriptive message instead of encoding every column
    # first and then hitting a generic KeyError from ``DataFrame.drop``.
    if target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in {filepath!r}; "
            f"available columns: {list(df.columns)}"
        )

    # Replace every object-dtype column with integer codes. Note that this
    # includes the target column when it is stored as text. A fresh encoder is
    # used per column so no fitted state is carried from one column to the next.
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = LabelEncoder().fit_transform(df[column])

    # Separate features from the target.
    X = df.drop(columns=[target_column])
    y = df[target_column]

    return train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

# In [None]:
# 2_model_training_and_evaluation.py

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def train_and_evaluate_models(X_train, X_test, y_train, y_test, models=None):
    """Fit each model on the training split and score it on the test split.

    Args:
        X_train: Training features, passed to each model's ``fit``.
        X_test: Test features, passed to each model's ``predict``.
        y_train: Training targets, passed to each model's ``fit``.
        y_test: Test targets the predictions are compared against.
        models: Optional mapping of display name -> unfitted estimator
            exposing ``fit`` and ``predict``. When omitted, a
            ``LogisticRegression(max_iter=1000)`` and a
            ``RandomForestClassifier(n_estimators=100, random_state=42)``
            are used. Supplied estimators are fitted in place.

    Returns:
        dict mapping each model name to a dict with the keys ``"accuracy"``
        (result of ``accuracy_score``), ``"report"`` (result of
        ``classification_report(..., output_dict=True)``) and ``"model"``
        (the fitted estimator).
    """
    # NOTE(review): both default estimators are classifiers and accuracy is a
    # classification metric - confirm the target column holds discrete labels.
    if models is None:
        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Random Forest": RandomForestClassifier(
                n_estimators=100, random_state=42
            ),
        }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = {
            "accuracy": accuracy_score(y_test, y_pred),
            "report": classification_report(y_test, y_pred, output_dict=True),
            "model": model,
        }

    return results


# In [None]:
# 3_visualizations.py

import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_importance(model, feature_names):
    """Show a bar chart of a model's feature importances.

    Importances go on the x-axis and the matching feature names on the
    y-axis, arranged in the order produced by ``argsort()`` on the
    importances. Models without a ``feature_importances_`` attribute are
    skipped: nothing is drawn and the function returns ``None``.

    Args:
        model: Object that may expose ``feature_importances_``.
        feature_names: Indexable collection of names; position ``i`` labels
            importance ``i``.
    """
    # Guard clause: nothing to plot for models without importances.
    if not hasattr(model, "feature_importances_"):
        return

    scores = model.feature_importances_
    order = scores.argsort()
    ranked_scores = scores[order]
    ranked_labels = [feature_names[pos] for pos in order]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=ranked_scores, y=ranked_labels)
    plt.title("Feature Importances")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.tight_layout()
    plt.show()

def plot_accuracy_comparison(results):
    """Show a bar chart comparing the accuracy of each evaluated model.

    Args:
        results: Mapping of model name -> dict containing an ``"accuracy"``
            entry. Bars are drawn in the mapping's iteration order.
    """
    labels = list(results)
    scores = [entry["accuracy"] for entry in results.values()]

    plt.figure(figsize=(8, 5))
    # NOTE(review): recent seaborn releases reportedly warn when ``palette``
    # is given without ``hue`` - confirm against the installed version.
    sns.barplot(x=labels, y=scores, palette="Set2")
    plt.title("Model Accuracy Comparison")
    plt.ylabel("Accuracy")
    plt.ylim(0, 1)  # y-axis fixed to the 0-1 range
    plt.tight_layout()
    plt.show()