# **Machine Learning Model Comparison Framework**
This notebook provides a framework for comparing machine learning models (Logistic Regression, GLM, and Decision Tree) for classification problems. It includes data preparation, model training and evaluation, and visualizations.


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, accuracy_score, matthews_corrcoef, roc_curve
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
%matplotlib inline


## **Load and Prepare Data**
Here, we load the dataset and prepare it for modeling.


In [None]:
# Function: Read a CSV file into a DataFrame
def load_dataset(filepath):
    """Return the contents of the CSV file at ``filepath`` as a DataFrame.

    ``filepath`` is handed to ``pandas.read_csv`` unchanged and with default
    parsing options.
    """
    frame = pd.read_csv(filepath)
    return frame

# Function: Preprocess data
def preprocess_data(df, target_column):
    """Split ``df`` into a transformed feature matrix and a target vector.

    Feature columns of dtype ``float64``/``int64`` are standardized with
    ``StandardScaler``; feature columns of dtype ``object``/``category`` are
    one-hot encoded with ``OneHotEncoder(handle_unknown='ignore')``. Feature
    columns of any other dtype are not handed to a transformer and are left
    out of the result (``ColumnTransformer`` defaults to ``remainder='drop'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the feature columns and the target column.
    target_column : str
        Name of the column to predict. It may have any dtype.

    Returns
    -------
    X
        Result of ``ColumnTransformer.fit_transform`` on the feature columns.
    y : pandas.Series
        The unmodified ``target_column`` of ``df``.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    # Separate the target first so the dtype-based selection below only sees
    # feature columns. Selecting on the full frame and dropping the target
    # from the numeric list afterwards raised KeyError for non-numeric
    # targets and would have listed a categorical target as a feature.
    features = df.drop(columns=[target_column])
    y = df[target_column]

    numeric_features = features.select_dtypes(include=['float64', 'int64']).columns
    categorical_features = features.select_dtypes(include=['object', 'category']).columns

    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # NOTE: the preprocessor is fitted on every row of ``df``; any train/test
    # split performed by the caller happens after this step.
    X = preprocessor.fit_transform(features)
    return X, y

# Example: load the CSV file and build the feature matrix and target vector.
# X and y defined here are used by the model cells that follow.
dataset_path = "your_dataset.csv"  # Placeholder: replace with the path to your CSV file
target_column = "target"  # Name of the column to predict
df = load_dataset(dataset_path)
X, y = preprocess_data(df, target_column)


## **Dummy Model as Baseline**
The dummy model serves as a baseline and makes predictions based on simple rules (e.g., most frequent class).


In [None]:
# Evaluate dummy model
# The baseline is fitted on the training part and scored on the held-out part
# of a 70/30 split with random_state=42, i.e. the same split parameters that
# train_and_evaluate uses by default. Scoring it on the rows it was fitted on
# would give a number that cannot be compared with the hold-out results of
# the real models.
X_train_dummy, X_test_dummy, y_train_dummy, y_test_dummy = train_test_split(
    X, y, test_size=0.3, random_state=42)
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train_dummy, y_train_dummy)
dummy_accuracy = dummy_clf.score(X_test_dummy, y_test_dummy)
print("Dummy accuracy:", dummy_accuracy)


## **Train and Evaluate Models**
We train and evaluate multiple models (Logistic Regression, Decision Tree).


In [None]:
# Function: Train and evaluate model
def train_and_evaluate(X, y, model, name, *, test_size=0.3, random_state=42):
    """Fit ``model`` on a train split and report its hold-out performance.

    The data is split with ``train_test_split``, ``model`` is fitted in place
    on the training part, and a classification report for the test part is
    printed under the heading ``name``.

    Parameters
    ----------
    X, y
        Feature matrix and target vector, passed to ``train_test_split``.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    name : str
        Label used in the printed report heading.
    test_size : float, optional
        Share of the data held out for testing (default ``0.3``).
    random_state : int, optional
        Seed for the split, so repeated calls evaluate on the same rows
        (default ``42``).

    Returns
    -------
    The confusion matrix of the test-split predictions, as returned by
    ``sklearn.metrics.confusion_matrix``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name} - Report:\n", classification_report(y_test, y_pred))
    return confusion_matrix(y_test, y_pred)

# Define models
models = [
    (LogisticRegression(max_iter=1000), "Logistic Regression"),
    # Seeded so that repeated runs of the notebook grow the same tree.
    (DecisionTreeClassifier(random_state=42), "Decision Tree"),
]

# Evaluate models
# Keep one confusion matrix per model, keyed by the model's display name, so
# every result stays available for plotting. ``conf_matrix`` is still bound
# to the most recently evaluated model's matrix, as before.
conf_matrices = {}
for model, name in models:
    conf_matrix = train_and_evaluate(X, y, model, name)
    conf_matrices[name] = conf_matrix


## **Visualization: Confusion Matrix**
Here, we visualize the confusion matrix to evaluate the performance of the models.


In [None]:
# Function: Plot confusion matrix
def plot_confusion_matrix(conf_matrix, title, labels=None):
    """Draw ``conf_matrix`` as an annotated heatmap and show the figure.

    Parameters
    ----------
    conf_matrix
        Square matrix of integer counts (cells are formatted with ``'d'``);
        rows are labelled "Actual" and columns "Predicted".
    title : str
        Title of the plot.
    labels : list of str, optional
        Tick labels for both axes, one per class. When omitted, a matrix
        with two rows is labelled ``['Negative', 'Positive']``; any other
        size uses seaborn's automatic tick labels, because the two binary
        labels only describe a 2x2 matrix.
    """
    if labels is None:
        labels = ['Negative', 'Positive'] if len(conf_matrix) == 2 else 'auto'

    plt.figure(figsize=(6, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
