# Baseline Model

## Table of Contents
1. [Model Choice](#model-choice)
2. [Feature Selection](#feature-selection)
3. [Implementation](#implementation)
4. [Evaluation](#evaluation)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
# Import your chosen baseline model
# Example: from sklearn.linear_model import LogisticRegression


## Model Choice

[Explain why you've chosen a particular model as the baseline. This could be a simple statistical model or a basic machine learning model. Justify your choice.]


## Feature Selection

[Indicate which features from the dataset you will be using for the baseline model, and justify your selection.]

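--> Using DP, ALTV, ASTV, Mean, AC, Variance, LB and MSTV, as these are the features with the highest discriminatory power (Mode and Median also rank in the top 10 but are dropped because they are highly correlated with Mean).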


In [None]:
%pip install ucimlrepo

# Load the dataset
import pandas as pd
from ucimlrepo import fetch_ucirepo

# fetch dataset
df = fetch_ucirepo(id=193)

# data (as pandas dataframes)
X = df.data.features
y = df.data.targets

# drop the first column of the y target variable
y = y.iloc[:, 1] # this is the NSP column (Normal, suspect, pathologic)


# Feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

#apply SelectKBest class to extract top 10 best features --> highest discriminatory power
bestfeatures = SelectKBest(score_func=f_classif, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)


# Combine scores and column names into a single DataFrame
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']

# Select the top 10 features
X_selected = X[featureScores.nlargest(10, 'Score')['Specs']]
# selected features with highest discriminatory power are : DP  ALTV ASTV  Mean  Mode  Median  AC  Variance  LB  MSTV

# Drop the Mode and Median columns form the selected features due to hight correlation with Mean
X_selected = X_selected.drop(['Mode', 'Median'], axis=1)



# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

## Implementation





### Logistic Regression

In [None]:
# Baseline: multinomial logistic regression on the selected features.
# The test fold is held out BEFORE scaling and oversampling so that neither the
# scaler statistics nor the synthetic SMOTE samples are derived from test rows.

# Import necessary libraries
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Step 1: Hold out the test set first (stratified, so every class is represented).
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: Scale the features. The scaler is fit on the training fold only and
# then applied unchanged to the test fold.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Step 3: Apply SMOTE to balance the training fold only; the test fold keeps
# its real, imbalanced class distribution.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Step 4: Initialize logistic regression model.
# `multi_class` is not passed: it is deprecated in recent scikit-learn releases
# and the lbfgs solver already fits a multinomial model for 3 classes.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

# Step 5: Train the model
model.fit(X_train, y_train)

# Step 6: Make predictions
y_pred = model.predict(X_test)

# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy after applying SMOTE:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 8: Plot the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
custom_labels = ['Normal=1', 'Suspect=2', 'Pathologic=3']

disp = ConfusionMatrixDisplay(conf_matrix, display_labels=custom_labels)
disp.plot(cmap="viridis")

# Imports for a simple multiclass classification model using tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense







### Logistic regression with SMOTE

In [None]:
# Logistic regression trained on a SMOTE-balanced training fold.
# The split happens before scaling/oversampling so the test fold contains only
# real samples and contributes nothing to the scaler or to SMOTE.

# Import necessary libraries
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Train-test split on the original (un-resampled) data, stratified by class
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features: fit on the training fold, reuse the same transform for test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Apply SMOTE to balance the training fold only
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Initialize logistic regression model.
# `multi_class` is not passed: it is deprecated in recent scikit-learn releases
# and the lbfgs solver already fits a multinomial model for 3 classes.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)



### Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Hyper-parameters of the Support Vector Machine baseline.
svm_params = {
    "kernel": 'linear',      # other kernels such as 'rbf' or 'poly' can be tried here
    "probability": True,     # needed for predict_proba (used by the ROC curves)
    "random_state": 42,
}

# Build the classifier and fit it on the current training fold.
svm_model = SVC(**svm_params)
svm_model.fit(X_train, y_train)

## Evaluation

[Clearly state what metrics you will use to evaluate the model's performance. These metrics will serve as a starting point for evaluating more complex models later on.]



#### Metrics to Evaluate the Model

### Primary Metrics
- **Recall**: Ensure no pathological cases are missed (focus on sensitivity).
- **F1-Score**: Balances Precision and Recall for each class.
- **Confusion Matrix**: Analyze detailed classification errors.

### Secondary Metrics
- **Accuracy**: Provides an overall performance snapshot but is less reliable for imbalanced datasets.
- **Precision**: Evaluate the proportion of correct positive predictions to avoid excessive false alarms.

### Advanced Metric
- **ROC-AUC**: Measure the model’s ability to distinguish between classes, useful for comparing models or optimizing thresholds.


In [None]:
# Evaluate the baseline model
# Example for a classification problem
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)

# For a regression problem, you might use:
# mse = mean_squared_error(y_test, y_pred)

# Your evaluation code here


### Evaluation logistic regression

In [None]:
# Predict the test fold with the logistic regression model and score the result.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy first, then the per-class precision / recall / F1 report.
print("Model accuracy after applying SMOTE:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")



In [None]:
# Tabulate and visualise the logistic regression classification report.
from sklearn.metrics import classification_report
# pyplot must be imported here: no earlier cell brings `plt` into scope, so the
# plotting calls below would otherwise fail on a fresh kernel.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Extract metrics from classification report (one row per class plus summary rows)
report = classification_report(y_test, y_pred, target_names=["Normal", "Suspect", "Pathologic"], output_dict=True)
df_report = pd.DataFrame(report).transpose()

# Display as a table
print("\nClassification Report as Table:")
print(df_report)

# Heatmap of precision / recall / f1-score per class.
# iloc[:-3, :3] drops the last three (summary) rows and the trailing 'support' column.
plt.figure(figsize=(10, 6))
sns.heatmap(df_report.iloc[:-3, :3], annot=True, cmap="Blues", fmt=".2f", linewidths=0.5)
plt.title("Precision, Recall, and F1-Score (Heatmap Table)")
plt.xlabel("Metrics")
plt.ylabel("Classes")
plt.show()

# Plot the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=model.classes_)
disp.plot(cmap="viridis")


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
# pyplot must be imported here: no earlier cell brings `plt` into scope.
import matplotlib.pyplot as plt
import numpy as np

# One-vs-rest ROC curve and AUC for each class of the logistic regression model.
# Column i of predict_proba corresponds to model.classes_[i].
y_proba = model.predict_proba(X_test)
plt.figure(figsize=(10, 8))

for i, class_label in enumerate(model.classes_):
    # Binary target: "is this sample of the current class?"
    is_class = y_test == class_label
    fpr, tpr, _ = roc_curve(is_class, y_proba[:, i])
    auc = roc_auc_score(is_class, y_proba[:, i])
    plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {auc:.2f})")

# Add diagonal line for random guess
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")

plt.title("ROC Curves for Each Class")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()


### Evaluation Support Vector Machine

In [None]:
# Evaluate the SVM on the test fold: accuracy, text report, report table,
# metric heatmap and confusion matrices.
import matplotlib.pyplot as plt
import seaborn as sns

# Predictions and overall accuracy
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1 as plain text
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix with default tick labels
conf_matrix = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
disp.plot()

# Same report as a DataFrame (one row per class plus summary rows)
report_svm = classification_report(
    y_test,
    y_pred,
    target_names=["Normal", "Suspect", "Pathologic"],
    output_dict=True,
)
df_report_svm = pd.DataFrame(report_svm).T

print("\nClassification Report as Table (SVM):")
print(df_report_svm)

# Heatmap of the first three metric columns, without the last three summary rows
per_class_metrics = df_report_svm.iloc[:-3, :3]
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(per_class_metrics, annot=True, cmap="Blues", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Precision, Recall, and F1-Score (SVM Heatmap Table)")
ax.set_xlabel("Metrics")
ax.set_ylabel("Classes")
plt.show()

# Confusion matrix again, this time labelled with the classes known to the SVM
conf_matrix_svm = confusion_matrix(y_test, y_pred)
disp_svm = ConfusionMatrixDisplay(conf_matrix_svm, display_labels=svm_model.classes_)
disp_svm.plot(cmap="viridis")
disp_svm.ax_.set_title("Confusion Matrix (SVM)")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np

# One-vs-rest ROC curves for the SVM; only possible when the model was built
# with probability estimates enabled.
if not svm_model.probability:
    print("Probability estimates are not available. Set `probability=True` when initializing the SVM model.")
else:
    y_proba_svm = svm_model.predict_proba(X_test)
    plt.figure(figsize=(10, 8))

    # Walk over the classes together with their probability column.
    for class_label, class_scores in zip(svm_model.classes_, y_proba_svm.T):
        is_class = y_test == class_label
        fpr, tpr, _ = roc_curve(is_class, class_scores)
        auc = roc_auc_score(is_class, class_scores)
        plt.plot(fpr, tpr, label=f"Class {class_label} (AUC = {auc:.2f})")

    # Chance-level reference line
    plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")

    plt.title("ROC Curves for Each Class (SVM)")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.show()


## Comparing logistic regression and support vector machine

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Side-by-side bar charts of per-class precision, recall and F1-score for the
# logistic regression and SVM models.

# Build the comparison table from the two classification-report tables created
# in the evaluation cells. iloc[:-3, :3] drops the last three (summary) rows and
# the trailing 'support' column; the suffixes yield columns such as
# 'precision_LogReg' and 'f1-score_SVM', which the loop below looks up.
comparison_df_plot = (
    df_report.iloc[:-3, :3].add_suffix("_LogReg")
    .join(df_report_svm.iloc[:-3, :3].add_suffix("_SVM"))
)

# Prepare data for plotting
metrics = ["Precision", "Recall", "F1-Score"]
classes = comparison_df_plot.index
x = np.arange(len(classes))  # Label locations
width = 0.35  # Width of the bars

# Plot each metric separately, one subplot per metric with a shared y-axis
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

for i, metric in enumerate(metrics):
    logreg_values = comparison_df_plot[f"{metric.lower()}_LogReg"]
    svm_values = comparison_df_plot[f"{metric.lower()}_SVM"]

    ax = axes[i]
    # Two bars per class, centred on the class tick
    ax.bar(x - width / 2, logreg_values, width, label='Logistic Regression')
    ax.bar(x + width / 2, svm_values, width, label='SVM')

    ax.set_title(metric)
    ax.set_xticks(x)
    ax.set_xticklabels(classes, rotation=45, ha='right')
    ax.set_ylabel("Score")
    ax.set_ylim(0, 1)
    ax.legend()

fig.suptitle("Comparison of Metrics: Logistic Regression vs. SVM", fontsize=16)
# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

