# DS 552 - Generative AI
## Homework 2 - Kevin Metzler

## Question 1

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv"
penguins = pd.read_csv(url)

# Keep only Adelie and Gentoo rows and drop rows with missing values.
# The selection is built as one chain ending in .copy() so that `penguins`
# is an independent DataFrame: modifying a filtered frame in place
# (dropna(inplace=True), column assignment) can raise pandas'
# SettingWithCopyWarning.
penguins = (
    penguins[penguins['species'].isin(['Adelie', 'Gentoo'])]
    .dropna()
    .copy()
)

# Encode the species column (Adelie -> 0, Gentoo -> 1)
penguins['species'] = penguins['species'].map({'Adelie': 0, 'Gentoo': 1})

# Define features and target
X = penguins.drop(columns=['species'])
y = penguins['species']

# One-hot encode categorical features; drop_first avoids a redundant
# dummy column per categorical variable
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets (80% / 20%, fixed seed
# so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predict and evaluate Naive Bayes model
y_pred_nb = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

# Initialize and train Logistic Regression model
# (max_iter raised to 1000 for the unscaled features)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict and evaluate Logistic Regression model
y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

# Print the results
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:\n", nb_report)
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:\n", lr_report)

In [None]:
# Predict on the training rows with each fitted classifier
y_train_pred_nb = nb_model.predict(X_train)
y_train_pred_lr = lr_model.predict(X_train)

# Training accuracy, to set next to the test accuracy computed earlier
nb_train_accuracy = accuracy_score(y_train, y_train_pred_nb)
lr_train_accuracy = accuracy_score(y_train, y_train_pred_lr)

# Report training and test accuracy side by side for both models
for label, score in (
    ("Naive Bayes Training Accuracy:", nb_train_accuracy),
    ("Naive Bayes Test Accuracy:", nb_accuracy),
    ("Logistic Regression Training Accuracy:", lr_train_accuracy),
    ("Logistic Regression Test Accuracy:", lr_accuracy),
):
    print(label, score)

# Pick the message describing which model wins on test accuracy
if nb_accuracy > lr_accuracy:
    verdict = "Naive Bayes model performs better on the test dataset."
elif nb_accuracy < lr_accuracy:
    verdict = "Logistic Regression model performs better on the test dataset."
else:
    verdict = "Both models perform equally well on the test dataset."
print(verdict)

### Accuracy
Both the Naive Bayes and Logistic Regression models achieved an accuracy of 1.0 on the test dataset. This indicates that both models are highly effective in classifying the two penguin species.

### Conclusion
Both the Naive Bayes and Logistic Regression models perform equally well in classifying the two penguin species based on the accuracy metric. Given that both models achieve perfect accuracy, there is no clear winner in terms of distinguishing between the two species. 

## Question 2

In [None]:
from sklearn.metrics import roc_auc_score

# Positive-class probability scores from each fitted model, on both splits
nb_train_scores = nb_model.predict_proba(X_train)[:, 1]
nb_test_scores = nb_model.predict_proba(X_test)[:, 1]
lr_train_scores = lr_model.predict_proba(X_train)[:, 1]
lr_test_scores = lr_model.predict_proba(X_test)[:, 1]

# Area under the ROC curve for Naive Bayes (training, then test)
nb_train_auc = roc_auc_score(y_train, nb_train_scores)
nb_test_auc = roc_auc_score(y_test, nb_test_scores)

# Area under the ROC curve for Logistic Regression (training, then test)
lr_train_auc = roc_auc_score(y_train, lr_train_scores)
lr_test_auc = roc_auc_score(y_test, lr_test_scores)

# Report the four AUC values
for label, auc_value in (
    ("Naive Bayes Training AUC:", nb_train_auc),
    ("Naive Bayes Test AUC:", nb_test_auc),
    ("Logistic Regression Training AUC:", lr_train_auc),
    ("Logistic Regression Test AUC:", lr_test_auc),
):
    print(label, auc_value)

# Summarise which model has the higher test AUC
if nb_test_auc > lr_test_auc:
    auc_verdict = "Naive Bayes model is more effective based on the AUC metric."
elif nb_test_auc < lr_test_auc:
    auc_verdict = "Logistic Regression model is more effective based on the AUC metric."
else:
    auc_verdict = "Both models are equally effective based on the AUC metric."
print(auc_verdict)

### Naive Bayes Model
- **Training AUC:** 1.0
- **Test AUC:** 1.0

### Logistic Regression Model
- **Training AUC:** 1.0
- **Test AUC:** 1.0

### Insights
Both the Naive Bayes and Logistic Regression models have achieved perfect AUC scores of 1.0 on both the training and test datasets. This indicates that both models are excellent at discriminating between the two penguin species. 

## Question 3

In [None]:
import numpy as np

import matplotlib.pyplot as plt

# Calculate predicted probabilities for both models
nb_probs = nb_model.predict_proba(X_test)[:, 1]
lr_probs = lr_model.predict_proba(X_test)[:, 1]

# Create deciles
# pd.qcut raises "Bin edges must be unique" when many scores are identical
# (e.g. a model that outputs exactly 0.0 or 1.0 for most rows). Binning the
# ranks instead of the raw scores always yields ten near-equal-sized groups;
# method='first' breaks ties by position so every rank is distinct. Label 0
# is the lowest-scored group and label 9 the highest-scored one, exactly as
# with qcut on the raw scores when no ties exist.
nb_deciles = pd.qcut(pd.Series(nb_probs).rank(method='first'), 10, labels=False).to_numpy()
lr_deciles = pd.qcut(pd.Series(lr_probs).rank(method='first'), 10, labels=False).to_numpy()

# Calculate Lift and Gain for each decile
def calculate_lift_gain(y_true, probs, deciles):
    """Compute cumulative lift and gain at the end of each score decile.

    Observations are walked from the best-scored decile to the worst-scored
    one. After each decile, gain is the share of all positives captured so
    far and lift is that gain divided by the share of observations covered
    so far.

    Args:
        y_true: Binary labels (1 = positive), one per observation.
        probs: Predicted positive-class scores, matched to ``y_true`` by
            position (any pandas index on the inputs is ignored).
        deciles: Bin label per observation, e.g. from
            ``pd.qcut(..., labels=False)``.

    Returns:
        Tuple ``(lift, gain)`` of NumPy arrays with one entry per distinct
        decile, ordered from the best-scored decile to the worst-scored
        one, so gain is non-decreasing and ends at 1.0. If ``y_true``
        contains no positives the ratios are undefined and come back as NaN.
    """
    # Convert to plain arrays so the three inputs are matched by position
    # rather than aligned on a pandas index.
    data = pd.DataFrame({
        'y_true': np.asarray(y_true),
        'probs': np.asarray(probs),
        'deciles': np.asarray(deciles),
    })

    # Order deciles by their mean score (best first) rather than by label.
    # Sorting on the decile before the individual score keeps every decile's
    # rows contiguous even when scores are tied across deciles; the label
    # only breaks ties between deciles with identical mean scores.
    data['decile_score'] = data.groupby('deciles')['probs'].transform('mean')
    data = data.sort_values(
        by=['decile_score', 'deciles', 'probs'], ascending=False
    )

    total_positives = data['y_true'].sum()
    data['cumulative_positives'] = data['y_true'].cumsum()
    data['cumulative_total'] = np.arange(1, len(data) + 1)

    data['gain'] = data['cumulative_positives'] / total_positives
    data['lift'] = data['gain'] / (data['cumulative_total'] / len(data))

    # sort=False keeps the best-first order established above; the default
    # would re-sort by ascending label and return the worst decile first,
    # making the cumulative gain appear to decrease.
    by_decile = data.groupby('deciles', sort=False)
    lift = by_decile['lift'].last().values
    gain = by_decile['gain'].last().values

    return lift, gain

# Per-decile lift and gain for each classifier on the test set
nb_lift, nb_gain = calculate_lift_gain(y_test, nb_probs, nb_deciles)
lr_lift, lr_gain = calculate_lift_gain(y_test, lr_probs, lr_deciles)

# Draw lift (left y-axis) and gain (right y-axis) on one shared x-axis
fig, ax1 = plt.subplots()

# x positions 1..10, one per decile
deciles = np.arange(1, 11)

# Left axis: lift curves
ax1.set_xlabel('Deciles')
ax1.set_ylabel('Lift', color='tab:blue')
for lift_values, lift_label, lift_colour in (
    (nb_lift, 'Naive Bayes Lift', 'tab:blue'),
    (lr_lift, 'Logistic Regression Lift', 'tab:cyan'),
):
    ax1.plot(deciles, lift_values, label=lift_label, color=lift_colour, marker='o')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right axis (twin of the left one): gain curves
ax2 = ax1.twinx()
ax2.set_ylabel('Gain', color='tab:red')
for gain_values, gain_label, gain_colour in (
    (nb_gain, 'Naive Bayes Gain', 'tab:red'),
    (lr_gain, 'Logistic Regression Gain', 'tab:orange'),
):
    ax2.plot(deciles, gain_values, label=gain_label, color=gain_colour, marker='x')
ax2.tick_params(axis='y', labelcolor='tab:red')

# Finalise layout, add a single legend covering both axes, then show
fig.tight_layout()
fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
plt.title('Lift and Gain Charts')
plt.show()

### Lift and Gain Charts
- **Lift:**
    - Both models show similar lift values across deciles, with the highest lift observed in the top (highest-scored) deciles.
- **Gain:**
    - Both models show similar gain values. Because gain is cumulative, it is highest (reaching 1.0) once the bottom (lowest-scored) deciles are included.

### Conclusion
Both the Naive Bayes and Logistic Regression models perform equally well in classifying the two penguin species based on the accuracy, AUC, and Lift/Gain charts. 

## Question 4

### Accuracy
Both the Naive Bayes and Logistic Regression models achieved an accuracy of 1.0 on the test dataset. 

### AUC (Area Under the Curve)
Both models also achieved perfect AUC scores of 1.0 on both the training and test datasets.

### Lift and Gain Charts
- **Lift:**
    - Both models show similar lift values across deciles, with the highest lift observed in the top (highest-scored) deciles.
- **Gain:**
    - Both models show similar gain values. Because gain is cumulative, it is highest (reaching 1.0) once the bottom (lowest-scored) deciles are included.

### Conclusion
Based on the accuracy, AUC, and Lift/Gain charts, both the Naive Bayes and Logistic Regression models perform equally well in classifying the two penguin species. Both models achieve perfect scores in all metrics, indicating that they are highly effective and we can't choose a better model based on these metrics alone.

However, in practical applications, other factors such as model interpretability, training time, and computational resources might influence the choice of model. I prefer Logistic Regression for its interpretability, while Naive Bayes can be better in scenarios with smaller datasets or when we can assume feature independence.

## Question 5

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Load the MNIST dataset
mnist = fetch_openml('mnist_784', version=1)
X_mnist, y_mnist = mnist.data, mnist.target.astype(int)

# Split the dataset into training and testing sets BEFORE scaling.
# Fitting the scaler on the full dataset would let test-set statistics leak
# into preprocessing; the split itself is unchanged (same seed, same size).
# Note: X_mnist is left unscaled; only the train/test matrices are standardized.
X_train_mnist, X_test_mnist, y_train_mnist, y_test_mnist = train_test_split(
    X_mnist, y_mnist, test_size=0.2, random_state=42
)

# Standardize the features: learn mean/std from the training rows only,
# then apply the same transformation to the test rows
scaler = StandardScaler()
X_train_mnist = scaler.fit_transform(X_train_mnist)
X_test_mnist = scaler.transform(X_test_mnist)

# Initialize and train Naive Bayes model
nb_model_mnist = GaussianNB()
nb_model_mnist.fit(X_train_mnist, y_train_mnist)

# Predict and evaluate Naive Bayes model
y_pred_nb_mnist = nb_model_mnist.predict(X_test_mnist)
nb_accuracy_mnist = accuracy_score(y_test_mnist, y_pred_nb_mnist)
nb_report_mnist = classification_report(y_test_mnist, y_pred_nb_mnist)

# Initialize and train Logistic Regression model
lr_model_mnist = LogisticRegression(max_iter=1000)
lr_model_mnist.fit(X_train_mnist, y_train_mnist)

# Predict and evaluate Logistic Regression model
y_pred_lr_mnist = lr_model_mnist.predict(X_test_mnist)
lr_accuracy_mnist = accuracy_score(y_test_mnist, y_pred_lr_mnist)
lr_report_mnist = classification_report(y_test_mnist, y_pred_lr_mnist)

# Print the results
print("Naive Bayes Accuracy on MNIST:", nb_accuracy_mnist)
print("Naive Bayes Classification Report on MNIST:\n", nb_report_mnist)
print("Logistic Regression Accuracy on MNIST:", lr_accuracy_mnist)
print("Logistic Regression Classification Report on MNIST:\n", lr_report_mnist)

### Performance on MNIST Dataset

#### Naive Bayes (Generative Model)
- **Accuracy:** 0.53

#### Logistic Regression (Discriminative Model)
- **Accuracy:** 0.92

#### MNIST Dataset
- **Naive Bayes:** The generative model performs poorly on the MNIST dataset with an accuracy of 0.53. This is likely due to the high dimensionality of the image data and the strong correlation between neighbouring pixels, which violates the model's assumption that features are conditionally independent given the class.
- **Logistic Regression:** The discriminative model performs significantly better with an accuracy of 0.92. Although it is a linear model, Logistic Regression does not assume feature independence: it learns the weights of all pixels jointly, which makes it better suited to high-dimensional, correlated inputs.

#### Penguin Dataset
- **Naive Bayes and Logistic Regression:** Both models achieve perfect accuracy (1.0) on the penguin dataset. This dataset is simpler with fewer features and clear distinctions between the two species, making it easier for both models to perform well.

### Conclusion
- **Generative Models (Naive Bayes):** Perform well on simpler datasets with fewer features and clear distinctions but struggle with high-dimensional and complex data like images. 
- **Discriminative Models (Logistic Regression):** Performed well on both the simple and the complex dataset in these experiments, which suggests they are the more versatile choice across various types of data.
