<a href="https://colab.research.google.com/github/pearl-yu/mist5400fall2025/blob/main/2_Neural_Networks/Basic_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Basic Neural Networks

MIST 5400 Fall 2025
By: Pearl Yu

Credit to Aditya Deshpande and Chris Volinsky for their help.

Let's see if neural nets can improve on our models for the DirectMarketing data set...

In [None]:
#Loading Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelBinarizer

# Installing data

[You can download the data here](https://drive.google.com/uc?export=download&id=1deEx-Ey37F7qznPlIqmaAjjkmkvBtV28).  Each record represents an individual who was targeted with a direct marketing offer.  The offer was a solicitation to make a charitable donation. You'll remember this data set from last chapter!


After downloading, open the Files panel on the left and drag the downloaded CSV file into the current working directory.


In [None]:
# Read the direct-marketing records from the current working directory
df = pd.read_csv("DirectMarketing.csv")
# Keep only the rows whose Firstdate is non-zero (boolean mask through .loc)
has_first_date = df["Firstdate"] != 0
df = df.loc[has_first_date]


Below are some data cleaning procedures. Maybe try asking Gemini to explain these?

In [None]:
# Build the modelling frame on an independent copy of `df`. A plain
# `df_clean = df` only creates a second name for the same object, so every
# assignment below would also rewrite the raw frame: re-running the cell would
# log-transform twice, and pandas can emit SettingWithCopyWarning because `df`
# is itself a filtered selection.
df_clean = df.copy()

# Replace gavr and glast with log(x + 1) versions of the same features
df_clean['gavr'] = np.log(df['gavr'] + 1)
df_clean['glast'] = np.log(df['glast'] + 1)

# Declare the category levels explicitly so get_dummies builds its indicator
# columns from these fixed lists rather than from whichever values happen to
# be present in the data.
income_cat = pd.Categorical(df['Income'], categories=[0, 1, 2, 3, 4, 5, 6, 7])
df_clean['Income'] = income_cat

rfaf2_cat = pd.Categorical(df['rfaf2'], categories=[1, 2, 3, 4])
df_clean['rfaf2'] = rfaf2_cat

# One-hot encode the categorical features, dropping the first level of each
df_clean = pd.get_dummies(
    df_clean,
    columns=['rfaa2', 'pepstrfl', 'Income', 'rfaf2'],
    drop_first=True,
)

# Create a new feature 'tenure'
df_clean['tenure'] = df_clean['Lastdate'] - df_clean['Firstdate']

# Sanity check: tenure is expected to be non-negative. The value is printed
# because a bare expression in the middle of a cell is silently discarded.
print("Minimum tenure:", df_clean['tenure'].min())

# 'recency' is measured back from the most recent Lastdate in the data
today = df_clean['Lastdate'].max()
df_clean['recency'] = today - df_clean['Lastdate']

# remove Firstdate and Lastdate
df_clean = df_clean.drop(['Firstdate', 'Lastdate'], axis=1)


In [None]:
# Preview the first five rows of the cleaned frame
df_clean.head(n=5)


# Let's review Logistic Regression a little

In [None]:
#Loading Libraries
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Target is the 'class' column; the features are every remaining column
y = df_clean['class']
X = df_clean.drop(columns=['class'])

In [None]:
# Split the data into training and testing sets (20% held out for testing)
# original random_state = 42 gives results (81, 78, 85)
random_state_value = 99
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state_value,
)

In [None]:
# Fit a logistic regression (liblinear solver) on the training split
lrmodel = LogisticRegression(solver="liblinear")
lrmodel.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Define a range of thresholds to test
thresholds = np.arange(0, 1.01, 0.05)

# Predicted probability of the positive class (column 1) on both splits.
# The test-set probabilities were previously never computed, so the loop
# failed with a NameError on `y_prob_lr`.
y_prob_lr_train = lrmodel.predict_proba(X_train)[:, 1]
y_prob_lr = lrmodel.predict_proba(X_test)[:, 1]

train_accuracies = []
test_accuracies = []

for threshold in thresholds:
    # Calculate accuracy for training set
    y_pred_train_threshold = (y_prob_lr_train >= threshold).astype(int)
    train_accuracies.append(accuracy_score(y_train, y_pred_train_threshold))

    # Calculate accuracy for testing set
    y_pred_test_threshold = (y_prob_lr >= threshold).astype(int)
    test_accuracies.append(accuracy_score(y_test, y_pred_test_threshold))

# Find the best threshold and highest accuracy for logistic regression on the
# testing set.
# NOTE(review): picking the threshold on the test set makes the reported
# accuracy optimistic; a separate validation split would be cleaner.
best_threshold_lr = thresholds[np.argmax(test_accuracies)]
highest_accuracy_lr = np.max(test_accuracies)

print(f"Logistic Regression - Best Threshold: {best_threshold_lr:.2f}")
print(f"Logistic Regression - Highest Testing Accuracy: {highest_accuracy_lr:.4f}")

In [None]:
# Draw training and testing accuracy against the decision threshold
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(thresholds, train_accuracies, label='Training Accuracy')
ax.plot(thresholds, test_accuracies, label='Testing Accuracy')
ax.set_xlabel('Threshold')
ax.set_ylabel('Accuracy')
ax.set_title('Logistic Regression Accuracy vs. Threshold')
ax.legend()
ax.grid(True)
plt.show()

## Neural Networks (using Keras)

In [None]:
#Loading Libraries

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Defining SIMPLE Keras Model
# Size the input layer from the scaled training matrix instead of hard-coding
# the feature count, so the network still matches the data if preprocessing
# produces a different number of columns.
n_features = X_train_scaled.shape[1]

kmodel = Sequential()
kmodel.add(Dense(12, input_shape=(n_features,), activation="relu"))
kmodel.add(Dense(8, activation="relu"))
# Single sigmoid unit: one output in (0, 1) per row
kmodel.add(Dense(1, activation="sigmoid"))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
kmodel.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [None]:
# Train the simple network on the scaled training data: 50 epochs, batches of 256
kmodel.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=256,
    epochs=50,
)

In [None]:
# Print the layer-by-layer summary of the simple model
kmodel.summary()

In [None]:
from sklearn.metrics import accuracy_score

# Model outputs for each split, flattened to 1-D arrays of probabilities
y_prob_keras_test = kmodel.predict(X_test_scaled).ravel()
y_prob_keras_train = kmodel.predict(X_train_scaled).ravel()

# Candidate decision thresholds: 0.00, 0.05, ..., 1.00
thresholds = np.arange(0, 1.01, 0.05)

# Accuracy on the training split at every threshold
train_accuracies_keras = [
    accuracy_score(y_train, (y_prob_keras_train >= cutoff).astype(int))
    for cutoff in thresholds
]

# Accuracy on the testing split at every threshold
test_accuracies_keras = [
    accuracy_score(y_test, (y_prob_keras_test >= cutoff).astype(int))
    for cutoff in thresholds
]

# Threshold with the highest testing accuracy, and that accuracy
best_position = np.argmax(test_accuracies_keras)
best_threshold_keras = thresholds[best_position]
highest_accuracy_keras = np.max(test_accuracies_keras)

print(f"Keras Model - Best Threshold: {best_threshold_keras:.2f}")
print(f"Keras Model - Highest Testing Accuracy: {highest_accuracy_keras:.4f}")

In [None]:
# Plot the accuracies
# The baseline (share of the majority class in y) is computed here because
# this cell runs before the notebook's later baseline cell; without it a
# fresh Restart & Run All stops with a NameError on `baseline_accuracy`.
baseline_accuracy = y.value_counts(normalize=True).max()

plt.figure(figsize=(10, 6))
plt.plot(thresholds, train_accuracies_keras, label='Keras Training Accuracy')
plt.plot(thresholds, test_accuracies_keras, label='Keras Testing Accuracy')
plt.axhline(y=baseline_accuracy, color='r', linestyle='--', label='Baseline Accuracy')  # Add baseline
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.title('Keras Model Accuracy vs. Threshold')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
## Now we make it more complex, with an extra layer, and Dropout

# Size the input layer from the scaled training matrix instead of hard-coding
# the feature count, so the network still matches the data if preprocessing
# produces a different number of columns.
n_features = X_train_scaled.shape[1]

kmodel2 = Sequential()
kmodel2.add(Dense(12, input_shape=(n_features,), activation="relu"))
kmodel2.add(Dropout(0.3))  # Dropout (rate 0.3) after the first hidden layer
kmodel2.add(Dense(8, activation="relu"))
kmodel2.add(Dropout(0.3))  # Dropout (rate 0.3) after the second hidden layer
kmodel2.add(Dense(6, activation="relu"))  # the extra hidden layer
kmodel2.add(Dense(1, activation="sigmoid"))


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
kmodel2.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [None]:
# Train the deeper network on the scaled training data: 20 epochs, batches of 256
kmodel2.fit(
    x=X_train_scaled,
    y=y_train,
    batch_size=256,
    epochs=20,
)

In [None]:
from sklearn.metrics import accuracy_score

# Outputs of the deeper model for each split, flattened to 1-D arrays.
# These names (and the accuracy lists below) are reused from the simple
# model's cell, so from here on they hold the second model's values.
y_prob_keras_test = kmodel2.predict(X_test_scaled).ravel()
y_prob_keras_train = kmodel2.predict(X_train_scaled).ravel()

# Candidate decision thresholds: 0.00, 0.05, ..., 1.00
thresholds = np.arange(0, 1.01, 0.05)

# Accuracy on the training split at every threshold
train_accuracies_keras = [
    accuracy_score(y_train, (y_prob_keras_train >= cutoff).astype(int))
    for cutoff in thresholds
]

# Accuracy on the testing split at every threshold
test_accuracies_keras = [
    accuracy_score(y_test, (y_prob_keras_test >= cutoff).astype(int))
    for cutoff in thresholds
]

# Threshold with the highest testing accuracy, and that accuracy
best_position = np.argmax(test_accuracies_keras)
best_threshold_keras2 = thresholds[best_position]
highest_accuracy_keras2 = np.max(test_accuracies_keras)

print(f"Keras Model - Best Threshold: {best_threshold_keras2:.2f}")
print(f"Keras Model - Highest Testing Accuracy: {highest_accuracy_keras2:.4f}")

In [None]:
# Plot the accuracies
# The baseline (share of the majority class in y) is computed here because
# this cell runs before the notebook's later baseline cell; without it a
# fresh Restart & Run All stops with a NameError on `baseline_accuracy`.
baseline_accuracy = y.value_counts(normalize=True).max()

plt.figure(figsize=(10, 6))
plt.plot(thresholds, train_accuracies_keras, label='Keras Training Accuracy')
plt.plot(thresholds, test_accuracies_keras, label='Keras Testing Accuracy')
plt.axhline(y=baseline_accuracy, color='r', linestyle='--', label='Baseline Accuracy')  # Add baseline
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.title('Keras Model Accuracy vs. Threshold')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Summarise the deeper dropout network built in this section; the simple
# `kmodel` was already summarised right after it was trained.
kmodel2.summary()

# Comparing results

In [None]:
# One (heading, best test accuracy, best threshold) row per fitted model
separator = "-" * 30
model_results = [
    ("Logistic Regression:", highest_accuracy_lr, best_threshold_lr),
    ("Simple Keras Model:", highest_accuracy_keras, best_threshold_keras),
    ("More Complex Keras Model (with Dropout):", highest_accuracy_keras2, best_threshold_keras2),
]

print("Model Performance Comparison:")
print(separator)
for heading, top_accuracy, top_threshold in model_results:
    print(heading)
    print(f"  Highest Testing Accuracy: {top_accuracy:.4f}")
    print(f"  Best Threshold: {top_threshold:.2f}")
    print(separator)

In [None]:
# What is the baseline classification accuracy?

# Relative frequency of each class in the full target y; the largest share is
# the accuracy obtained by always predicting the majority class
class_shares = y.value_counts(normalize=True)
baseline_accuracy = class_shares.max()

print(f"Baseline Classification Accuracy (predicting the majority class): {baseline_accuracy:.4f}")

Ask Gemini: The baseline is so high, so my evaluation metric should be something else. What should it be?