# Load and Prepare the Data

## Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Pull the charity dataset straight from the hosted course CSV and preview
# the first rows.
import pandas as pd

CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)
application_df.head()

In [None]:
# TODO: decide which columns (if any) to drop before encoding; nothing is dropped yet.


In [None]:
# Determine the number of unique values in each column of the loaded frame.
# `application_df` is the only DataFrame that exists at this point in the
# notebook; a bare `.nunique()` with no receiver is a syntax error.
application_df.nunique()

In [None]:
# Look at how often each category occurs so that rare values can be
# identified and replaced with "Other".
# NOTE(review): the column is assumed to be APPLICATION_TYPE, matching the
# (commented-out) binning code in the following cell — confirm.
application_df['APPLICATION_TYPE'].value_counts()

In [None]:
# # Choose a cutoff value and create a list of application types to be replaced
# application_types_to_replace = ['T9', 'T13', 'T12', 'T2', 'T25', 'T14', 'T29', 'T15', 'T17']

# # Replace in dataframe
# for app in application_types_to_replace:
#     application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# # Check to make sure replacement was successful
# application_df['APPLICATION_TYPE'].value_counts()

In [None]:
# Convert categorical data to numeric with `pd.get_dummies` (numeric columns
# are left unchanged; only categorical columns are expanded).
# NOTE(review): `df` is first assigned in the train/test split cell further
# down, so this cell raises NameError when the notebook is run top to bottom —
# confirm which frame is meant to be encoded here.
# NOTE(review): the split cell builds X from `df` directly, not from
# `categorical_dummies` — confirm the encoded frame is actually fed to the models.
categorical_dummies = pd.get_dummies(df)

In [None]:
## Shared by every model below: build the features and the target once.
# NOTE(review): the CSV path is an empty-string placeholder — point it at the
# preprocessed dataset before running.
TARGET_COLUMN = "weeks_in_top_10"

df = pd.read_csv("")
y = df[TARGET_COLUMN]
X = df.drop(columns=TARGET_COLUMN)

# Hold out a test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Standardise the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Number of columns in the one-hot encoded frame. The encoding cell names its
# result `categorical_dummies`; `classification_dummies` is never defined.
categorical_dummies.shape[1]

# Linear Regression Model

We started with a **Linear Regression model** as a simple and interpretable baseline. This helped us:

- Quickly test if a linear relationship exists between the features and the target
- Understand which features have the strongest impact on the outcome
- Provide an easy-to-explain model using coefficients
- Compare performance against more complex models like Random Forest and Keras

Even if it's not the best performer, it gives us a valuable starting point for analysis and model comparison.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fit the linear baseline on the standardised training features.
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predict the held-out split and score it.
y_pred = lr.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Report all three metrics, one per line, to two decimal places.
print(
    f"Linear Regression Mean Absolute Error (mae): {mae:.2f}",
    f"Linear Regression Root Mean Squared Error: {rmse:.2f}",
    f"Linear Regression R2: {r2:.2f}",
    sep="\n",
)


# Random Forest Model

We used a **Random Forest Regressor** to improve predictive performance over the linear model. This method:

- Handles complex, nonlinear relationships without requiring feature engineering
- Is robust to outliers and overfitting due to its use of multiple decision trees
- Automatically detects feature importance and interactions
- Requires minimal preprocessing and works well with both numeric and encoded categorical data

It’s a strong, reliable model for tabular data and a valuable benchmark before moving to deep learning.

In [None]:
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

## The random forest is trained and scored on the unscaled splits.

# Build the forest and train it in one step (fit returns the estimator).
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the held-out split and score it.
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

# Report all three metrics, one per line, to two decimal places.
print(
    f"Random Forest Mean Absolute Error: {mae_rf:.2f}",
    f"Random Forest Root Mean Squared Error: {rmse_rf:.2f}",
    f"Random Forest R² Score: {r2_rf:.2f}",
    sep="\n",
)


# Keras Neural Network Model

We used a **Keras neural network** to explore whether deep learning can capture more complex patterns in the data. This model:

- Learns nonlinear relationships and deep feature interactions
- Can potentially outperform traditional models with enough data and tuning
- Requires feature scaling and hyperparameter tuning for best results
- Is highly customizable in terms of architecture and optimization

This model helps us test if complexity leads to better predictive performance, especially after comparing it with simpler methods.


In [None]:
# Create the Keras Model

# Number of input features (columns), so the model knows what input to expect.
# X_train comes from splitting a DataFrame, where `X_train[0]` is a lookup of
# a column labelled 0 rather than the first row; `.shape[1]` gives the column
# count for both DataFrames and NumPy arrays.
number_input_features = X_train.shape[1]

hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 40
hidden_nodes_layer3 = 40

# Start building the Sequential model
nn = tf.keras.models.Sequential()

# Declare the input shape with an explicit Input object instead of passing
# `input_dim` to the first Dense layer (deprecated in recent Keras releases).
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer – no sigmoid, because we are predicting a continuous number
# (weeks), not a binary value (0 or 1)
nn.add(tf.keras.layers.Dense(units=1))

# Check the structure of the model
nn.summary()

# Compile the model
# loss='mse' directs the model to minimize mean squared error during training
# metrics=['mae'] tracks the average prediction error (in weeks) for easier interpretation
nn.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model on the full scaled training split for 50 epochs (no
# validation data is held out in this call).
# NOTE(review): the next cell calls nn.fit() on this same `nn` object again
# with validation_split=0.2. Because this call trains on all of
# X_train_scaled, the rows that cell holds out for validation will already
# have been trained on here, and the second run presumably continues from
# these weights (100 epochs in total). Confirm whether this first run is
# intended or should be dropped.
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)


In [None]:
# Fit the network and keep the returned history object.
#   validation_split=0.2 -> 20% of the training rows are reserved for
#                           validation, to monitor overfitting on unseen data
#   batch_size=32        -> weights are updated on mini-batches of 32 rows
fit_model_history = nn.fit(
    X_train_scaled,
    y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)


In [None]:
# Score the network on the held-out test split. The model was compiled with
# MSE as the loss and MAE as its only metric, so evaluate() yields the pair
# (loss, mae); MAE reads as "weeks off, on average".
# Note: this rebinds `mae`, which the linear regression cell also assigns.
test_scores = nn.evaluate(X_test_scaled, y_test)
loss, mae = test_scores
print(f"Test MAE: {mae:.2f}, Loss (MSE): {loss:.2f}")


# Keras Tuner

We used **Keras Tuner** to optimize our neural network’s architecture and hyperparameters. While our base Keras model gives us a starting point, Keras Tuner helps us:

- Automatically search for the best combination of layers, units, and activations (other settings, such as the learning rate, can be added to the search space as well)
- Improve model performance without manually guessing parameters
- Reduce overfitting or underfitting by finding better model configurations
- Explore multiple architectures efficiently

This step helps us fine-tune our model and potentially outperform traditional machine learning methods by leveraging the flexibility of deep learning.


In [None]:
# Number of input features (columns), so the model knows what input to expect.
# X_train comes from splitting a DataFrame, where `X_train[0]` is a lookup of
# a column labelled 0 rather than the first row; `.shape[1]` gives the column
# count for both DataFrames and NumPy arrays.
number_input_features = X_train.shape[1]

# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    """Build and compile a regression network from tuner-chosen hyperparameters.

    Args:
        hp: Keras Tuner hyperparameter container. It is asked for the hidden
            activation ('activation'), the width of the first hidden layer
            ('first_units'), the number of additional hidden layers
            ('num_layers') and the width of each of those ('units_<i>').

    Returns:
        A compiled Sequential model with one linear output unit, MSE loss,
        the Adam optimizer and MAE as a tracked metric.

    Note:
        Reads the module-level ``number_input_features`` for the input width.
    """
    nn = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation', ['relu', 'selu'])

    # Declare the input shape with an explicit Input object instead of passing
    # `input_dim` to the first Dense layer (deprecated in recent Keras releases).
    nn.add(tf.keras.Input(shape=(number_input_features,)))

    # Allow kerastuner to decide number of neurons in first layer
    nn.add(tf.keras.layers.Dense(
        units=hp.Int('first_units', min_value=8, max_value=128, step=8),
        activation=activation))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 4)):
        nn.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=8, max_value=64, step=8),
            activation=activation))

    # Output layer for regression (no activation)
    nn.add(tf.keras.layers.Dense(units=1))

    # Compile the model
    nn.compile(loss="mse", optimizer="adam", metrics=["mae"])

    return nn

In [None]:
!pip install keras-tuner

In [None]:
import keras_tuner as kt

# Hyperband search over the space defined by create_model.
# The objective is validation Mean Absolute Error; its direction is stated
# explicitly so the tuner does not have to infer whether to minimise or
# maximise from the metric name.
tuner = kt.Hyperband(
    create_model,
    objective=kt.Objective("val_mae", direction="min"),
    max_epochs=20,
    hyperband_iterations=2)


In [None]:
# Launch the hyperparameter search. 20% of the training rows are held out so
# that each trial can report the validation metric the tuner optimises.
search_options = {"epochs": 20, "validation_split": 0.2, "batch_size": 32}
tuner.search(X_train_scaled, y_train, **search_options)


In [None]:
# Ask the tuner for its single top-ranked hyperparameter set and display the
# chosen values.
top_hyperparameter_sets = tuner.get_best_hyperparameters(1)
best_hyper = top_hyperparameter_sets[0]
best_hyper.values

In [None]:
# Retrieve the top-ranked model from the search.
top_models = tuner.get_best_models(1)
best_model = top_models[0]

# Score it on the held-out test split; evaluate() yields (loss, mae) here.
# Note: this rebinds `loss` and `mae` from the earlier cells.
tuned_scores = best_model.evaluate(X_test_scaled, y_test)
loss, mae = tuned_scores
print(f"Best Tuned Model - Test Mean Absolute Error: {mae:.2f}")

# Persist the tuned model to disk.
best_model.save('keras_tuner_model.keras')