# Discussion 1: Creating Widgets for Multiple Linear Regression

## Load Packages

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets
from ipywidgets import interact, FloatSlider
from IPython.display import display, clear_output

## Data Loading

In [2]:
# Build the workbook path with os.path.join so the separator is right on any OS
fp = os.path.join("data", "Hurricane Irene and the Hudson River.xlsx")

# Read one sheet per measurement and discard each sheet's Piermont column.
# NOTE(review): the D.O. sheet is picked by zero-based position (5) while the
# other two are picked by name -- confirm its name and use that if it is stable.
do_data = pd.read_excel(fp, sheet_name = 5).drop(columns = ["Piermont D.O. (ppm)"])
rainfall_data = pd.read_excel(fp, sheet_name = "Rainfall").drop(columns = ["Piermont  Rainfall Daily Accumulation (Inches)"])
turbidity_data = pd.read_excel(fp, sheet_name = "Turbidity").drop(columns = ["Piermont Turbidity in NTU"])

## Data Cleaning

In [3]:
# Join the rainfall, turbidity, and D.O. tables on their shared timestamp column
data = (
    rainfall_data
    .merge(turbidity_data, on = "Date Time (ET)")
    .merge(do_data, on = "Date Time (ET)")
)

# Preview the first rows of the combined table
data.head()

Unnamed: 0,Date Time (ET),Port of Albany Rainfall Daily Accumulation (Inches),Norrie Point Rainfall Daily Accumulation (Inches),Port of Albany Turbidity in NTU,Norrie Point Turbidity in NTU,Port of Albany D.O. (ppm),Norrie Point D.O. (ppm)
0,2011-08-25 00:00:00,0.0,0.0,4.0,9.3,7.68,7.81
1,2011-08-25 00:15:00,0.0,0.0,3.9,8.4,7.6,7.73
2,2011-08-25 00:30:00,0.0,0.0,4.3,7.9,7.57,7.63
3,2011-08-25 00:45:00,0.0,0.0,4.7,8.1,7.72,7.67
4,2011-08-25 01:00:00,0.0,0.0,4.4,8.4,7.74,7.63


In [4]:
# Swap the verbose sheet headers for short snake_case names
# (listed in the same order as the columns of the merged table)
data.columns = [
    "date",
    "albany_rainfall",
    "norrie_rainfall",
    "albany_turbidity",
    "norrie_turbidity",
    "albany_do",
    "norrie_do",
]

# Preview the renamed table
data.head()

Unnamed: 0,date,albany_rainfall,norrie_rainfall,albany_turbidity,norrie_turbidity,albany_do,norrie_do
0,2011-08-25 00:00:00,0.0,0.0,4.0,9.3,7.68,7.81
1,2011-08-25 00:15:00,0.0,0.0,3.9,8.4,7.6,7.73
2,2011-08-25 00:30:00,0.0,0.0,4.3,7.9,7.57,7.63
3,2011-08-25 00:45:00,0.0,0.0,4.7,8.1,7.72,7.67
4,2011-08-25 01:00:00,0.0,0.0,4.4,8.4,7.74,7.63


In [5]:
# Parse the timestamps and make them the index.
# The guard keeps this cell safe to re-run: once "date" has become the index it
# is no longer a column, so an unconditional data["date"] would raise a KeyError.
if "date" in data.columns:
    data = data.assign(date = pd.to_datetime(data["date"])).set_index("date")

# Verify
data.head()

Unnamed: 0_level_0,albany_rainfall,norrie_rainfall,albany_turbidity,norrie_turbidity,albany_do,norrie_do
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-08-25 00:00:00,0.0,0.0,4.0,9.3,7.68,7.81
2011-08-25 00:15:00,0.0,0.0,3.9,8.4,7.6,7.73
2011-08-25 00:30:00,0.0,0.0,4.3,7.9,7.57,7.63
2011-08-25 00:45:00,0.0,0.0,4.7,8.1,7.72,7.67
2011-08-25 01:00:00,0.0,0.0,4.4,8.4,7.74,7.63


## Multiple Linear Regression

In [6]:
# Predictors: Albany rainfall and dissolved oxygen; response: Albany turbidity
X = data[["albany_rainfall", "albany_do"]]
y = data[["albany_turbidity"]]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42
)

# Fit the linear model on the training rows only
model = LinearRegression()
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r_squared = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")

RMSE: 221.9143474905527
R-squared: 0.490738951845751


## Create a Widget to Visualize Different Models

In [7]:
# Multi-select list for choosing one or more predictor columns
# (starts with the first column selected)
predictor_selector = widgets.SelectMultiple(
    options = data.columns,
    value = [data.columns[0]],
    description = "Predictors"
)

# Dropdown for choosing the target column.
# The trait is `value` (singular). Starting on the second column keeps the
# default target different from the default predictor, so the first click on
# the button fits a model instead of tripping the target-in-predictors check.
target_selector = widgets.Dropdown(
    options = data.columns,
    value = data.columns[1],
    description = "Target"
)

# Button that triggers the model evaluation
evaluate_button = widgets.Button(
    description = "Evaluate Model"
)

# Output area that receives whatever evaluate_model prints
output = widgets.Output()

# Define the function to handle button clicks
def evaluate_model(b):
    """Fit a linear regression on the selected columns and print its test scores.

    Reads the current selections from `predictor_selector` and
    `target_selector`, splits `data` 70/30 with a fixed seed, fits
    `LinearRegression` on the training part, and prints R^2 and RMSE for the
    test part into the `output` widget. Prints a message and returns early when
    no predictor is selected or when the target is among the predictors.

    Parameters
    ----------
    b
        Argument supplied by `Button.on_click` when the handler fires; not used.
    """
    with output:
        clear_output(wait = True) # Replace the previous result instead of appending

        selected_predictors = list(predictor_selector.value)
        target = target_selector.value

        # A model needs at least one feature column; fitting on none would raise
        if not selected_predictors:
            print("Select at least one predictor.")
            return

        # Make sure the target is not in the predictors
        if target in selected_predictors:
            print("Target variable must not be in the predictors.")
            return

        # Prepare the data
        X = data[selected_predictors]
        y = data[target]

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

        # Create and fit the model
        model = LinearRegression().fit(X_train, y_train)

        # Predict and calculate R^2 and RMSE on the held-out rows
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # The second figure is the square root of the MSE, so label it RMSE
        print(f"R^2: {r2:.4f}")
        print(f"RMSE: {rmse:.4f}")

# Connect the button to its click handler, then render the four widgets in order
evaluate_button.on_click(evaluate_model)
display(predictor_selector, target_selector, evaluate_button, output)

SelectMultiple(description='Predictors', index=(0,), options=('albany_rainfall', 'norrie_rainfall', 'albany_tu…

Dropdown(description='Target', options=('albany_rainfall', 'norrie_rainfall', 'albany_turbidity', 'norrie_turb…

Button(description='Evaluate Model', style=ButtonStyle())

Output()