In [1]:
# SECTION 1: Setup and Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.api import OLS, add_constant  # Import OLS for detailed regression analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor
import ipywidgets as widgets
from IPython.display import display
import joblib  # Import for saving the model
from io import BytesIO
import os

In [6]:
# SECTION 1: Data Upload
def upload_data_ui():
    """UI for uploading data and displaying the preview."""
    upload_widget = widgets.FileUpload(accept='.csv', multiple=False)
    output = widgets.Output()

    def on_upload_change(change):
        global user_df
        if upload_widget.value:
            try:
                # Extract the uploaded file
                uploaded_file = list(upload_widget.value.values())[0]
                content = uploaded_file['content']

                # Read the CSV file into a DataFrame
                user_df = pd.read_csv(BytesIO(content))

                # Display a preview of the dataset
                with output:
                    output.clear_output()
                    print("Dataset Preview:")
                    display(user_df.head())
            except Exception as e:
                with output:
                    output.clear_output()
                    print(f"Error loading file: {e}")

    upload_widget.observe(on_upload_change, names='value')
    display(upload_widget, output)

# Placeholder for user-uploaded data
user_df = pd.DataFrame()

In [3]:
# Placeholder for uploaded data
df = None  # Replace with the upload_data function later.

In [4]:
# SECTION 4: Model Selection and Training
def train_simple_model(data, target_col, predictor_col, metric):
    """Trains a Simple Linear Regression model."""
    X = data[[predictor_col]]
    y = data[target_col]
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Metric Calculation
    if metric == 'MSE':
        mse = mean_squared_error(y_test, y_pred)
        print(f"Mean Squared Error: {mse}")
    elif metric == 'RMSE':
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        print(f"Root Mean Squared Error: {rmse}")
    elif metric == 'R²':
        r_squared = r2_score(y_test, y_pred)
        print(f"R² (Coefficient of Determination): {r_squared}")
    else:
        print("Unsupported metric selected.")
    
    # Visualization
    plt.scatter(y_test, y_pred)
    plt.xlabel("True Values")
    plt.ylabel("Predictions")
    plt.title("Simple Linear Regression Predictions vs True Values")
    plt.show()

    return model

def train_multi_model(data, target_col, predictor_cols, metric):
    """Trains a Multiple Linear Regression model using statsmodels OLS after checking multicollinearity."""
    X = data[predictor_cols]
    y = data[target_col]

    # Multicollinearity Check
    vif_result = check_multicollinearity(X)
    if vif_result.empty:
        print("Skipping model training due to issues with predictors.")
        return None

    # Add constant for OLS
    X = add_constant(X)

    # Train the model using OLS
    model = OLS(y, X).fit()
    print(model.summary())

    # Predictions
    y_pred = model.predict(X)

    # Metric Calculation (Train Set Metrics since OLS does not split data)
    if metric == 'MSE':
        mse = mean_squared_error(y, y_pred)
        print(f"Mean Squared Error: {mse}")
    elif metric == 'RMSE':
        rmse = mean_squared_error(y, y_pred, squared=False)
        print(f"Root Mean Squared Error: {rmse}")
    elif metric == 'R²':
        r_squared = r2_score(y, y_pred)
        print(f"R² (Coefficient of Determination): {r_squared}")
    else:
        print("Unsupported metric selected.")

    # Visualization
    plt.scatter(y, y_pred)
    plt.xlabel("True Values")
    plt.ylabel("Predictions")
    plt.title("Multiple Linear Regression Predictions vs True Values")
    plt.show()

    return model

In [5]:
# SECTION 5: Interactive Widgets
def interactive_ui():
    """Creates an interactive UI for testing the app logic."""

    load_user_data()  # Allow user to upload dataset

    missing_selector = widgets.Dropdown(
        options=['No Missing Values', 'Remove Rows', 'Impute Mean', 'Impute Median'],
        value='No Missing Values',
        description='Missing Values:'
    )

    duplicates_selector = widgets.ToggleButtons(
        options=['No', 'Yes'],
        description='Remove Duplicates:',
        disabled=False
    )

    target_selector = widgets.Dropdown(
        options=[],  # Populated after user uploads data
        description='Target:'
    )

    predictor_selector = widgets.SelectMultiple(
        options=[],  # Populated after user uploads data
        description='Predictors:'
    )

    single_predictor_selector = widgets.Dropdown(
        options=[],  # Populated after user uploads data
        description='Single Predictor:'
    )

    scaling_selector = widgets.Dropdown(
        options=['Don’t Scale', 'Normalize', 'Standardize'],
        value='Don’t Scale',
        description='Scaling:'
    )

    metric_selector = widgets.Dropdown(
        options=['MSE', 'RMSE', 'R²'],
        value='MSE',
        description='Metric:'
    )

    model_selector = widgets.Dropdown(
        options=['Simple Linear Regression', 'Multiple Linear Regression'],
        value='Simple Linear Regression',
        description='Model Type:'
    )

    run_button = widgets.Button(description="Run Model")
    clear_button = widgets.Button(description="Clear Model")

    display(
        missing_selector,
        duplicates_selector,
        target_selector,
        predictor_selector,
        single_predictor_selector,
        scaling_selector,
        metric_selector,
        model_selector,
        run_button,
        clear_button
    )

    def update_dropdowns():
        """Updates dropdowns dynamically after dataset upload."""
        if not user_df.empty:
            target_selector.options = user_df.select_dtypes(include=[np.number]).columns.tolist()
            single_predictor_selector.options = user_df.columns.tolist()
            predictor_selector.options = user_df.columns.tolist()
        else:
            print("No data loaded. Please upload a valid dataset.")

    def on_run_button_click(b):
        if user_df.empty:
            print("Please upload a dataset to proceed.")
            return

        update_dropdowns()  # Ensure dropdowns are updated

        if target_selector.value:
            data = user_df.copy()

            # Handle missing values
            data = handle_missing_values(data, missing_selector.value)

            # Handle duplicates
            data = handle_duplicates(data, duplicates_selector.value == 'Yes')

            if model_selector.value == 'Simple Linear Regression':
                # Simple Linear Regression
                if not single_predictor_selector.value:
                    print("Please select a single predictor for Simple Linear Regression.")
                    return

                predictor = single_predictor_selector.value
                data[predictor] = scale_features(data[[predictor]], scaling_selector.value)
                model = train_simple_model(
                    data,
                    target_col=target_selector.value,
                    predictor_col=predictor,
                    metric=metric_selector.value
                )

            elif model_selector.value == 'Multiple Linear Regression':
                # Multiple Linear Regression
                predictors = list(predictor_selector.value)
                if len(predictors) == 0:
                    print("Please select at least one predictor for Multiple Linear Regression.")
                    return

                data[predictors] = scale_features(data[predictors], scaling_selector.value)

                # Multicollinearity Check
                vif_result = check_multicollinearity(data[predictors])
                if vif_result.empty:
                    print("Skipping model training due to issues with predictors.")
                    return

                model = train_multi_model(
                    data,
                    target_col=target_selector.value,
                    predictor_cols=predictors,
                    metric=metric_selector.value
                )

    def on_clear_button_click(b):
        print("Model cleared. You can now train a new model.")

    run_button.on_click(on_run_button_click)
    clear_button.on_click(on_clear_button_click)

    # Update dropdowns when a dataset is loaded
    update_dropdowns()

# Run interactive UI
interactive_ui()

FileUpload(value=(), accept='.csv', description='Upload')

Output()

Dropdown(description='Missing Values:', options=('No Missing Values', 'Remove Rows', 'Impute Mean', 'Impute Me…

ToggleButtons(description='Remove Duplicates:', options=('No', 'Yes'), value='No')

Dropdown(description='Target:', options=(), value=None)

SelectMultiple(description='Predictors:', options=(), value=())

Dropdown(description='Single Predictor:', options=(), value=None)

Dropdown(description='Scaling:', options=('Don’t Scale', 'Normalize', 'Standardize'), value='Don’t Scale')

Dropdown(description='Metric:', options=('MSE', 'RMSE', 'R²'), value='MSE')

Dropdown(description='Model Type:', options=('Simple Linear Regression', 'Multiple Linear Regression'), value=…

Button(description='Run Model', style=ButtonStyle())

Button(description='Clear Model', style=ButtonStyle())

No data loaded. Please upload a valid dataset.
Error loading file: 'tuple' object has no attribute 'values'
