In [3]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotnine import ggplot, aes, geom_point, geom_smooth, labs
import statsmodels.formula.api as smf

In [4]:
np.random.seed(42)

In [5]:
def generate_data(y, n, xmin, xmax, sigma):
    x = pd.Series(np.random.uniform(xmin, xmax, n))
    epsilon = pd.Series(np.random.normal() for i in range(len(x)))
    y = pd.Series(eval(y) + epsilon * sigma)
    df = pd.DataFrame({"y": y, "x": x})
    return df

In [6]:
# is the relationship between your variables linear?
# Lets start with a good plot
# look at a plot, bc summary stats can be deceiving

df = generate_data(y="x+8", n=100, xmin=0, xmax=50, sigma=10)


def display_data(df):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(df["x"], df["y"])
    m1 = smf.ols("y ~x", data=df).fit()
    b0, b1 = m1.params
    ax.plot(df["x"], (b0 + b1 * df["x"]), color="green")
    ax.set_title("OLS Line")
    # FEATURE: toggle to show resid

    ax.vlines(df["x"], m1.predict(), df["y"], color="gray", linewidth=0.5)
    for x in ["top", "right"]:
        ax.spines[x].set_visible(False)
    plt.grid(axis="y", linewidth=".3")

    plt.show()


display_data(df)



In [7]:
def display_resid(df):
    m1 = smf.ols("y ~x", data=df).fit()
    resid = m1.resid
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(df["x"], resid, s=10)
    ax.plot(df["x"], [0] * len(df), color="green")
    ax.set_title("Residuals")
    for x in ["top", "right"]:
        ax.spines[x].set_visible(False)
    plt.grid(axis="y", linewidth=".3")
    ax.vlines(df["x"], 0, resid, color="gray", linewidth=0.5)
    # notice how residuals are evenly distributed around O
    # This is a good thing

    plt.show()


display_resid(df)



In [8]:
df = generate_data(y="x**2", n=100, xmin=-20, xmax=50, sigma=700)
display_data(df)
display_resid(df)



#### What problem does this residual chart show us?
**Look at the residuals around x = -15 - all our residuals are so much higher than we predicted.**

**then from around x = 0 to x=30 our values are all below what we predicted**

**in summary, there is a clear pattern in our residuals: something that our model is failing to explain**

In [30]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotnine import ggplot, aes, geom_point, geom_smooth, labs, geom_line
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from ss_decomp import ss_decomp
import plotly.graph_objs as go
from resid_plot import display_resid
from resid_hist import display_resid_histogram


def generate_multicollinear_data(n=10):

    np.random.seed(42)
    x1 = np.random.uniform(1, 10, n)
    x2 = 2 * x1 + np.random.normal(0, 0.1, n)
    y = 3 * x1 + 4 * x2 + np.random.normal(0, 1, n)

    return pd.DataFrame({"x1": x1, "x2": x2, "y": y})

df = generate_multicollinear_data()

def insert_non_multicolinnear_point(df_multicollinear, x):
    return df_multicollinear.append({"x1": 4, "x2": 10, "y": 45}, ignore_index=True)


df_multicollinear = insert_non_multicolinnear_point(df, 5)
mlr_model = smf.ols("y ~ x1 + x2", data=df_multicollinear).fit()

# Multicollinear Data
x1_range_1 = np.linspace(df_multicollinear["x1"].min(), df_multicollinear["x1"].max(), 20)
x2_range_1 = np.linspace(df_multicollinear["x2"].min(), df_multicollinear["x2"].max(), 20)
x1_grid_1, x2_grid_1 = np.meshgrid(x1_range_1, x2_range_1)


# Get regression plane parameters
b0_1, b1_1, b2_1 = mlr_model.params
y_grid_1 = b0_1 + b1_1 * x1_grid_1 + b2_1 * x2_grid_1

fig1 = go.Figure()

fig1.add_trace(
    go.Scatter3d(
        x=df_multicollinear["x1"],
        y=df_multicollinear["x2"],
        z=df_multicollinear["y"],
        mode="markers",
        marker=dict(size=5, opacity=0.8),
    )
)

fig1.add_trace(go.Surface(x=x1_grid_1, y=x2_grid_1, z=y_grid_1, opacity=0.5))


fig1.update_layout(
    title="Multicollinear Data",
    scene=dict(xaxis_title="x1", yaxis_title="x2", zaxis_title="y"),
    margin=dict(l=0, r=0, b=0, t=50),
)


fig1


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [29]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotnine import ggplot, aes, geom_point, geom_smooth, labs, geom_line
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from ss_decomp import ss_decomp
import plotly.graph_objs as go
from resid_plot import display_resid
from resid_hist import display_resid_histogram


def generate_multicollinear_data(n=10):

    np.random.seed(42)
    x1 = np.random.uniform(1, 10, n)
    x2 = 2 * x1 + np.random.normal(0, 0.1, n)
    y = 3 * x1 + 4 * x2 + np.random.normal(0, 1, n)

    return pd.DataFrame({"x1": x1, "x2": x2, "y": y})

df = generate_multicollinear_data()

def insert_non_multicolinnear_point(df_multicollinear, x):
    return df_multicollinear.append({"x1": 4, "x2": 10, "y": 55}, ignore_index=True)


df_multicollinear = insert_non_multicolinnear_point(df, 0)
mlr_model = smf.ols("y ~ x1 + x2", data=df_multicollinear).fit()

# Multicollinear Data
x1_range_1 = np.linspace(df_multicollinear["x1"].min(), df_multicollinear["x1"].max(), 20)
x2_range_1 = np.linspace(df_multicollinear["x2"].min(), df_multicollinear["x2"].max(), 20)
x1_grid_1, x2_grid_1 = np.meshgrid(x1_range_1, x2_range_1)


# Get regression plane parameters
b0_1, b1_1, b2_1 = mlr_model.params
y_grid_1 = b0_1 + b1_1 * x1_grid_1 + b2_1 * x2_grid_1

fig1 = go.Figure()

fig1.add_trace(
    go.Scatter3d(
        x=df_multicollinear["x1"],
        y=df_multicollinear["x2"],
        z=df_multicollinear["y"],
        mode="markers",
        marker=dict(size=5, opacity=0.8),
    )
)

fig1.add_trace(go.Surface(x=x1_grid_1, y=x2_grid_1, z=y_grid_1, opacity=0.5))


fig1.update_layout(
    title="Multicollinear Data",
    scene=dict(xaxis_title="x1", yaxis_title="x2", zaxis_title="y"),
    margin=dict(l=0, r=0, b=0, t=50),
)


fig1


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



In [31]:
np.random.uniform(1, 10, 4)


array([1.58546434, 9.53996984, 9.6906883 , 8.27557613])