In [29]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import random

In [75]:
# Simulate one small dataset in which `difficulty` lowers `speed`, and both
# feed into `accident`.
# A seeded local Generator replaces the global np.random state so the cell
# yields the same frame on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

num = 100  # number of simulated rows
# difficulty ~ Uniform(0, 1)
difficulty = rng.uniform(0, 1, (num,))
# speed ~ Normal(15, 5) minus 10 * difficulty, floored at 0
speed = np.maximum(rng.normal(15, 5, (num,)) - difficulty * 10, 0)
# accident = 0.03 * speed + 0.4 * difficulty + Normal(0, 0.3) noise, clipped to [0, 1]
accident = np.clip(0.03*speed + 0.4*difficulty + rng.normal(0, 0.3, (num,)), 0, 1)
df = pd.DataFrame({
    'difficulty': difficulty,
    'speed': speed,
    'accident': accident
})
df

Unnamed: 0,difficulty,speed,accident
0,0.673138,8.551017,0.720598
1,0.655696,0.000000,0.185299
2,0.770951,4.647342,0.633775
3,0.857322,8.070710,0.637710
4,0.687312,8.823271,0.402822
...,...,...,...
95,0.733279,5.998872,0.159924
96,0.595660,4.504236,0.517500
97,0.125024,12.905163,0.358685
98,0.677023,11.058720,0.764464


In [76]:
# Column shortcuts taken from df: X (difficulty) and y (speed) feed the
# single OLS fit further down; z (accident) is kept alongside them.
X, y, z = df['difficulty'], df['speed'], df['accident']

In [None]:
def regression(data, X, y, num):
    """Return the mean OLS slope of ``y`` on ``X`` over ``num`` resamples.

    Each iteration asks ``bootstrap_sample`` (defined outside this cell;
    presumably draws rows with replacement -- confirm) for a resample of
    ``data``, fits ``y ~ const + X`` with statsmodels OLS, and records the
    coefficient at position 1 of the fitted parameters.

    Parameters
    ----------
    data : object accepted by ``bootstrap_sample``; the resample it returns
        is indexed with ``X`` and ``y`` (presumably a DataFrame -- confirm).
    X : label of the predictor column.
    y : label of the response column.
    num : number of resample-and-fit iterations.

    Returns
    -------
    ``np.mean`` of the recorded slopes.
    """
    def _one_slope():
        resampled = bootstrap_sample(data)
        design = sm.add_constant(resampled[X])
        fitted = sm.OLS(resampled[y], design).fit()
        # add_constant prepends the intercept column by default, so
        # position 1 holds the predictor's coefficient.
        return fitted.params.iloc[1]

    return np.mean([_one_slope() for _ in range(num)])

# Mean slope of speed on difficulty over 1000 resample-and-fit iterations;
# the bare name on the last line displays the value.
regression_result = regression(data=df, X='difficulty', y='speed', num=1000)
regression_result


np.float64(-9.8687484616101)

In [16]:
# One OLS fit of y on X with an intercept; the displayed value is the
# coefficient at position 1 (the X slope, after the prepended constant).
# NOTE(review): this cell's recorded execution count (16) is lower than that
# of the cells that build df and X/y (75, 76), so the saved output presumably
# reflects an earlier data draw -- re-run in order before comparing it with
# the bootstrap mean.
model = sm.OLS(endog=y, exog=sm.add_constant(X)).fit()
model.params.iloc[1]

np.float64(-6.334457634814191)

In [79]:
from sklearn.linear_model import LinearRegression

# Monte Carlo check of the speed-on-difficulty slope: regenerate a large
# dataset num_runs times, fit speed ~ difficulty, and summarise the slopes.
# A seeded local Generator replaces the global np.random state so the printed
# summary is reproducible on Restart & Run All.
rng = np.random.default_rng(1)

coefs = []
num_runs = 1000
num_samples = 10000  # many samples per dataset

for _ in range(num_runs):
    # NOTE: these rebind the notebook-level `difficulty`, `speed` and `model`.
    difficulty = rng.uniform(0, 1, (num_samples,))
    # The floor at 0 makes speed non-linear in difficulty, so the fitted
    # slope need not equal the -10 used to generate the data.
    speed = np.maximum(rng.normal(15, 5, (num_samples,)) - difficulty * 10, 0)
    # LinearRegression expects a 2-D feature matrix, hence the reshape.
    model = LinearRegression().fit(difficulty.reshape(-1, 1), speed)
    coefs.append(model.coef_[0])

print(f"Average coefficient: {np.mean(coefs):.2f}")
print(f"Std of coefficient: {np.std(coefs):.2f}")

Average coefficient: -9.66
Std of coefficient: 0.17


In [88]:
# Same simulation as the speed ~ difficulty run, but `accident` -- generated
# from both speed and difficulty -- is added as a second regressor.
# A seeded local Generator replaces the global np.random state so the printed
# average is reproducible on Restart & Run All.
rng = np.random.default_rng(2)

coefs = []
num_runs = 1000
num_samples = 10000

for _ in range(num_runs):
    difficulty = rng.uniform(0, 1, (num_samples,))
    speed = np.maximum(rng.normal(15, 5, (num_samples,)) - difficulty * 10, 0)
    # accident = 0.03 * speed + 0.4 * difficulty + Normal(0, 0.3) noise, clipped to [0, 1]
    accident = np.clip(
        0.03 * speed + 0.4 * difficulty + rng.normal(0, 0.3, (num_samples,)),
        0,
        1,
    )
    XZ = np.column_stack([difficulty, accident])
    model = LinearRegression().fit(XZ, speed)
    # difficulty is column 0 of XZ, so coef_[0] is its coefficient.
    # NOTE(review): accident depends on speed, so controlling for it can shift
    # the difficulty coefficient -- presumably the point of this comparison.
    coefs.append(model.coef_[0])

print(f"Average coefficient with accident: {np.mean(coefs):.2f}")

Average coefficient with accident: -10.33
