In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global RNG once so the np.random draws in this notebook are repeatable.
SEED = 42
np.random.seed(SEED)

In [2]:
# Simulate one large sample in which difficulty drives speed, and both drive accident.
num = 100000

# Difficulty is drawn uniformly between 0 and 1.
difficulty = np.random.uniform(low=0, high=1, size=num)

# Speed: Normal(mean 15, sd 5) baseline, lowered by 10 per unit of difficulty, floored at 0.
base_speed = np.random.normal(loc=15, scale=5, size=num)
speed = np.clip(base_speed - difficulty * 10, 0, None)

# Accident: linear in speed and difficulty plus Normal(0, sd 0.3) noise, clamped to [0, 1].
accident_noise = np.random.normal(loc=0, scale=0.3, size=num)
accident = np.clip(0.03 * speed + 0.4 * difficulty + accident_noise, 0, 1)

df = pd.DataFrame(dict(difficulty=difficulty, speed=speed, accident=accident))

print(f"Dataset shape: {df.shape}")

df.head(10)

Dataset shape: (100000, 3)


Unnamed: 0,difficulty,speed,accident
0,0.37454,15.468154,1.0
1,0.950714,18.112292,0.642649
2,0.731994,7.179403,0.405705
3,0.598658,0.0,0.124855
4,0.156019,20.56655,0.75309
5,0.155995,7.352468,0.414556
6,0.058084,16.722262,0.0
7,0.866176,9.027751,0.857879
8,0.601115,9.61962,0.553628
9,0.708073,0.0,0.52398


## Exploratory Data Analysis

In [3]:
# Pairwise correlation matrix of the simulated columns. Left as the cell's last
# expression (like `df.head(10)` above) so the notebook renders it as a table
# instead of a plain-text print.
df.corr()


            difficulty     speed  accident
difficulty    1.000000 -0.499545  0.090820
speed        -0.499545  1.000000  0.323241
accident      0.090820  0.323241  1.000000


In [4]:
from sklearn.linear_model import LinearRegression

# Monte Carlo settings: number of independent replications and rows per replication.
n_experiments = 1000
sample_size = 100000

# One fitted slope (speed regressed on difficulty alone) per replication.
coefficients = np.empty(n_experiments)

for i in range(n_experiments):
    # Fresh draw of difficulty and speed for this replication.
    difficulty = np.random.uniform(low=0, high=1, size=sample_size)
    speed = np.clip(np.random.normal(loc=15, scale=5, size=sample_size) - difficulty * 10, 0, None)

    # Single-feature fit: the design matrix needs shape (sample_size, 1).
    model = LinearRegression().fit(difficulty.reshape(-1, 1), speed)
    coefficients[i] = model.coef_[0]

# Summary statistics of the slope across replications.
avg_coefficient = coefficients.mean()
std_coefficient = coefficients.std()
median_coefficient = np.median(coefficients)

print(f"Average coefficient of X: {avg_coefficient:.4f}")


Average coefficient of X: -9.6656


In [5]:
# Slopes from regressing speed on difficulty AND accident, one entry per replication.
coefficients_x_with_z = np.empty(n_experiments)
coefficients_z = np.empty(n_experiments)

for i in range(n_experiments):
    # Fresh draw of difficulty, speed and accident for this replication.
    difficulty = np.random.uniform(low=0, high=1, size=sample_size)
    speed = np.clip(np.random.normal(loc=15, scale=5, size=sample_size) - difficulty * 10, 0, None)
    accident_noise = np.random.normal(loc=0, scale=0.3, size=sample_size)
    accident = np.clip(0.03 * speed + 0.4 * difficulty + accident_noise, 0, 1)

    X_predictors = np.column_stack((difficulty, accident))
    model = LinearRegression().fit(X_predictors, speed)

    # coef_ follows the column order of X_predictors: difficulty first, accident second.
    coefficients_x_with_z[i], coefficients_z[i] = model.coef_

# Summary statistics of the difficulty slope across replications.
avg_coef_x = coefficients_x_with_z.mean()
std_coef_x = coefficients_x_with_z.std()
median_coef_x = np.median(coefficients_x_with_z)

# Summary statistics of the accident slope across replications.
avg_coef_z = coefficients_z.mean()
std_coef_z = coefficients_z.std()

print(f"Average coefficient of X: {avg_coef_x:.4f}")


Average coefficient of X: -10.3246
