In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LogisticRegression

# Advertising

In [5]:
df = pd.read_csv("Advertising.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [10]:
X = df.drop(columns=["Unnamed: 0",'sales'])
y = df["sales"]
X = sm.add_constant(X)

In [11]:
import statsmodels.api as sm

In [13]:
ols = sm.OLS(y, X)
results = ols.fit()
results.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Thu, 20 Nov 2025",Prob (F-statistic):,1.58e-96
Time:,22:58:15,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [14]:
# Errores est√°ndar
results.bse

Unnamed: 0,0
const,0.311908
TV,0.001395
radio,0.008611
newspaper,0.005871


In [19]:
from sklearn.utils import resample

In [20]:
B = 1000
coef_boot = np.zeros((B, X.shape[1]))

for b in range(B):
    X_b, y_b = resample(X, y, replace=True)

    # regresi√≥n OLS
    model_b = sm.OLS(y_b, X_b).fit()
    coef_boot[b, :] = model_b.params

# Media bootstrap de los coeficientes
coef_mean = coef_boot.mean(axis=0)

# Desviaci√≥n est√°ndar bootstrap
coef_std = coef_boot.std(axis=0)

print("Media de coeficientes:\n", coef_mean)
print("\nDesviaci√≥n est√°ndar:\n", coef_std)

Media de coeficientes:
 [ 2.96536123e+00  4.55791533e-02  1.88932616e-01 -1.06405323e-03]

Desviaci√≥n est√°ndar:
 [0.33267372 0.00183476 0.01056148 0.00618461]


## Comparaciones

Las diferencias entre la regresi√≥n cl√°sica y el bootstrap se deben a que miden la incertidumbre de manera distinta.
La regresi√≥n tradicional usa f√≥rmulas que dependen de supuestos fuertes (normalidad, varianza constante y modelo bien especificado).
El bootstrap re-muestrea los datos y observa c√≥mo cambian los coeficientes en la pr√°ctica.
Por eso, sus errores est√°ndar suelen ser un poco mayores, pues capturan mejor la variabilidad real de los datos.

In [38]:
df_regresion = pd.DataFrame({
    "Coeficiente OLS": [2.9389, 0.0458, 0.1885, -0.0010],
    "SE OLS": [0.311908, 0.001395, 0.008611, 0.005871],
    "Coef. Bootstrap": [2.96536123, 0.0455791533, 0.188932616, -0.001064053],
    "SE Bootstrap": [0.33267372, 0.00183476, 0.01056148, 0.00618461]
}, index=["const", "TV", "radio", "newspaper"])

df_regresion


Unnamed: 0,Coeficiente OLS,SE OLS,Coef. Bootstrap,SE Bootstrap
const,2.9389,0.311908,2.965361,0.332674
TV,0.0458,0.001395,0.045579,0.001835
radio,0.1885,0.008611,0.188933,0.010561
newspaper,-0.001,0.005871,-0.001064,0.006185


## Ridge

In [33]:
from sklearn.linear_model import Ridge, RidgeCV

In [31]:
df = pd.read_csv("Advertising.csv")
X = df.drop(columns=["Unnamed: 0",'sales'])
y = df["sales"]

In [34]:
alphas = np.logspace(-4, 4, 200)

ridge_cv = RidgeCV(alphas=alphas, store_cv_values=True)
ridge_cv.fit(X, y)

best_alpha = ridge_cv.alpha_
print("Mejor lambda:", best_alpha)

# Ajustar el modelo Ridge con el mejor hiperpar√°metro Œª
ridge_final = Ridge(alpha=best_alpha)
ridge_final.fit(X, y)

print("\nCoeficientes del modelo con regularizaci√≥n L2:")
for name, val in zip(X.columns, ridge_final.coef_):
    print(f"{name}: {val:.6f}")

# Bootstrap

B = 1000
coef_boot = np.zeros((B, X.shape[1]))

for b in range(B):
    X_res, y_res = resample(X, y)
    model = Ridge(alpha=best_alpha)
    model.fit(X_res, y_res)
    coef_boot[b, :] = model.coef_

# Desviaci√≥n est√°ndar de los coeficientes

std_boot = coef_boot.std(axis=0)

print("\nDesviaci√≥n est√°ndar bootstrap de los coeficientes:")
for name, sd in zip(X.columns, std_boot):
    print(f"{name}: {sd:.6f}")



Mejor lambda: 155.2225357427048

Coeficientes del modelo con regularizaci√≥n L2:
TV: 0.045764
radio: 0.187770
newspaper: -0.000852

Desviaci√≥n est√°ndar bootstrap de los coeficientes:
TV: 0.001882
radio: 0.010388
newspaper: 0.006396


# Default

In [26]:
df = pd.read_csv("Default.csv")
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.13895
3,No,No,529.250605,35704.49394
4,No,No,785.655883,38463.49588


In [30]:
from scipy.stats import norm

y = df["default"].map({"No": 0, "Yes": 1})
X = df[["balance", "income", "student"]].copy()

X["student"] = X["student"].map({"No": 0, "Yes": 1}).astype(float)

# Asegurar que balance e income tambi√©n sean float
X["balance"] = X["balance"].astype(float)
X["income"] = X["income"].astype(float)

# 2. Estima los coeficientes.
Lr = LogisticRegression()
Lr.fit(X, y)

b0 = Lr.intercept_[0]
b_rest = Lr.coef_[0]  # array con [balance, income, student]
beta = np.array([b0] + list(b_rest))

# 3. Calcula el error est√°ndar de tus estimaciones.
    # Usa tu modelo para encontrar  ùëùÃÇ (ùëã)
linear_pred = b0 + b_rest[0]*X["balance"].values + b_rest[1]*X["income"].values + b_rest[2]*X["student"].values
p_hat = 1 / (1 + np.exp(-linear_pred))

    # Calcula el error  ùëù(1‚àíùëù)
incertidumbre = p_hat*(1 - p_hat)
V = np.diagflat(incertidumbre)

    # Calcula la matriz de covarianza
X_matrix = np.column_stack((np.ones(len(X)), X.values))
cov = np.linalg.inv(X_matrix.T @ V @ X_matrix)

    # Extrae el error est√°ndar
se = np.sqrt(np.diag(cov))

coef_names = ["Intercepto", "Balance", "Income", "Student"]

print("\nCoeficientes y errores est√°ndar:")
for name, b, s in zip(coef_names, beta, se):
    print(f"{name:10s}  Coef: {b: .6f}   SE: {s: .6f}")


Coeficientes y errores est√°ndar:
Intercepto  Coef: -10.901808   SE:  0.493158
Balance     Coef:  0.005731   SE:  0.000232
Income      Coef:  0.000004   SE:  0.000008
Student     Coef: -0.612573   SE:  0.236394


## Bootstrap

In [23]:
df = pd.read_csv("Default.csv")
df.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.13895
3,No,No,529.250605,35704.49394
4,No,No,785.655883,38463.49588


In [25]:
B = 1000
coef_boot = np.zeros((B, X.shape[1] + 1))  # +1 para el intercepto

for b in range(B):
    # Remuestreo bootstrap
    X_b, y_b = resample(X, y, replace=True)

    # Regresi√≥n log√≠stica
    Lr_b = LogisticRegression(max_iter=1000)
    Lr_b.fit(X_b, y_b)

    # Guardar coeficientes
    b0_b = Lr_b.intercept_[0]
    b_rest_b = Lr_b.coef_[0]
    coef_boot[b, :] = np.concatenate(([b0_b], b_rest_b))

# Media de coeficientes
coef_mean = coef_boot.mean(axis=0)

# Desviaci√≥n est√°ndar
coef_std = coef_boot.std(axis=0)

coef_mean, coef_std


(array([-1.09059787e+01,  5.73816068e-03,  3.65309889e-06, -6.15057280e-01]),
 array([4.81080335e-01, 2.32408925e-04, 8.16615951e-06, 2.33457069e-01]))

## Comparaciones
Los resultados del modelo log√≠stico y del bootstrap son muy similares, pero no id√©nticos porque cada m√©todo mide la incertidumbre de forma diferente. El modelo log√≠stico calcula sus errores est√°ndar usando una f√≥rmula matem√°tica basada en supuestos ideales del modelo, mientras que el bootstrap vuelve a muestrear los datos muchas veces y observa c√≥mo var√≠an realmente los coeficientes. Esta diferencia hace que el bootstrap capture mejor la variabilidad real presente en los datos, por lo que sus errores est√°ndar pueden cambiar ligeramente.

In [36]:
df_logit = pd.DataFrame({
    "Coef. Logit": [-10.901808, 0.005731, 0.000004, -0.612573],
    "SE Logit": [0.493158, 0.000232, 0.000008, 0.236394],
    "Coef. Bootstrap": [-10.9059787, 0.00573816068, 0.0000036531, -0.61505728],
    "SE Bootstrap": [0.481080335, 0.0002324089, 0.0000081662, 0.233457069]
}, index=["Intercepto", "Balance", "Income", "Student"])

df_logit


Unnamed: 0,Coef. Logit,SE Logit,Coef. Bootstrap,SE Bootstrap
Intercepto,-10.901808,0.493158,-10.905979,0.48108
Balance,0.005731,0.000232,0.005738,0.000232
Income,4e-06,8e-06,4e-06,8e-06
Student,-0.612573,0.236394,-0.615057,0.233457
