In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression

In [3]:
# Sample input data
dataset = pd.read_csv('datos/50_Startups.csv')
X = dataset.iloc[:, 0:3].values
y = dataset.iloc[:, -1].values
X_back = X

In [4]:
# Fit the model using the backward elimination method
num_vars = X_back.shape[1]
for i in range(num_vars, 0, -1):
    # Fit model with i variables
    model = LinearRegression()
    model.fit(X_back[:, :i], y)

    # Check p-values of the model
    p_values = f_regression(X_back, y)[1]
    max_p_value = np.max(p_values)

    # If the maximum p-value is greater than the threshold, remove the variable
    # with the highest p-value
    if max_p_value > 0.05:
        var_to_remove = np.argmax(p_values)
        X_back = np.delete(X_back, var_to_remove, 1)

In [8]:
X_back

array([[165349.2 , 471784.1 ],
       [162597.7 , 443898.53],
       [153441.51, 407934.54],
       [144372.41, 383199.62],
       [142107.34, 366168.42],
       [131876.9 , 362861.36],
       [134615.46, 127716.82],
       [130298.13, 323876.68],
       [120542.52, 311613.29],
       [123334.88, 304981.62],
       [101913.08, 229160.95],
       [100671.96, 249744.55],
       [ 93863.75, 249839.44],
       [ 91992.39, 252664.93],
       [119943.24, 256512.92],
       [114523.61, 261776.23],
       [ 78013.11, 264346.06],
       [ 94657.16, 282574.31],
       [ 91749.16, 294919.57],
       [ 86419.7 ,      0.  ],
       [ 76253.86, 298664.47],
       [ 78389.47, 299737.29],
       [ 73994.56, 303319.26],
       [ 67532.53, 304768.73],
       [ 77044.01, 140574.81],
       [ 64664.71, 137962.62],
       [ 75328.87, 134050.07],
       [ 72107.6 , 353183.81],
       [ 66051.52, 118148.2 ],
       [ 65605.48, 107138.38],
       [ 61994.48,  91131.24],
       [ 61136.38,  88218.23],
       [

In [5]:
# Print the selected variables
print(X_back)

[[165349.2  471784.1 ]
 [162597.7  443898.53]
 [153441.51 407934.54]
 [144372.41 383199.62]
 [142107.34 366168.42]
 [131876.9  362861.36]
 [134615.46 127716.82]
 [130298.13 323876.68]
 [120542.52 311613.29]
 [123334.88 304981.62]
 [101913.08 229160.95]
 [100671.96 249744.55]
 [ 93863.75 249839.44]
 [ 91992.39 252664.93]
 [119943.24 256512.92]
 [114523.61 261776.23]
 [ 78013.11 264346.06]
 [ 94657.16 282574.31]
 [ 91749.16 294919.57]
 [ 86419.7       0.  ]
 [ 76253.86 298664.47]
 [ 78389.47 299737.29]
 [ 73994.56 303319.26]
 [ 67532.53 304768.73]
 [ 77044.01 140574.81]
 [ 64664.71 137962.62]
 [ 75328.87 134050.07]
 [ 72107.6  353183.81]
 [ 66051.52 118148.2 ]
 [ 65605.48 107138.38]
 [ 61994.48  91131.24]
 [ 61136.38  88218.23]
 [ 63408.86  46085.25]
 [ 55493.95 214634.81]
 [ 46426.07 210797.67]
 [ 46014.02 205517.64]
 [ 28663.76 201126.82]
 [ 44069.95 197029.42]
 [ 20229.59 185265.1 ]
 [ 38558.51 174999.3 ]
 [ 28754.33 172795.67]
 [ 27892.92 164470.71]
 [ 23640.93 148001.11]
 [ 15505.73

In [6]:
from sklearn.linear_model import Lasso

# Create a Lasso model with a regularization strength of 0.1
model = Lasso(alpha=0.1)

# Fit the model to the data
model.fit(X, y)

# The Lasso model will have eliminated some of the variables in the original data
# The remaining variables can be accessed by looking at the non-zero coefficients
# of the model
selected_variables = np.where(model.coef_ != 0)[0]

In [7]:
# Use the selected variables to create a new dataset containing only the
# selected variables
X_lasso = X[:, selected_variables]
print(X_lasso)

[[165349.2  136897.8  471784.1 ]
 [162597.7  151377.59 443898.53]
 [153441.51 101145.55 407934.54]
 [144372.41 118671.85 383199.62]
 [142107.34  91391.77 366168.42]
 [131876.9   99814.71 362861.36]
 [134615.46 147198.87 127716.82]
 [130298.13 145530.06 323876.68]
 [120542.52 148718.95 311613.29]
 [123334.88 108679.17 304981.62]
 [101913.08 110594.11 229160.95]
 [100671.96  91790.61 249744.55]
 [ 93863.75 127320.38 249839.44]
 [ 91992.39 135495.07 252664.93]
 [119943.24 156547.42 256512.92]
 [114523.61 122616.84 261776.23]
 [ 78013.11 121597.55 264346.06]
 [ 94657.16 145077.58 282574.31]
 [ 91749.16 114175.79 294919.57]
 [ 86419.7  153514.11      0.  ]
 [ 76253.86 113867.3  298664.47]
 [ 78389.47 153773.43 299737.29]
 [ 73994.56 122782.75 303319.26]
 [ 67532.53 105751.03 304768.73]
 [ 77044.01  99281.34 140574.81]
 [ 64664.71 139553.16 137962.62]
 [ 75328.87 144135.98 134050.07]
 [ 72107.6  127864.55 353183.81]
 [ 66051.52 182645.56 118148.2 ]
 [ 65605.48 153032.06 107138.38]
 [ 61994.4