### Let's evaluate Linear Regression on the preprocessed data with 4-fold cross-validation, rather than relying only on a single holdout split

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler


# Load the preprocessed dataset; the 'id' column becomes the row index.
processed_data = pd.read_csv(
    'processed_data.csv',
    index_col='id',
)

In [3]:
# Preview the first five rows of the preprocessed frame.
processed_data.head(n=5)

Unnamed: 0_level_0,vendor_id,passenger_count,store_and_fwd_flag,distance_km,log_trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id2875421,1,930.399753,0,1.500479,6.122493
id2377394,0,930.399753,0,1.807119,6.498282
id3858529,1,930.399753,0,6.39208,7.661527
id3504673,1,930.399753,0,1.487155,6.063785
id2181028,1,930.399753,0,1.189925,6.077642


In [4]:
# Target is the log trip duration; every other column is a feature.
Y = processed_data["log_trip_duration"]
X = processed_data.drop(columns=["log_trip_duration"])

scoring = {'mse': 'neg_mean_squared_error'}
model = LinearRegression()

# 4-fold cross-validation. The scorer reports *negated* MSE, so the sign is
# flipped below to obtain the per-fold errors.
cross_validate_results = cross_validate(
    model,
    X,
    Y,
    cv=4,
    scoring=scoring,
)

# Collapse the per-fold errors into a single average error.
losses_val = np.mean(-cross_validate_results['test_mse'])


print(f"MSLE on Cross-Validation: {round(np.mean(losses_val), 3)}")

MSLE on Cross-Validation: 0.426


## Let's try a dataframe with many more features

In [5]:
# Load the extended dataset; the 'id' column becomes the row index.
new_data = pd.read_csv(
    'new_data.csv',
    index_col='id',
)

In [6]:
# Preview the first five rows of the extended frame.
new_data.head(n=5)

Unnamed: 0_level_0,vendor_id,passenger_count,store_and_fwd_flag,distance_km,log_trip_duration,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id2875421,1,930.399753,0,1.500479,6.122493,1,1,1,1,1,...,0,0,0,0,0,1.500479,2.251437,3.378234,5.068969,7.605881
id2377394,0,930.399753,0,1.807119,6.498282,0,0,0,0,0,...,0,0,0,0,0,1.807119,3.265681,5.901475,10.66467,19.272331
id3858529,1,930.399753,0,6.39208,7.661527,1,1,1,1,1,...,0,0,0,0,0,6.39208,40.85869,261.172025,1669.432545,10671.146803
id3504673,1,930.399753,0,1.487155,6.063785,1,1,1,1,1,...,0,0,0,0,0,1.487155,2.211629,3.289035,4.891303,7.274125
id2181028,1,930.399753,0,1.189925,6.077642,1,1,1,1,1,...,0,0,0,0,0,1.189925,1.415923,1.684842,2.004837,2.385606


In [7]:
# Target is the log trip duration; every other column is a feature.
Y_new = new_data["log_trip_duration"]
X_new = new_data.drop(columns=["log_trip_duration"])

scoring = {'mse': 'neg_mean_squared_error'}
model = LinearRegression()

# Same 4-fold protocol as for the smaller frame, now on the wide feature set.
cross_validate_results = cross_validate(
    model,
    X_new,
    Y_new,
    cv=4,
    scoring=scoring,
)

# The scorer returns negated MSE: flip the sign, then average over the folds.
cross_val_error = -cross_validate_results['test_mse']
cross_val_error_2 = cross_val_error.mean()

print(f"MSLE on Cross-Validation: {round(cross_val_error_2, 3)}")

MSLE on Cross-Validation: 140.921


### Let's check whether our dataframe contains redundant (linearly dependent) features

In [11]:
# Number of feature columns in each design matrix.
num_features_processed = X.shape[1]
num_features_new = X_new.shape[1]

# Matrix rank of each design matrix; a rank below the column count means
# some columns are linear combinations of the others.
rank_processed, rank_new = (
    np.linalg.matrix_rank(features) for features in (X, X_new)
)

In [12]:
# Report feature count versus rank for both design matrices, one per line.
print(
    f"In the first model, there are a total of features: {num_features_processed}, - and the rank is equal to {rank_processed}",
    f"In the second model, there are a total of features: {num_features_new}, - and the rank is equal to {rank_new}",
    sep="\n",
)

In the first model, there are a total of features: 4, - and the rank is equal to 4
In the second model, there are a total of features: 24, - and the rank is equal to 5


Let's find a value of the regularization parameter $\alpha$, for both Ridge and Lasso, such that the mean squared logarithmic error (MSLE) on cross-validation is strictly less than 0.4.

In [14]:
# Manual 4-fold cross-validation of Lasso on min-max scaled features.
# The scaler is fitted inside each fold on the training part only, so the
# validation part never influences the scaling statistics.
kf = KFold(n_splits=4)

# NOTE: this rebinds X / Y (previously built from processed_data) to the
# wide feature set of new_data.
X = new_data.drop('log_trip_duration', axis=1)
Y = new_data['log_trip_duration']

# Convert to arrays once instead of on every fold.
X_values = X.to_numpy()
Y_values = Y.to_numpy()

scores = []

for train_index, test_index in kf.split(X_values):

    X_train, X_test = X_values[train_index], X_values[test_index]
    Y_train, Y_test = Y_values[train_index], Y_values[test_index]

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # No alpha is passed, so Lasso runs with the library's default
    # regularization strength.
    model_lasso = Lasso(max_iter=100000)
    model_lasso.fit(X_train_scaled, Y_train)

    predictions = model_lasso.predict(X_test_scaled)

    # Mean squared error on this fold's validation part.
    scores.append(np.mean((predictions - Y_test)**2))


print(f"MSLE on Cross-Validation: {np.mean(scores)}")

MSLE on Cross-Validationн: 0.6332330617999488


In [17]:
# Hold out 20% of the data; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X_new, Y_new,
                                                   test_size=0.2,
                                                   random_state=42)

# Scaler fitted on the full training split. It is used for the final model
# and the hold-out predictions below, NOT for cross-validation.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


alpha = 0.1
model_ridge = Ridge(alpha=alpha, max_iter=100000)

scoring = {'mse': 'neg_mean_squared_error'}

# Cross-validate scaler + Ridge as a single pipeline on the *unscaled*
# training data: the scaler is then re-fitted on each fold's training part,
# so the validation part of a fold cannot leak into the scaling statistics.
cv_pipeline = make_pipeline(MinMaxScaler(), Ridge(alpha=alpha, max_iter=100000))
cross_validate_results = cross_validate(cv_pipeline, X_train, Y_train, cv=4, scoring=scoring)

# Final model trained on the whole training split, with hold-out predictions.
model_ridge.fit(X_train_scaled, Y_train)
predictions = model_ridge.predict(X_test_scaled)

# The scorer returns negated MSE: flip the sign and average over the folds.
losses_val = np.mean(-cross_validate_results['test_mse'])


print(f"MSLE on Cross-Validation: {round(losses_val, 3)}")

MSLE on Cross-Validationн: 0.4


In [16]:
## Let's determine the optimal value of alpha
alphas = np.linspace(1e-5, 1e-4, 10)
mse_scores = []

# NOTE: X_train_scaled was scaled with statistics from the whole training
# split, so the validation folds below are not fully independent of the scaler.
for alpha in alphas:
    model_lasso = Lasso(alpha=alpha, max_iter=100000)

    cross_validate_results = cross_validate(model_lasso, X_train_scaled, Y_train, cv=4, scoring='neg_mean_squared_error')

    # The scorer returns negated MSE per fold. Store ONE number per alpha
    # (the fold average) so that mse_scores lines up index-for-index with
    # alphas; storing the raw per-fold array would make argmin index into a
    # flattened (n_alphas x n_folds) array and select the wrong alpha.
    losses_val = - cross_validate_results['test_score']
    mse_scores.append(np.mean(losses_val))


best_alpha = alphas[np.argmin(mse_scores)]

# Refit once at the selected alpha so model_lasso / predictions describe the
# chosen model rather than whichever alpha happened to be tried last.
model_lasso = Lasso(alpha=best_alpha, max_iter=100000)
model_lasso.fit(X_train_scaled, Y_train)
predictions = model_lasso.predict(X_test_scaled)

print(f"Best alpha: {best_alpha}")

Best alpha: 1e-05


----