## Generalizing Ability Of The Model, Holdout Cross-Validation, And K-fold Cross-Validation

In [3]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### There is a dataframe initial_data.csv with a information about taxi rides

In [4]:
df = pd.read_csv('initial_data.csv', index_col='id')

In [5]:
df = df[['vendor_id', 'passenger_count', 'pickup_longitude',
         'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
         'trip_duration']]

In [6]:
df

Unnamed: 0_level_0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id2875421,2,1,-73.982155,40.767937,-73.964630,40.765602,455.0
id2377394,1,1,-73.980415,40.738564,-73.999481,40.731152,663.0
id3858529,2,1,-73.979027,40.763939,-74.005333,40.710087,2124.0
id3504673,2,1,-74.010040,40.719971,-74.012268,40.706718,429.0
id2181028,2,1,-73.973053,40.793209,-73.972923,40.782520,435.0
...,...,...,...,...,...,...,...
id2376096,2,4,-73.982201,40.745522,-73.994911,40.740170,778.0
id1049543,1,1,-74.000946,40.747379,-73.970184,40.796547,655.0
id2304944,2,1,-73.959129,40.768799,-74.004433,40.707371,764.0
id2714485,1,1,-73.982079,40.749062,-73.974632,40.757107,373.0


### First, let's calculate the error on Cross-Validation and Test for our most basic model before extracting any features, but simply by taking all real columns.

In [7]:
df = df.assign(log_trip_duration=np.log1p(df['trip_duration']))
df = df.drop('trip_duration', axis=1)

In [8]:
## Create samples
X = df.drop("log_trip_duration", axis=1)

## Create target
Y = df["log_trip_duration"]

## Let's use Holdout Method with 20 percent of a test size 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=42)

In [9]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((1166915, 6), (291729, 6), (1166915,), (291729,))

In [10]:
kf = KFold(n_splits=20, shuffle=True, random_state=33)

In [13]:
## Let's verify the result using a cross-validation implementation loop.
losses_val = []
losses_train = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.values[train_index], X_train.values[val_index]
    Y_train_fold, Y_val_fold = Y_train.values[train_index], Y_train.values[val_index]

    model = LinearRegression()
    model.fit(X_train_fold, Y_train_fold)

    losses_val.append(np.mean((model.predict(X_val_fold) - Y_val_fold) ** 2))
    losses_train.append(np.mean((model.predict(X_train_fold) - Y_train_fold) ** 2))

msle = round(np.mean(losses_val), 3)

print(f'Mean Squared Logarithmic Error = {msle}')

Mean Squared Logarithmic Error = 0.613


In [14]:
# Let's verify the response using the classical cross_validate method

model = LinearRegression()

scoring = {'mse': 'neg_mean_squared_error'}

cv_results = cross_validate(model, X_train, Y_train, cv=20, scoring=scoring)

losses_val = -cv_results['test_mse']
mean_loss_val = np.mean(losses_val)
mean_loss_val = round(mean_loss_val, 3)
print(mean_loss_val)

0.614


In [17]:
## Let's train a model using the entire training data and make predictions on the test set

model = LinearRegression()
model.fit(X_train, Y_train)

Y_hat = model.predict(X_test)
losses_all = []

for i in range(len(Y_test)):
    loss = (Y_hat[i] - Y_test[i]) ** 2
    losses_all.append(loss)
    
print(f'Mean Squared Error = {round(np.mean(losses_all), 3)}')

Mean Squared Error = 0.606


In [18]:
# Let's evaluate the accuracy of the answer by calculating the mean squared error

mse = mean_squared_error(Y_test, Y_hat)

print(f'Mean Squared Error = {round(mse, 3)}')

Mean Squared Error = 0.606


### Next, we will utilize the processed_data.csv dataframe, which contains preprocessed samples for machine learning, and compare it with our initial_data.

In [19]:
processed_data = pd.read_csv('processed_data.csv', index_col='id')

In [20]:
processed_data.head()

Unnamed: 0_level_0,vendor_id,passenger_count,store_and_fwd_flag,trip_duration,distance_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id2875421,1,930.399753,0,455.0,1.500479
id2377394,0,930.399753,0,663.0,1.807119
id3858529,1,930.399753,0,2124.0,6.39208
id3504673,1,930.399753,0,429.0,1.487155
id2181028,1,930.399753,0,435.0,1.189925


In [21]:
processed_data = processed_data.assign(log_trip_duration=np.log1p(processed_data['trip_duration']))
processed_data = processed_data.drop('trip_duration', axis=1)

In [22]:
X_2 = processed_data.drop('log_trip_duration', axis=1)
Y_2 = processed_data['log_trip_duration']

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X_2, Y_2,
                                                   test_size=0.2,
                                                   random_state=42)

In [25]:
kf = KFold(n_splits=20, shuffle=True, random_state=33)

losses_val = []
losses_train = []

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.values[train_index], X_train.values[val_index]
    Y_train_fold, Y_val_fold = Y_train.values[train_index], Y_train.values[val_index]

    model = LinearRegression()
    model.fit(X_train_fold, Y_train_fold)

    losses_val.append(np.mean((model.predict(X_val_fold) - Y_val_fold) ** 2))
    losses_train.append(np.mean((model.predict(X_train_fold) - Y_train_fold) ** 2))
    
msle = round(np.mean(losses_val), 3)

In [26]:
print(f'Mean Squared Logarithmic Error = {msle}')

Mean Squared Logarithmic Error = 0.431


In [27]:
# Checking the answer with cross_validate method

model = LinearRegression()

scoring = {'mse': 'neg_mean_squared_error'}


# cross_validate_result
cross_validate_result = cross_validate(model, X_train, Y_train, cv=20, scoring=scoring)

losses_val = -cross_validate_result['test_mse']
mean_loss_val = np.mean(losses_val)
mean_loss_val = round(mean_loss_val, 3)
print(mean_loss_val)

0.432


In [28]:
## Let's train a model using the entire training data and make predictions on the test set

model = LinearRegression()

model.fit(X_train, Y_train)

Y_hat = model.predict(X_test)
losses_all = []

for i in range(len(Y_test)):
    loss = (Y_hat[i] - Y_test[i]) ** 2
    losses_all.append(loss)
print(round(np.mean(losses_all), 3))

0.407


----