# Regularization and Scaling

## Tasks

### Task 1

Import the data you need to build the models.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the prepared data set from disk; the `id` column becomes the row index.
processed_data = pd.read_csv(
    'processed_data.csv',
    index_col='id',
)

In [3]:
# Preview the first rows to check the columns and the `id` index.
processed_data.head()

Unnamed: 0_level_0,vendor_id,passenger_count,store_and_fwd_flag,distance_km,log_trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id2875421,1,930.399753,0,1.500479,6.122493
id2377394,0,930.399753,0,1.807119,6.498282
id3858529,1,930.399753,0,6.39208,7.661527
id3504673,1,930.399753,0,1.487155,6.063785
id2181028,1,930.399753,0,1.189925,6.077642


In [4]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1458644 entries, id2875421 to id1209952
Data columns (total 5 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   vendor_id           1458644 non-null  int64  
 1   passenger_count     1458644 non-null  float64
 2   store_and_fwd_flag  1458644 non-null  int64  
 3   distance_km         1458644 non-null  float64
 4   log_trip_duration   1458644 non-null  float64
dtypes: float64(3), int64(2)
memory usage: 66.8+ MB


### Task 2

Build a simple linear regression model using a cross-validation technique. Use the whole data set for cross-validation; do not split it into train and test samples.

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [6]:
# Target column and feature matrix (every remaining column).
y = processed_data['log_trip_duration']
X = processed_data.drop(columns='log_trip_duration')

# Four-fold splitter; the same `selector` is reused by the later tasks.
selector = KFold(n_splits=4)

# The scorer returns the *negated* MSE, so the sign is flipped back when reporting.
metric = 'neg_mean_squared_error'
model = LinearRegression()

scores = cross_validate(
    model,
    X,
    y,
    scoring=metric,
    cv=selector,
    return_train_score=True,
)

In [7]:
# Mean train MSE over the folds (the scorer is negated, so flip the sign).
round(-np.mean(scores['train_score']), 3)

0.424

In [8]:
# Mean held-out MSE over the folds (the scorer is negated, so flip the sign).
round(-np.mean(scores['test_score']), 3)

0.426

### Task 3

Load another dataset having more features. 

In [9]:
# Read the extended data set (extra `feature_*` columns); `id` becomes the row index.
new_data = pd.read_csv(
    'new_data.csv',
    index_col='id',
)

In [10]:
# Preview the first rows of the extended data set.
new_data.head()

Unnamed: 0_level_0,vendor_id,passenger_count,store_and_fwd_flag,distance_km,log_trip_duration,feature_1,feature_2,feature_3,feature_4,feature_5,...,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id2875421,1,930.399753,0,1.500479,6.122493,1,1,1,1,1,...,0,0,0,0,0,1.500479,2.251437,3.378234,5.068969,7.605881
id2377394,0,930.399753,0,1.807119,6.498282,0,0,0,0,0,...,0,0,0,0,0,1.807119,3.265681,5.901475,10.66467,19.272331
id3858529,1,930.399753,0,6.39208,7.661527,1,1,1,1,1,...,0,0,0,0,0,6.39208,40.85869,261.172025,1669.432545,10671.146803
id3504673,1,930.399753,0,1.487155,6.063785,1,1,1,1,1,...,0,0,0,0,0,1.487155,2.211629,3.289035,4.891303,7.274125
id2181028,1,930.399753,0,1.189925,6.077642,1,1,1,1,1,...,0,0,0,0,0,1.189925,1.415923,1.684842,2.004837,2.385606


In [11]:
# Column dtypes, non-null counts and memory footprint of the extended frame.
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1458644 entries, id2875421 to id1209952
Data columns (total 25 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   vendor_id           1458644 non-null  int64  
 1   passenger_count     1458644 non-null  float64
 2   store_and_fwd_flag  1458644 non-null  int64  
 3   distance_km         1458644 non-null  float64
 4   log_trip_duration   1458644 non-null  float64
 5   feature_1           1458644 non-null  int64  
 6   feature_2           1458644 non-null  int64  
 7   feature_3           1458644 non-null  int64  
 8   feature_4           1458644 non-null  int64  
 9   feature_5           1458644 non-null  int64  
 10  feature_6           1458644 non-null  float64
 11  feature_7           1458644 non-null  float64
 12  feature_8           1458644 non-null  float64
 13  feature_9           1458644 non-null  float64
 14  feature_10          1458644 non-null  float64
 15  feature_11

### Task 4

Build a simple linear regression model using a cross-validation technique, and compare the results with the previous model, which was built using fewer features.

In [12]:
# Same procedure as for the small data set, now on all columns of `new_data`.
y = new_data['log_trip_duration']
X = new_data.drop(columns='log_trip_duration')

# The scorer returns the *negated* MSE, so the sign is flipped back when reporting.
metric = 'neg_mean_squared_error'
model = LinearRegression()

# Reuses the fold splitter `selector` created for the first model.
scores = cross_validate(
    model,
    X,
    y,
    scoring=metric,
    cv=selector,
    return_train_score=True,
)

In [13]:
# Mean train MSE over the folds (the scorer is negated, so flip the sign).
round(-np.mean(scores['train_score']), 3)

0.339

In [14]:
# Mean held-out MSE over the folds (the scorer is negated, so flip the sign).
round(-np.mean(scores['test_score']), 3)

140.922

### Task 5

Compare matrix ranks and number of features of both datasets. 

In [15]:
# Rank of each feature matrix (target column excluded), computed the same way for both frames.
rank_processed, rank_new = (
    np.linalg.matrix_rank(frame.drop(columns='log_trip_duration').values)
    for frame in (processed_data, new_data)
)

In [16]:
# Number of feature columns = all column labels except the target.
num_features_processed = len(processed_data.columns.drop('log_trip_duration'))
num_features_new = len(new_data.columns.drop('log_trip_duration'))

In [17]:
# Report feature count versus matrix rank for both data sets, one line per model.
print(
    f'The first model has: {num_features_processed} features and matrix rank equal to: {rank_processed}',
    f'The second model has: {num_features_new} features and matrix rank equal to: {rank_new}',
    sep='\n',
)

The first model has: 4 features and matrix rank equal to: 4
The second model has: 24 features and matrix rank equal to: 5


### Task 6

Scale your data and use a Ridge regression model to obtain good model performance on a new dataset that has 24 features.

In [18]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge

In [19]:
# Ridge regression on min-max-scaled features, evaluated on the folds produced by `selector`.
X = new_data.drop('log_trip_duration', axis=1)
Y = new_data['log_trip_duration']

# Convert to NumPy once, outside the loop: the frame mixes int and float columns,
# so every `.values` access has to build a fresh copy of the whole matrix.
X_values = X.values
Y_values = Y.values

scores = []
for train_index, test_index in selector.split(X):

    X_train, X_test = X_values[train_index], X_values[test_index]
    Y_train, Y_test = Y_values[train_index], Y_values[test_index]

    # Fit the scaler on the training fold only, so that nothing from the
    # held-out fold leaks into the preprocessing.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Weak regularisation: alpha=0.01 instead of scikit-learn's default of 1.0.
    model_ridge = Ridge(max_iter=100000, alpha=0.01)
    model_ridge.fit(X_train_scaled, Y_train)

    predictions = model_ridge.predict(X_test_scaled)

    # Mean squared error on the held-out fold; the target column is
    # `log_trip_duration`, hence the "MSLE" label in the report below.
    scores.append(np.mean((predictions - Y_test) ** 2))


print(f'MSLE: {np.mean(scores)}')

MSLE: 0.38256820336794817


### Task 7

Do the same using Lasso regression.

In [20]:
from sklearn.linear_model import Lasso

In [21]:
# Lasso regression on min-max-scaled features, evaluated on the folds produced by `selector`.
X = new_data.drop('log_trip_duration', axis=1)
Y = new_data['log_trip_duration']

# Convert to NumPy once, outside the loop: the frame mixes int and float columns,
# so every `.values` access has to build a fresh copy of the whole matrix.
X_values = X.values
Y_values = Y.values

scores = []
for train_index, test_index in selector.split(X):

    X_train, X_test = X_values[train_index], X_values[test_index]
    Y_train, Y_test = Y_values[train_index], Y_values[test_index]

    # Fit the scaler on the training fold only, so that nothing from the
    # held-out fold leaks into the preprocessing.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Very weak regularisation: alpha=0.00005 instead of scikit-learn's default of 1.0.
    model_lasso = Lasso(max_iter=100000, alpha=0.00005)
    model_lasso.fit(X_train_scaled, Y_train)

    predictions = model_lasso.predict(X_test_scaled)

    # Mean squared error on the held-out fold; the target column is
    # `log_trip_duration`, hence the "MSLE" label in the report below.
    scores.append(np.mean((predictions - Y_test) ** 2))


print(f'MSLE: {np.mean(scores)}')

MSLE: 0.39867805632772246
