In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
tripdata_1 = pd.read_csv('./src/yellow_tripdata_2015-01.csv')
tripdata_2 = pd.read_csv('./src/yellow_tripdata_2016-01.csv')
tripdata_3 = pd.read_csv('./src/yellow_tripdata_2016-02.csv')
tripdata_4 = pd.read_csv('./src/yellow_tripdata_2016-03.csv')

tripdata = pd.concat([tripdata_1, tripdata_2, tripdata_3, tripdata_4], axis=0)
del tripdata_1, tripdata_2, tripdata_3, tripdata_4

In [3]:
tripdata.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,RatecodeID
0,2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-73.993896,40.750111,1.0,N,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05,
1,1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-74.001648,40.724243,1.0,N,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8,
2,1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-73.963341,40.802788,1.0,N,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,
3,1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-74.009087,40.713818,1.0,N,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,
4,1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-73.971176,40.762428,1.0,N,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,


In [4]:
tripdata.isna().sum()

VendorID                        0
tpep_pickup_datetime            0
tpep_dropoff_datetime           0
passenger_count                 0
trip_distance                   0
pickup_longitude                0
pickup_latitude                 0
RateCodeID               34499859
store_and_fwd_flag              0
dropoff_longitude               0
dropoff_latitude                0
payment_type                    0
fare_amount                     0
extra                           0
mta_tax                         0
tip_amount                      0
tolls_amount                    0
improvement_surcharge           3
total_amount                    0
RatecodeID               12748986
dtype: int64

In [5]:
tripdata = tripdata.drop(['RateCodeID', 'RatecodeID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)
tripdata['improvement_surcharge'] = tripdata['improvement_surcharge'].fillna(np.mean(tripdata['improvement_surcharge']))
tripdata = pd.concat([tripdata, pd.get_dummies(tripdata['store_and_fwd_flag'])], axis=1)
tripdata = tripdata.drop('store_and_fwd_flag', axis=1)

tripdata.head(10)

Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,N,Y
0,2,1,1.59,-73.993896,40.750111,-73.974785,40.750618,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05,1,0
1,1,1,3.3,-74.001648,40.724243,-73.994415,40.759109,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8,1,0
2,1,1,1.8,-73.963341,40.802788,-73.95182,40.824413,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8,1,0
3,1,1,0.5,-74.009087,40.713818,-74.004326,40.719986,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8,1,0
4,1,1,3.0,-73.971176,40.762428,-74.004181,40.742653,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3,1,0
5,1,1,9.0,-73.874374,40.774048,-73.986977,40.758194,1,27.0,0.5,0.5,6.7,5.33,0.3,40.33,1,0
6,1,1,2.2,-73.983276,40.726009,-73.99247,40.749634,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3,1,0
7,1,3,0.8,-74.002663,40.734142,-73.99501,40.726326,1,7.0,0.5,0.5,1.66,0.0,0.3,9.96,1,0
8,1,3,18.2,-73.783043,40.644356,-73.987595,40.759357,2,52.0,0.0,0.5,0.0,5.33,0.3,58.13,1,0
9,1,2,0.9,-73.985588,40.767948,-73.985916,40.759365,1,6.5,0.5,0.5,1.55,0.0,0.3,9.35,1,0


# Прогон модели

In [6]:
X = tripdata.drop('total_amount', axis=1)
y = tripdata['total_amount']
del tripdata

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
del X, y

In [7]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge_model', Ridge())
])

param_grid = {
    'ridge_model__alpha': np.linspace(1, 200, 50),
    'ridge_model__max_iter': [250, 500, 1000]
}

splitter = KFold(n_splits=7)

search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=splitter,
    scoring='neg_mean_squared_error',
    verbose=2
)

In [9]:
search.fit(X_train, y_train)

Fitting 7 folds for each of 150 candidates, totalling 1050 fits
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  34.5s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  16.0s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  16.1s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  16.0s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  16.0s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  16.1s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=250; total time=  16.0s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=500; total time=  16.0s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=500; total time=  16.1s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=500; total time=  16.0s
[CV] END ..ridge_model__alpha=1.0, ridge_model__max_iter=500; total time=  16.2s
[CV] END ..ridge_model__alpha=1.0, ridge_mode

In [9]:
pkl_filename = './model/pkl_model.pkl'

# with open(pkl_filename, 'wb') as file:
    # pickle.dump(search, file)

In [None]:
pkl_filename = './model/pkl_model.pkl'
loaded_model = pickle.load(open(pkl_filename, 'rb'))

print(f'Model score: {-loaded_model.score(X_test, y_test)}')
loaded_model.best_estimator_

In [11]:
pred = loaded_model.predict(X_test)

In [12]:
pd.concat(
    [
        pd.DataFrame(pred[:1000]).reset_index(drop=True),
        pd.DataFrame(y_test[:1000]).reset_index(drop=True)
    ],
    axis=1
)

Unnamed: 0,0,total_amount
0,6.959873,6.96
1,7.801480,7.80
2,9.300005,9.30
3,8.300391,8.30
4,6.201489,6.20
...,...,...
995,11.159742,11.16
996,26.151280,26.15
997,32.851348,32.85
998,5.798697,5.80
