## Model training & saving
Using Joblib, the trained model and its preprocessors are saved to disk so that they can later be loaded by the API application.

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import numpy as np
import pandas as pd 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump
import pathlib
import os
import shutil

# Estimator and preprocessing steps shared by the whole training run
clf = SGDRegressor()
poly = PolynomialFeatures(degree=9, include_bias=False)
scale = MinMaxScaler()

# Hold-out data used to report the r2 score after every training chunk
test = pd.read_csv('test.csv')

test_features = test[['OrigLat', 'OrigLon', 'DestLat', 'DestLon', 'pickup_time', 'Distance']]
X_test = test_features.values

test_target = test['travel_time']
y_test = test_target.values

# Reference timestamps (whole seconds) for the per-chunk and total timers
start = last = datetime.datetime.now().replace(microsecond=0)

# Fit the scaler and the polynomial expansion, then transform the test set.
# NOTE(review): both preprocessors are fitted on the test data here and are
# only applied (transform) to the training chunks later -- confirm that this
# is intended.
X_test_scaled = scale.fit_transform(X_test)
X_test_poly = poly.fit_transform(X_test_scaled)

# Start every run from an empty "model" directory next to the notebook.
parent = pathlib.Path().resolve()
model_dir = parent / "model"
if model_dir.exists():
    # Drop checkpoints left over from a previous run
    shutil.rmtree(model_dir)
# Always (re)create the directory: previously it was only created when it
# had existed before, so the first checkpoint of a fresh run had no parent
# directory to be written into.
model_dir.mkdir()

# Train the model incrementally, streaming the training data in chunks so
# the whole CSV never has to be held in memory at once.
chunksize = 10 ** 5
feature_columns = ['OrigLat', 'OrigLon', 'DestLat', 'DestLon', 'pickup_time', 'Distance']

# Checkpoints go to <parent>/model/model_<i>/. The path is built with pathlib
# instead of hand-written backslashes, which only worked on Windows and
# relied on invalid string escapes such as "\m".
model_root = pathlib.Path(parent) / "model"

with pd.read_csv('cleaned.csv', chunksize=chunksize) as reader:
    for i, chunk in enumerate(reader):
        print(f"Iteration: {i} ", end='')

        # Remove any corrupt rows
        chunk = chunk.dropna()
        print(".", end='')

        if chunk.empty:
            # partial_fit raises on an empty batch, so skip chunks in which
            # every row was dropped.
            print(" - skipped (no valid rows)")
            continue

        # Split the chunk into features and target
        X_train = chunk[feature_columns].values
        y_train = chunk['travel_time'].values
        print(".", end='')

        # Apply the already-fitted scaler and polynomial expansion
        X_train_poly = poly.transform(scale.transform(X_train))
        print(".", end='')

        # Update the model with this chunk only
        clf.partial_fit(X_train_poly, y_train)
        print(".", end='')

        # Report time spent on this chunk and the current r2 on the test set
        now = datetime.datetime.now().replace(microsecond=0)
        r2 = clf.score(X_test_poly, y_test)
        print(f" - {now - last} elapsed - {r2} r2 ", end='')

        # Export model and preprocessors to file every 10 iterations
        if i % 10 == 0:
            print(" - saving", end='')
            checkpoint_dir = model_root / f"model_{i}"
            # parents=True also creates "model" itself when it is missing;
            # exist_ok=True tolerates a checkpoint directory left behind by
            # an earlier run.
            checkpoint_dir.mkdir(parents=True, exist_ok=True)

            dump(clf, checkpoint_dir / 'nyc_taxi_model.joblib')
            dump(poly, checkpoint_dir / 'nyc_taxi_features.joblib')
            dump(scale, checkpoint_dir / 'nyc_taxi_scale.joblib')
            print(" - done", end='')

        # Prepare for next chunk
        print(f" - {now - start} elapsed total")
        last = now

Iteration: 0 .... - 0:00:06 elapsed - -0.3184021191872817 r2  - saving - done - 0:00:06 elapsed total
Iteration: 1 .... - 0:00:05 elapsed - -0.0731930850787077 r2  - 0:00:11 elapsed total
Iteration: 2 .... - 0:00:05 elapsed - -0.0976453423925503 r2  - 0:00:16 elapsed total
Iteration: 3 .... - 0:00:05 elapsed - -0.06169649350162576 r2  - 0:00:21 elapsed total
Iteration: 4 .... - 0:00:04 elapsed - -0.14903170966807888 r2  - 0:00:25 elapsed total
Iteration: 5 .... - 0:00:05 elapsed - -0.06597072351691491 r2  - 0:00:30 elapsed total
Iteration: 6 .... - 0:00:04 elapsed - -0.1753918696769945 r2  - 0:00:34 elapsed total
Iteration: 7 .... - 0:00:05 elapsed - -0.11048094669929331 r2  - 0:00:39 elapsed total
Iteration: 8 .... - 0:00:05 elapsed - -0.11369828768094536 r2  - 0:00:44 elapsed total
Iteration: 9 .... - 0:00:04 elapsed - -0.13635445348190056 r2  - 0:00:48 elapsed total
Iteration: 10 .... - 0:00:05 elapsed - -0.1661455693445646 r2  - saving - done - 0:00:53 elapsed total
Iteration: 11 .