In [1]:
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.metrics import mean_squared_error

In [5]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps:
      1. Read the parquet file at ``filename``.
      2. Add ``trip_duration`` in minutes, computed as
         ``tpep_dropoff_datetime - tpep_pickup_datetime``.
      3. Keep only trips lasting between 1 and 60 minutes (both inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Args:
        filename: path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        A new DataFrame holding the filtered rows with the extra
        ``trip_duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes; avoids a Python-level call per row.
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['trip_duration'] = elapsed.dt.total_seconds() / 60.0

    # Explicit copy: the filtered frame is modified below, and writing into a
    # slice of the original would raise SettingWithCopyWarning.
    in_range = (df['trip_duration'] >= 1.0) & (df['trip_duration'] <= 60.0)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype('str')
    return df

In [6]:
# Training set: loaded and preprocessed by read_dataframe (adds trip_duration,
# filters to 1-60 minute trips, casts the location IDs to strings).
df_train = read_dataframe('./data/yel_tripdata_01.parquet')

In [7]:
# Validation set: a different file run through the same read_dataframe
# preprocessing as the training set.
df_valid = read_dataframe('./data/yel_tripdata_02.parquet')

In [8]:
# Summary statistics of the training target, including the upper-tail
# percentiles, rounded to two decimals for display.
df_train['trip_duration'].describe(percentiles=[0.95, 0.98, 0.99]).round(2)

count    2421440.00
mean          12.67
std            9.00
min            1.00
50%           10.23
95%           31.05
98%           39.72
99%           45.68
max           60.00
Name: trip_duration, dtype: float64

In [9]:
# Feature columns passed to the vectorizer; read_dataframe has already cast
# both of them to strings.
categorical = ['PULocationID', 'DOLocationID']

In [10]:
# Single shared vectorizer: vectorize() fits it on the training rows and then
# reuses the same fitted instance for the validation rows.
dv = DictVectorizer()

In [11]:
def vectorize(subset, valid=False, vectorizer=None):
    """Convert the rows of ``subset`` into a feature matrix.

    Each row is turned into a ``{column: value}`` dict and handed to the
    vectorizer.

    Args:
        subset: DataFrame restricted to the feature columns.
        valid: if falsy (default), the vectorizer is fitted on ``subset`` and
            applied to it (``fit_transform``); if truthy, the vectorizer is
            only applied (``transform``), so it must already be fitted.
        vectorizer: object exposing ``fit_transform`` and ``transform``.
            Defaults to the notebook-level ``dv`` instance.

    Returns:
        Whatever the vectorizer's ``fit_transform`` / ``transform`` returns.
    """
    if vectorizer is None:
        # Fall back to the notebook-level instance so existing calls still work.
        vectorizer = dv

    records = subset.to_dict(orient='records')
    if valid:
        return vectorizer.transform(records)
    return vectorizer.fit_transform(records)

In [12]:
# Default valid=False: the vectorizer is fitted on the training features here.
vectorized_train = vectorize(df_train[categorical])

In [13]:
# valid=True: only transform with the vectorizer fitted on the training rows,
# so it is not refitted on validation data.
vectorized_valid = vectorize(df_valid[categorical], valid=True)

In [14]:
# Conventional X_* names for the vectorized feature matrices.
X_train, X_valid = vectorized_train, vectorized_valid

In [15]:
# Regression target: the trip duration in minutes computed by read_dataframe.
target = 'trip_duration'
y_train, y_valid = df_train[target], df_valid[target]

In [16]:
# fit() returns the estimator itself, so construction and training can be
# chained; the assignment also keeps the cell from echoing the model repr.
lr = LinearRegression().fit(X_train, y_train)

In [17]:
# Predictions on both splits from the fitted linear model.
y_pred_train, y_pred_valid = lr.predict(X_train), lr.predict(X_valid)

In [18]:
# RMSE on each split, in the same unit as trip_duration (minutes).
# The square root is taken explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on both old and new releases.
rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
rmse_valid = mean_squared_error(y_valid, y_pred_valid) ** 0.5
print('RMSE train:', round(rmse_train, 2))
print('RMSE valid:', round(rmse_valid, 2))

RMSE train: 6.99
RMSE valid: 7.79
