In [9]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle

In [10]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

#### Q1. Read the data for January. How many records are there?

In [11]:
df = pq.read_table(source='fhv_tripdata_2021-01.parquet').to_pandas()

In [12]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [13]:
df.shape

(1154112, 7)

#### Q.2 What's the average trip duration in January?

In [14]:
 df['duration'] = df['dropOff_datetime'] -  df['pickup_datetime']

In [15]:
df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)

In [16]:
np.mean(df['duration'])

19.167224093791006

#### Data preparation
Check the distribution of the duration variable. There are some outliers.

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

How many records did you drop?

In [17]:
df.duration.describe(percentiles = [0.95, 0.98, 0.99])

count   1154112.00000
mean         19.16722
std         398.69216
min           0.01667
50%          13.40000
95%          47.25000
98%          66.13333
99%          90.30000
max      423371.05000
Name: duration, dtype: float64

In [18]:
((df.duration>=1) & (df.duration<=60)).mean()
#96% data is between 1 min and 60 mins: It will be good to focus on them

0.9616276409915155

In [19]:
df_60 = df[(df.duration>=1) & (df.duration<=60)]

In [20]:
df.shape[0] - df_60.shape[0]

44286

#### Q3. Missing values
The features we'll use for our model are the pickup and dropoff location IDs.

But they have a lot of missing values there. Let's replace them with "-1".

What's the fractions of missing values for the pickup location ID? I.e. fraction of "-1"s after you filled the NAs.

In [21]:
df_60 ['PUlocationID'] = df_60['PUlocationID'].fillna(-1)
df_60 ['DOlocationID'] = df_60['DOlocationID'].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
df_60['PUlocationID'] == -1

0           True
1           True
3           True
4           True
5           True
           ...  
1154107    False
1154108    False
1154109    False
1154110    False
1154111     True
Name: PUlocationID, Length: 1109826, dtype: bool

In [23]:
(df_60[df_60['PUlocationID'] == -1].shape[0]/df_60.shape[0])*100

83.52732770722618

#### Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

 - Turn the dataframe into a list of dictionaries
 - Fit a dictionary vectorizer
 - Get a feature matrix from it
 - What's the dimensionality of this matrix? (The number of columns).

        - 2
        - 152
        - 352
        - 525
        - 725

In [24]:
categorical = ['PUlocationID', 'DOlocationID']

In [25]:
df_60[categorical] = df_60[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [26]:
train_dict = df_60[categorical].to_dict(orient = 'records')

In [27]:
dv = DictVectorizer()

In [28]:
X_train = dv.fit_transform(train_dict)

In [29]:
len(dv.feature_names_)

525

##### Now let's use the feature matrix from the previous step to train a model.

Train a plain linear regression model with default parameters
Calculate the RMSE of the model on the training data
What's the RMSE on train?

    - 5.52
    - 10.52
    - 15.52
    - 20.52

In [30]:
y_train = df_60.duration

In [31]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [32]:
y_pred = lr.predict(X_train)

In [33]:
mse  = mean_squared_error(y_train, y_pred)
mse**.5

10.528519107223724

###### Saving File

In [37]:
with open( "dict_vectorizer.p", "wb" ) as f:
    pickle.dump(dv, f)

In [38]:
with open( "model.p", "wb" ) as f:
    pickle.dump( lr, f)