# Import Packages

In [6]:
# import necessary library for training
import numpy as np 
import pandas as pd 
from subprocess import check_output
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt

# Import Data

In [7]:
# S3 location of the trip-records CSV.
bucket = 'test2642022'
data_key = 'yellow_tripdata_2021-01.csv'
data_location = f's3://{bucket}/{data_key}'

# store_and_fwd_flag is read explicitly as str; the remaining dtypes are inferred by pandas.
df = pd.read_csv(data_location, dtype={"store_and_fwd_flag": str})
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2.0,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5
1,1.0,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2.0,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0
2,1.0,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1.0,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1.0,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1.0,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2.0,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1.0,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5


In [8]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1369765, 18)

In [4]:
# Per-column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1369765 entries, 0 to 1369764
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   VendorID               1271413 non-null  float64
 1   tpep_pickup_datetime   1369765 non-null  object 
 2   tpep_dropoff_datetime  1369765 non-null  object 
 3   passenger_count        1271413 non-null  float64
 4   trip_distance          1369765 non-null  float64
 5   RatecodeID             1271413 non-null  float64
 6   store_and_fwd_flag     1271413 non-null  object 
 7   PULocationID           1369765 non-null  int64  
 8   DOLocationID           1369765 non-null  int64  
 9   payment_type           1271413 non-null  float64
 10  fare_amount            1369765 non-null  float64
 11  extra                  1369765 non-null  float64
 12  mta_tax                1369765 non-null  float64
 13  tip_amount             1369765 non-null  float64
 14  tolls_amount      

In [5]:
# Count the missing values in each column.
df.isnull().sum()

VendorID                 98352
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          98352
trip_distance                0
RatecodeID               98352
store_and_fwd_flag       98352
PULocationID                 0
DOLocationID                 0
payment_type             98352
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge         0
dtype: int64

In [9]:
# Drop every row that contains at least one missing value (dropna defaults).
df=df.dropna()

In [10]:
# Keep only the trips whose total_amount is strictly positive.
has_positive_total = df['total_amount'] > 0
df = df[has_positive_total]

# Feature Engineering

In [11]:
# Parse the pickup/dropoff timestamps so they can be subtracted.
for ts_col in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    df[ts_col] = df[ts_col].astype('datetime64[ns]')

# Treat the coded columns as categorical (object) rather than numeric.
for coded_col in ('VendorID', 'RatecodeID', 'payment_type'):
    df[coded_col] = df[coded_col].astype('object')

# Still a timedelta at this point; it is converted to minutes in the following cell.
df['ride_duration_min'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']

In [12]:
# Trip duration in minutes, computed directly from the two timestamp columns.
# The previous form divided ride_duration_min by a timedelta in place, so
# re-running this cell raised a TypeError (the column was already float).
# Recomputing from the timestamps gives the same values and is safe to re-run.
df['ride_duration_min'] = (
    (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
    / np.timedelta64(1, 'm')
)

In [13]:
# Re-inspect dtypes and non-null counts after the casts and the new duration column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1264309 entries, 0 to 1271412
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   VendorID               1264309 non-null  object        
 1   tpep_pickup_datetime   1264309 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  1264309 non-null  datetime64[ns]
 3   passenger_count        1264309 non-null  float64       
 4   trip_distance          1264309 non-null  float64       
 5   RatecodeID             1264309 non-null  object        
 6   store_and_fwd_flag     1264309 non-null  object        
 7   PULocationID           1264309 non-null  int64         
 8   DOLocationID           1264309 non-null  int64         
 9   payment_type           1264309 non-null  object        
 10  fare_amount            1264309 non-null  float64       
 11  extra                  1264309 non-null  float64       
 12  mta_tax                12643

In [14]:
# One-hot encode the categorical columns in a single call; the dummy columns
# are appended in the order the source columns are listed here.
categorical_cols = ['RatecodeID', 'payment_type', 'VendorID', 'store_and_fwd_flag']
df = pd.get_dummies(df, columns=categorical_cols)

In [15]:
# Remove the raw timestamps and the pickup/dropoff location IDs before modelling.
unused_cols = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "PULocationID",
    "DOLocationID",
]
df.drop(columns=unused_cols, inplace=True)
df.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,...,RatecodeID_6.0,RatecodeID_99.0,payment_type_1.0,payment_type_2.0,payment_type_3.0,payment_type_4.0,VendorID_1.0,VendorID_2.0,store_and_fwd_flag_N,store_and_fwd_flag_Y
0,1.0,2.1,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,...,0,0,0,1,0,0,1,0,1,0
1,1.0,0.2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,...,0,0,0,1,0,0,1,0,1,0
2,1.0,14.7,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,...,0,0,1,0,0,0,1,0,1,0
3,0.0,10.6,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,...,0,0,1,0,0,0,1,0,1,0
4,1.0,4.94,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,...,0,0,1,0,0,0,0,1,1,0


# Train Test Split

In [16]:
# Features: every column except the target. The axis is given by keyword
# because the positional form drop('total_amount', 1) is deprecated since
# pandas 1.3 and raises TypeError on pandas >= 2.0.
# NOTE(review): fare_amount, extra, mta_tax, tip_amount, etc. stay in x and
# appear to add up to total_amount in the sample rows shown above. This looks
# like target leakage; confirm it is intended.
x = df.drop(columns='total_amount')

# Target: the total_amount column.
y = df['total_amount']

In [17]:
# Hold out 20% of the rows as the test set; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [18]:
# Report the dimensions of each split.
for label, part in (
    ("Shape of X_train: ", x_train),
    ("Shape of X_test: ", x_test),
    ("Shape of y_train: ", y_train),
    ("Shape of y_test", y_test),
):
    print(label, part.shape)

Shape of X_train:  (1011447, 25)
Shape of X_test:  (252862, 25)
Shape of y_train:  (1011447,)
Shape of y_test (252862,)


# Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression

# Baseline model: a linear regression fitted on the training split.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [20]:
# Predict total_amount for the held-out rows with the fitted linear model.
y_pred_test = reg.predict(x_test)

In [21]:
# Test-set mean squared error of the linear model.
mean_squared_error(y_test,y_pred_test)

0.08200381764530851

# Regression Tree

In [22]:
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

# Seeded so the fitted tree, and therefore the reported test MSE, is
# reproducible across runs; 42 matches the seed used for the train/test split.
dt = DecisionTreeRegressor(random_state=42)

# Fit on the training split, then predict the held-out rows.
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)

In [23]:
# Test-set mean squared error of the regression tree.
mean_squared_error(y_test,y_pred_dt)

5.974817455049594

In [3]:
# In addition to the basic models, we can also perform hyperparameter tuning to get better results.
# This can be a future enhancement for our project.
# I have written the code but left it commented out, as it takes too long to run in the trial version of SageMaker.

In [None]:
# Preview the first rows of the feature matrix.
x.head()

In [4]:
#Function to perform nested Cross Validation

In [5]:
#from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

In [6]:

# Disabled helper: repeated nested cross-validation around a grid search.
# NOTE(review): scoring='accuracy' below is a classification metric, while the
# estimators passed to this helper in later cells are regressors and the final
# print reports "R2". Switch to a regression scorer (e.g. 'r2' or
# 'neg_mean_squared_error') before enabling this code.
# NOTE(review): enabling it also requires the commented-out
# GridSearchCV / cross_val_score / KFold import.
#def hp_tuning (estimator, p_grid, X_train, Y_train): 
#    NUM_TRIALS = 5
#    non_nested_scores = np.zeros(NUM_TRIALS) 
#    nested_scores = np.zeros(NUM_TRIALS)
#    for i in range(NUM_TRIALS):
    # Choose cross-validation techniques for the inner and outer loops, # independently of the dataset.
    # E.g "LabelKFold", "LeaveOneOut", "LeaveOneLabelOut", etc. 
    
#        inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
#        outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
#        # Non_nested parameter search and scoring
#        clf = GridSearchCV(estimator=estimator, param_grid=p_grid, cv=inner_cv, scoring='accuracy') 
#        clf.fit(X_train, Y_train)
#        non_nested_scores[i] = clf.best_score_
            # Nested CV with parameter optimization
#        nested_score = cross_val_score(clf, X=X_train, y=Y_train, cv=outer_cv) 
#        nested_scores[i] = nested_score.mean()
#    score_difference = non_nested_scores - nested_scores
#    print("Average R2 of {0:6f} with std. dev. of {1:6f}." .format(nested_scores.mean(), nested_scores.std()))


In [7]:
# SVM Regression

In [8]:
#from sklearn.svm import SVR
#p_grid = {"C": [1/100, 1/10, 1, 10, 100],
          #"gamma": [.01, .1]}
#        "kernel": ["linear", "poly", "rbf", "sigmoid"]} 
#svm = SVR()
#Calling the function to perform nested cv (hyper parameter tuning also)
#hp_tuning (svm, p_grid, x_train, y_train)

In [9]:
# Regression Tree

In [10]:
#from sklearn import tree
#from sklearn.tree import DecisionTreeRegressor 

#dt = DecisionTreeRegressor()

#p_grid = {'max_depth': list(np.arange(2, 10)),
#          'criterion': ["squared_error", "friedman_mse", "poisson"]}
#Calling the function to perform nested cv (hyper parameter tuning also)
# The splits in this notebook are named x_train / y_train (lowercase), so the call uses those names.
#hp_tuning (dt, p_grid, x_train, y_train)

In [11]:
# Gradient Boosting

In [12]:
#from sklearn import ensemble
#import numpy as np

#p_grid = {'n_estimators': [10, 20, 50],
#    'loss': ["squared_error", "huber", "absolute_error"], 'max_depth': list(np.arange(2, 10))}
#gb = ensemble.GradientBoostingRegressor() 
# The splits in this notebook are named x_train / y_train (lowercase), so the call uses those names.
#hp_tuning (gb, p_grid, x_train, y_train)

In [13]:
#### We can compare the results of the nested CV and choose the model with the lowest MSE as the best model.
#### After that, we perform hyperparameter tuning on the chosen model to get the best hyperparameters;
#### then we can predict the taxi fare for the test dataset using those hyperparameters.