# Taxi Fare Price Prediction

## Dataset Attributes

- key: a unique identifier for each trip
- fare_amount: the cost of each trip in USD
- pickup_datetime: date and time when meter was engaged
- passenger_count: number of passengers in the vehicle
- pickup_longitude: the longitude where the meter was engaged
- pickup_latitude: the latitude where the meter was engaged
- dropoff_longitude: the longitude where the meter was disengaged
- dropoff_latitude: the latitude where the meter was disengaged

## Objective

To predict the fare price of a ride based on the given attributes

## Approach

Before we can build a regression model to predict fares, we need to preprocess the data: handle missing values, remove duplicate rows and outliers, and encode the categorical features.

## Data Exploration

In [77]:
# importing libraries
import pandas as pd
import geopy.distance

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [28]:
# importing the dataset

# Read the raw trip records and discard the 'Unnamed: 0' and 'key' columns,
# which are not used by the rest of the notebook.
dataset = pd.read_csv('uber.csv').drop(columns = ['Unnamed: 0', 'key'])

# Column to be predicted; every other column is treated as a feature.
target = 'fare_amount'
features = [column for column in dataset.columns if column != target]

In [29]:
# check for empty elements

# Null count per column in ascending order, plus that count expressed as a
# share of all rows (rounded to 3 decimals before scaling to percent).
null_totals = dataset.isnull().sum().sort_values()
nvc = pd.DataFrame(null_totals, columns = ['Total Null Values'])
nvc['Percentage'] = (nvc['Total Null Values'] / dataset.shape[0]).round(3) * 100
print(nvc)

# Discard every row that contains at least one missing value.
dataset = dataset.dropna()

                   Total Null Values  Percentage
fare_amount                        0         0.0
pickup_datetime                    0         0.0
pickup_longitude                   0         0.0
pickup_latitude                    0         0.0
passenger_count                    0         0.0
dropoff_longitude                  1         0.0
dropoff_latitude                   1         0.0


In [30]:
# reframing the columns

# Keep only rows whose coordinates are physically valid: latitude strictly
# inside (-90, 90) and longitude strictly inside (-180, 180).
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments below do not write into a slice of the unfiltered data
# (which is what triggers pandas' SettingWithCopyWarning).
valid_coordinates = (
    (dataset['pickup_latitude'] < 90) & (dataset['dropoff_latitude'] < 90) &
    (dataset['pickup_latitude'] > -90) & (dataset['dropoff_latitude'] > -90) &
    (dataset['pickup_longitude'] < 180) & (dataset['dropoff_longitude'] < 180) &
    (dataset['pickup_longitude'] > -180) & (dataset['dropoff_longitude'] > -180)
)
dataset = dataset[valid_coordinates].copy()

# Parse the pickup timestamp and derive calendar features from it.
# NOTE(review): the parsed timestamps are not converted to a local time zone,
# so `hour`/`weekday` follow whatever zone the raw strings carry - confirm this
# is intended.
dataset['pickup_datetime'] = pd.to_datetime(dataset['pickup_datetime'])

dataset['year'] = dataset['pickup_datetime'].dt.year
dataset['month'] = dataset['pickup_datetime'].dt.month
dataset['weekday'] = dataset['pickup_datetime'].dt.weekday
dataset['hour'] = dataset['pickup_datetime'].dt.hour

# Bucket months into calendar quarters Q1..Q4.
dataset['Monthly_Quarter'] = dataset['month'].map({1:'Q1',2:'Q1',3:'Q1',4:'Q2',5:'Q2',6:'Q2',7:'Q3',
                                      8:'Q3',9:'Q3',10:'Q4',11:'Q4',12:'Q4'})
# Bucket hours into six four-hour segments: H1 = 0-3, H2 = 4-7, ..., H6 = 20-23.
dataset['Hourly_Segments'] = dataset['hour'].map({0:'H1',1:'H1',2:'H1',3:'H1',4:'H2',5:'H2',6:'H2',7:'H2',8:'H3',
                                     9:'H3',10:'H3',11:'H3',12:'H4',13:'H4',14:'H4',15:'H4',16:'H5',
                                     17:'H5',18:'H5',19:'H5',20:'H6',21:'H6',22:'H6',23:'H6'})

# Distance between pickup and drop-off in metres (`.m`), rounded to 2 decimals.
# The four coordinate columns are walked in lockstep with zip() instead of
# doing four label lookups per row; the resulting values are the same.
dataset['Distance'] = [
    round(geopy.distance.distance((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon)).m, 2)
    for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
        dataset['pickup_latitude'], dataset['pickup_longitude'],
        dataset['dropoff_latitude'], dataset['dropoff_longitude'],
    )
]

# The raw timestamp and the un-bucketed month/hour are replaced by the derived
# features above.
dataset = dataset.drop(columns = ['pickup_datetime', 'month', 'hour'])

# Independent snapshot of the engineered frame; later cells compare against it.
data = dataset.copy(deep = True)

data.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,weekday,Monthly_Quarter,Hourly_Segments,Distance
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,3,Q2,H5,1681.11
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,4,Q3,H6,2454.36
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,0,Q3,H6,5039.6
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,4,Q2,H3,1661.44
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,3,Q3,H5,4483.73


In [37]:
# checking number of unique rows in each feature
nu = dataset.drop([target], axis = 1).nunique().sort_values()

# numerical and categorical features
# A feature with at most 24 distinct values is treated as categorical (cf);
# anything with more distinct values is treated as numerical (nf).
nf = []
cf = []
nnf = 0
ncf = 0

for feature_name, distinct_count in zip(nu.index, nu.values):
    bucket = cf if distinct_count <= 24 else nf
    bucket.append(feature_name)

## Data Preprocessing

In [38]:
# removing duplicate rows

# Shape before de-duplication, taken from `data` (the snapshot copied from
# `dataset` at the end of the feature-engineering cell).
rows, cols = data.shape

dataset = dataset.drop_duplicates()

# NOTE: this cell previously also called
#     dataset.drop(['pickup_latitude', 'pickup_longitude',
#                   'dropoff_latitude', 'dropoff_longitude'], axis=1)
# without assigning the result, so it never removed anything: the coordinate
# columns stay in `dataset` and are used as features downstream. The dead
# statement is removed rather than "fixed", because the outlier-removal cell
# iterates over `nf`, which still lists those columns.
# TODO(review): if the coordinates should be excluded from the model, drop them
# here AND rebuild `nf`/`cf` afterwards.

n_duplicates = rows - dataset.shape[0]
if n_duplicates == 0:
    print("No Duplicates Found")
else:
    print("Number of Duplicates: {}".format(n_duplicates))

Number of Duplicates: 109


In [39]:
# encoding categorical values

df1 = dataset.copy()
df3 = df1.copy()

# Columns whose reported null percentage is non-zero are excluded from encoding.
ecc = nvc[nvc['Percentage']!=0].index.values
fcc = [column for column in cf if column not in ecc]

# one hot binary encoding
# The two flags make sure each heading is printed only once.
oh = True
dm = True

for column in fcc:
    level_count = df3[column].nunique()
    if level_count == 2:
        # Two levels: replace the column in place with a single indicator.
        if oh:
            print("\033[1mOne-Hot Encoding on features:\033[0m")
        print(column)
        oh = False
        df3[column] = pd.get_dummies(df3[column], drop_first=True, prefix=str(column))
    elif 2 < level_count < 17:
        # 3 to 16 levels: swap the column for its dummy columns (first level dropped).
        if dm:
            print("\n\033[1mDummy Encoding on features:\033[0m")
        print(column)
        dm = False
        remaining = df3.drop([column], axis=1)
        dummies = pd.DataFrame(pd.get_dummies(df3[column], drop_first=True, prefix=str(column)))
        df3 = pd.concat([remaining, dummies], axis=1)

df3.shape


[1mDummy Encoding on features:[0m
Monthly_Quarter
Hourly_Segments
year
weekday
passenger_count


(199878, 33)

In [41]:
# removing outliers

df1 = df3.copy()

features1 = nf

# For each numerical feature in turn, keep only rows lying within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (both bounds inclusive). The quartiles are
# recomputed on the rows that survived the previous features' filters.
for column in features1:
    lower_quartile = df1[column].quantile(0.25)
    upper_quartile = df1[column].quantile(0.75)
    spread = upper_quartile - lower_quartile
    within_fences = (
        (df1[column] <= (upper_quartile + (1.5 * spread)))
        & (df1[column] >= (lower_quartile - (1.5 * spread)))
    )
    df1 = df1[within_fences].reset_index(drop=True)

display(df1.head())

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,Distance,Monthly_Quarter_Q2,Monthly_Quarter_Q3,Monthly_Quarter_Q4,Hourly_Segments_H2,...,weekday_4,weekday_5,weekday_6,passenger_count_1,passenger_count_2,passenger_count_3,passenger_count_4,passenger_count_5,passenger_count_6,passenger_count_208
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1681.11,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,7.7,-73.994355,40.728225,-73.99471,40.750325,2454.36,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
2,12.9,-74.005043,40.74077,-73.962565,40.772647,5039.6,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,1661.44,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,4.9,-73.969019,40.75591,-73.969019,40.75591,0.0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0


## Data Manipulation

In [71]:
# splitting data into train and test sets

# Make the column names whitespace-free.
df1.columns = [name.replace(' ', '_') for name in df1.columns]

X = df1.drop([target], axis = 1)
y = df1[target]

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 100)

# Reset the index of all four pieces, not just train_X. The scaling cell
# rebuilds the feature frames with a fresh 0..n-1 index, so leaving the
# targets on their shuffled original labels would make any index-aligned
# pandas operation (concat, subtraction, ...) pair the wrong rows.
train_X = train_X.reset_index(drop = True)
test_X = test_X.reset_index(drop = True)
train_y = train_y.reset_index(drop = True)
test_y = test_y.reset_index(drop = True)

In [72]:
# feature scaling

# The scaler is fitted on the training features only and then re-used,
# unchanged, on the test features. Both results are wrapped back into
# DataFrames carrying the original feature names.
scaler = StandardScaler()

train_scaled = scaler.fit_transform(train_X)
test_scaled = scaler.transform(test_X)

train_X = pd.DataFrame(train_scaled, columns = X.columns)
test_X = pd.DataFrame(test_scaled, columns = X.columns)

## Regression Modelling

In [73]:
# linear regression

# Fit on the scaled training data, then score on the held-out test set.
mlr = LinearRegression()
mlr.fit(train_X, train_y)
y_pred = mlr.predict(test_X)

mlr_r2 = r2_score(test_y, y_pred)
print("R2 Score: ", mlr_r2)

R2 Score:  0.3294397974348473


In [74]:
# ridge regression

# Ridge with its default settings, fitted and scored like the other models.
ridge = Ridge()
ridge.fit(train_X, train_y)
y_pred = ridge.predict(test_X)

ridge_r2 = r2_score(test_y, y_pred)
print("R2 Score: ", ridge_r2)

R2 Score:  0.3294398074715096


In [75]:
# lasso regression

# Lasso with its default settings, fitted and scored like the other models.
lasso = Lasso()
lasso.fit(train_X, train_y)
y_pred = lasso.predict(test_X)

lasso_r2 = r2_score(test_y, y_pred)
print("R2 Score: ", lasso_r2)

R2 Score:  0.26226052163439806


In [76]:
# elastic-net regression

# ElasticNet with its default settings, fitted and scored like the other models.
enr = ElasticNet()
enr.fit(train_X, train_y)
y_pred = enr.predict(test_X)

enr_r2 = r2_score(test_y, y_pred)
print("R2 Score: ", enr_r2)

R2 Score:  0.23952090035803697
