# NYC Taxi Fare Prediction - Data Cleaning

In [2]:
# Importing Libraries

import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from geopy.distance import geodesic
from pyproj import Geod
import seaborn as sns
from math import sin, cos, sqrt, atan2, radians
import dask.dataframe as dd

In [None]:
# Load the raw training set from train.csv (path is relative to the working directory)

train_data = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Visually inspecting the data: first rows, dimensions, column summary,
# descriptive statistics and the number of nulls per column

print(train_data.head())
print(train_data.shape)
# info() writes its report itself and returns None, so it is called without
# print() -- wrapping it in print() emitted an extra "None" line.
train_data.info()
print(train_data.describe())
print(train_data.isnull().sum())

In [None]:
# Frequency of each passenger_count value; dropna=False keeps missing values as their own entry

train_data['passenger_count'].value_counts(dropna=False)

In [None]:
# Frequency of each pickup_datetime value; dropna=False keeps missing values as their own entry

train_data['pickup_datetime'].value_counts(dropna=False)

In [None]:
# Frequency of each fare_amount value; dropna=False keeps missing values as their own entry

train_data['fare_amount'].value_counts(dropna=False)

In [None]:
# Display the rows whose fare_amount is negative

train_data.loc[train_data['fare_amount'] < 0]

In [None]:
# Discard every row that has at least one missing value,
# printing the row count before and after

print('Old size: %d' % train_data.shape[0])
train_data = train_data.dropna(axis = 'rows', how = 'any')
print('New size: %d' % train_data.shape[0])

In [None]:
# Keep only rides with at least one passenger (rows with passenger_count below 1 are dropped),
# printing the row count before and after

print('Old size: %d' % train_data.shape[0])
train_data = train_data.loc[train_data['passenger_count'] >= 1]
print('New size: %d' % train_data.shape[0])

In [None]:
# Keep only rides with at most 6 passengers,
# printing the row count before and after

print('Old size: %d' % train_data.shape[0])
train_data = train_data.loc[train_data['passenger_count'] <= 6]
print('New size: %d' % train_data.shape[0])

In [None]:
# Keep only rides whose fare_amount is at least 1.0; this removes zero fares,
# negative fares and any other fare below 1.0. Row count is printed before and after.

print('Old size: %d' % train_data.shape[0])
train_data = train_data.loc[train_data['fare_amount'] >= 1.0]
print('New size: %d' % train_data.shape[0])

In [None]:
# Removing rows where pickup_longitude, pickup_latitude, dropoff_longitude
# or dropoff_latitude is 0
#
# The mask is built from the four coordinate columns only. Comparing the whole
# frame to 0 would also test every other column, so any unrelated column that
# happened to hold a 0 would silently discard the row.

print('Old size: %d' % len(train_data))
coordinate_columns = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
train_data = train_data[(train_data[coordinate_columns] != 0).all(axis=1)]
print('New size: %d' % len(train_data))

In [None]:
# Keep only rides whose fare_amount does not exceed 1000.00,
# printing the row count before and after

print('Old size: %d' % train_data.shape[0])
train_data = train_data.loc[train_data['fare_amount'] <= 1000.00]
print('New size: %d' % train_data.shape[0])

In [None]:
# Write the cleaned training data to train_cleaned.csv.
# index=True is the to_csv default, spelled out here: the row index is written
# as an extra first column without a header name.

train_data.to_csv('train_cleaned.csv', index=True)

In [None]:
# dropping the Unnamed column
#
# NOTE(review): train_data here comes from train.csv and is never reloaded from
# train_cleaned.csv in this notebook, so an 'Unnamed: 0' column is presumably
# only present when the frame was read back from a CSV saved with its index.
# errors='ignore' makes the drop a no-op when the column is absent instead of
# raising KeyError and stopping a top-to-bottom run.

train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Calculate Distance between pickup and drop-off points

# The distance formula works on angles in radians, so add a radians version of
# every coordinate column. np.radians converts a whole column in one vectorized
# call instead of invoking math.radians once per row through .apply.
train_data['pickup_latitude_radians'] = np.radians(train_data['pickup_latitude'])
train_data['pickup_longitude_radians'] = np.radians(train_data['pickup_longitude'])
train_data['dropoff_latitude_radians'] = np.radians(train_data['dropoff_latitude'])
train_data['dropoff_longitude_radians'] = np.radians(train_data['dropoff_longitude'])

# Short aliases for the radian columns (the earlier degree-valued assignments to
# these names were overwritten before being used, so they have been removed).
lat1 = train_data['pickup_latitude_radians']
long1 = train_data['pickup_longitude_radians']
lat2 = train_data['dropoff_latitude_radians']
long2 = train_data['dropoff_longitude_radians']

# Difference between drop-off and pickup, in radians
train_data['dlon'] = long2 - long1
train_data['dlat'] = lat2 - lat1

# Radius handed to distance(); the result is expressed in the same unit.
# NOTE(review): presumably an approximate Earth radius in kilometres -- confirm.
R = 6373.0
def distance(dlat,dlon,lat1,lat2,R):
    """Haversine distance between two points on a sphere of radius R.

    Built on numpy ufuncs, so every angular argument may be a plain float, a
    numpy array or a pandas Series (array-likes are processed element-wise);
    the previous math-module version only accepted scalars.

    Parameters
    ----------
    dlat : latitude difference between the two points, in radians.
    dlon : longitude difference between the two points, in radians.
    lat1 : latitude of the first point, in radians.
    lat2 : latitude of the second point, in radians.
    R    : sphere radius; the result is expressed in the same unit as R.

    Returns
    -------
    R multiplied by the central angle between the points, with the same shape
    as the inputs (a numpy float for scalar inputs).
    """
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1 for (near-)antipodal points,
    # which would make sqrt(1 - a) undefined; cap it at 1.
    a = np.minimum(a, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [None]:
# Distance of every ride, stored in the 'result' column (same unit as R).
# The previous line closed the distance(...) call after its third argument and
# handed the remaining values to DataFrame.map, so it raised a TypeError.
# distance() is called once per row with all five arguments, which works
# whether it accepts only scalars or also whole columns.
train_data['result'] = [
    distance(d_lat, d_lon, lat_from, lat_to, R)
    for d_lat, d_lon, lat_from, lat_to in zip(
        train_data['dlat'],
        train_data['dlon'],
        train_data['pickup_latitude_radians'],
        train_data['dropoff_latitude_radians'],
    )
]

# NYC Taxi Fare Prediction - Data Story

In [None]:
# Fare amount (x) against passenger count (y), drawn as unconnected point markers

plt.plot(train_data['fare_amount'], train_data['passenger_count'], linestyle='none', marker='.')
plt.xlabel('Fare Amount')
plt.ylabel('Passenger_Count')
# Render the figure
plt.show()

In [None]:
# Fare amount (x) against passenger count (y), drawn as unconnected point markers.
# NOTE(review): this cell draws the same plot as the cell directly before it.

plt.plot(train_data['fare_amount'], train_data['passenger_count'], linestyle='none', marker='.')
plt.xlabel('Fare Amount')
plt.ylabel('Passenger_Count')
# Render the figure
plt.show()

In [None]:
# Barplot of passenger_count (y) for each fare_amount (x)

# Switch seaborn to its "whitegrid" style before drawing
sns.set(style="whitegrid")
# NOTE(review): fare_amount is on the x axis, so there is presumably one bar per
# distinct fare value -- confirm the axes are not meant to be swapped.
ax = sns.barplot(data=train_data, x="fare_amount", y="passenger_count")