# NYC_Taxi

## 1. Data Loading

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as snb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline
# Global plot defaults: 16x8 figures with larger title and axis-label fonts.
snb.set({'figure.figsize':(16,8), 'axes.titlesize':30, 'axes.labelsize':20})
# Matplotlib-only alternative for the title size (kept for reference): mpl.rcParams['axes.titlesize'] = 20

In [None]:
# Relative path of the training CSV (one directory up, under input/nyc-taxi-duration).
TRAINFILEPATH = os.path.join('..', 'input', 'nyc-taxi-duration', 'train.csv')

In [None]:
# Read the training data, using the first CSV column as the DataFrame index.
df = pd.read_csv(TRAINFILEPATH, index_col=0)

## 2. Data Exploration

**a- First steps in dataset exploration**

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

The dataset is complete. There is no missing data.

**b- Feature visualization**

In [None]:
# Share of trips per passenger count. value_counts(normalize=True) returns
# fractions in [0, 1], so scale by 100 to match the "Percentage" axis label.
ax = df['passenger_count'].value_counts(normalize=True).mul(100).plot.bar()
ax.set_ylabel("Percentage")
ax.set_xlabel("Passenger count")
ax.set_title("Repartition of passenger count");

In [None]:
df[df['passenger_count'] == 0].shape

About 70% of the trips have a single passenger. There are 60 trips with no passengers, which is somewhat suspicious!

In [None]:
# Bar chart of the relative frequency of each vendor ID.
ax = df['vendor_id'].value_counts(normalize=True).plot.bar()
ax.set(xlabel="Vendor ID", ylabel="Frequency", title="Frequency of vendor ID");

The vendor ID may refer to the taxi company.

In [None]:
# Bar chart of the relative frequency of each store_and_fwd_flag value.
ax = df['store_and_fwd_flag'].value_counts(normalize=True).plot.bar()
ax.set(
    xlabel="store_and_fwd_flag",
    ylabel="Frequency",
    title="Frequency of store_and_fwd_flag",
);

Almost all the trips have the value 'N' in the 'store_and_fwd_flag' variable, which means that the trip data was sent immediately to the vendor.

In [None]:
# Trip duration against vendor ID, drawn on the current axes.
plt.scatter(x=df['vendor_id'], y=df['trip_duration'])
plt.gca().set(
    xlabel="Vendor ID",
    ylabel="Trip duration",
    title="Scatter plot of vendor ID & trip duration",
);

The longest trip durations belong to vendor ID 1.

**c- Date transformation**

In [None]:
# Convert both timestamp columns to pandas datetime dtype in one pass.
df[['pickup_datetime', 'dropoff_datetime']] = (
    df[['pickup_datetime', 'dropoff_datetime']].apply(pd.to_datetime)
)

In [None]:
df[df['pickup_datetime'].dt.year == 2016].shape, df[df['dropoff_datetime'].dt.year == 2016].shape

In [None]:
# Relative frequency of pickups per calendar year.
ax = df['pickup_datetime'].dt.year.value_counts(ascending=True, normalize=True).plot.bar()
ax.set(xlabel="year", ylabel="Frequency", title="Pickup frequency by years ");

All the trips happened in 2016.

In [None]:
# Relative frequency of pickups per calendar month. Sorting by the month
# number (the index) keeps the bars in chronological order instead of
# ordering them by how frequent each month is.
ax = df['pickup_datetime'].dt.month.value_counts(normalize=True).sort_index().plot.bar()
ax.set_xlabel("month")
ax.set_ylabel("Frequency")
ax.set_title("Pickup frequency by months ");

The trips happened between January and June 2016.

**d- Target variable: trip_duration**

In [None]:
# Histogram of the raw trip durations.
ax = df["trip_duration"].plot.hist()
ax.set(
    xlabel="Trip duration",
    ylabel="Frequency",
    title="Frequency of trip duration",
);

In [None]:
# Same histogram restricted to trips with a duration below 5000.
ax = df['trip_duration'].loc[lambda duration: duration < 5000].hist(bins=20)
ax.set(
    xlabel="Trip duration",
    ylabel="Frequency",
    title="Frequency of trip duration - A zoom",
);

 There are a lot of trips with a short duration, i.e. less than 4500 seconds (75 minutes), and a few much longer trips that are not visible at all on the right of the plot.

 Let's try the log of the trip duration:

In [None]:
# Histogram of the natural log of the trip duration.
ax = df["trip_duration"].pipe(np.log).hist(bins=50)
ax.set(
    xlabel="Log of trip duration",
    ylabel="Frequency",
    title="Frequency of trip durations log",
);

We can see that the log of the trip duration almost follows a normal distribution. We should also notice the isolated group of values whose log lies between 11 and 12 (durations of roughly 17 to 45 hours).

In [None]:
# Pass the series explicitly as `x`: how a positional argument is interpreted
# differs between seaborn versions, the keyword always gives a horizontal box.
snb.boxplot(x=df["trip_duration"]);

There are some suspicious trips that lasted about 2,000,000 seconds, i.e. 23 days!

## 3. Data preprocessing

**a- Month, day, hour and minute of the pickup and dropoff extraction**

In [None]:
# Work on an independent copy of the dataset: a plain `train = df` would only
# create a second name for the same DataFrame, so columns added to `train`
# would also appear in `df`.
train = df.copy()

In [None]:
# Derive one column per pickup date/time component, then drop the raw timestamp.
train['month_pickup'] = df.pickup_datetime.dt.month
train['day_pickup'] = df.pickup_datetime.dt.day
train['hour_pickup'] = df.pickup_datetime.dt.hour
train['minute_pickup'] = df.pickup_datetime.dt.minute
train['second_pickup'] = df.pickup_datetime.dt.second
train = train.drop('pickup_datetime', axis=1)

 **b- Crow flies distance of the trips**

We add a new column to the dataset that gives the crow-flies distance between the pickup and dropoff locations of each trip.

In [None]:
# This piece of code is inspired from: http://blog.tkbe.org/archive/python-calculating-the-distance-between-two-locations/
# It calculates the "crow flies" distance between two locations
import math
 
def cosrad(n):
    """Return the cosine of the angle ``n``, where ``n`` is given in degrees."""
    angle_in_radians = math.radians(n)
    return math.cos(angle_in_radians)

def distance(row):
    """Return the great-circle ("crow flies") distance of a trip in kilometres.

    The haversine formula is evaluated on a sphere of radius 6371 km.

    Parameters
    ----------
    row : mapping
        Any object indexable by the keys ``'pickup_latitude'``,
        ``'pickup_longitude'``, ``'dropoff_latitude'`` and
        ``'dropoff_longitude'``, holding coordinates in decimal degrees.
        With scalar values (a single row) a scalar is returned. Because the
        computation uses NumPy functions, array-like values (e.g. the columns
        of a whole DataFrame, ``distance(df)``) give one distance per element.

    Returns
    -------
    float or array-like
        Distance(s) in kilometres.
    """
    lat1 = row['pickup_latitude']
    long1 = row['pickup_longitude']
    lat2 = row['dropoff_latitude']
    long2 = row['dropoff_longitude']
    earth_radius = 6371  # km

    d_lat = np.radians(lat2 - lat1)
    d_long = np.radians(long2 - long1)
    a = (np.sin(d_lat / 2) ** 2 +
         np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(d_long / 2) ** 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) invalid.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius * c

In [None]:
# Add the trip distance column: `distance` is called once per row of `df`
# (axis=1), which may be slow on a large DataFrame.
train['trip_distance']=df.apply(distance, axis=1)

**c- Log transform of the trip duration**

In [None]:
# Natural log of the raw trip duration, stored as the modelling target.
train['trip_duration_log'] = np.log(df['trip_duration'])

In [None]:
train.head()

## 4. Features engineering

In [None]:
# Feature matrix: crow-flies distance plus pickup month, day, hour and minute.
X = train[['trip_distance', 'month_pickup', 'day_pickup', 'hour_pickup', 'minute_pickup',]]
# Target: log of the trip duration.
y = train['trip_duration_log']
# Display both shapes as a quick sanity check.
X.shape, y.shape

## 5. Validation and scoring methods

We choose to use cross-validation. The scoring function, RMSLE as specified by the competition, is implemented below.

In [None]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

In [None]:
# Found in a comment by Enrique Pérez Herrero at: https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle_func(ypred, ytest):
    """Return the root mean squared logarithmic error between two sequences.

    Both inputs are passed through ``np.log1p``; the squared differences are
    then averaged and the square root of that mean is returned. The two
    sequences must have the same length (checked with an ``assert``).
    """
    assert len(ypred) == len(ytest)
    log_gap = np.log1p(ypred) - np.log1p(ytest)
    mean_squared_gap = np.mean(log_gap ** 2)
    return np.sqrt(mean_squared_gap)

In [None]:
def rmsle_from_log(y_true_log, y_pred_log):
    """RMSLE on the duration scale for values expressed as log(duration).

    The target `y` is log(trip_duration), so both arrays are mapped back with
    np.exp before rmsle_func applies log1p; scoring the log values directly
    would take the logarithm twice and would not be the competition RMSLE.
    """
    return rmsle_func(np.exp(y_pred_log), np.exp(y_true_log))


# make_scorer calls the function as score_func(y_true, y_pred).
# NOTE: make_scorer's default greater_is_better=True is kept so that
# cross_val_score reports the positive RMSLE; pass greater_is_better=False
# before using this scorer for model selection (e.g. in a grid search).
rmsle = make_scorer(rmsle_from_log) # Make RMSLE as a scorer

## 6. Models selection

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fixed seeds make the cross-validation scores reproducible between runs.
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
reg = LinearRegression()
# SGD is sensitive to the scale of the features, so they are standardized
# inside a pipeline; the scaler is then fitted on each training fold only.
sgdr = make_pipeline(StandardScaler(), SGDRegressor(random_state=42))

## 7. Models training

In [None]:
# 5-fold cross-validation of the random forest, scored with the custom `rmsle` scorer.
scores_rfr = cross_val_score(rfr, X, y, cv=5, scoring=rmsle)

In [None]:
print("RMSLE: %0.2f (+/- %0.4f)" % (scores_rfr.mean(), scores_rfr.std() * 2))

In [None]:
# 5-fold cross-validation of the linear regression with the same scorer,
# reported as the mean score and twice the standard deviation across folds.
scores_reg = cross_val_score(reg, X, y, cv=5, scoring=rmsle)
print("RMSLE: %0.2f (+/- %0.4f)" % (scores_reg.mean(), scores_reg.std() * 2))

In [None]:
# 5-fold cross-validation of the SGD regressor with the same scorer.
scores_sgdr= cross_val_score(sgdr, X, y, cv=5, scoring=rmsle)

In [None]:
print("RMSLE: %0.2f (+/- %0.4f)" % (scores_sgdr.mean(), scores_sgdr.std() * 2))

We notice that the Random Forest Regressor gives the smallest mean RMSLE over the five cross-validation folds (0.07), so we choose this model and fit it on the whole training set.

## 8. Chosen model fitting

In [None]:
# Fit the random forest on the full training features and log-duration target.
rfr.fit(X, y)

## 9. Predictions

**a- Load the test file**

In [None]:
# Relative path of the test CSV, next to the training file.
TESTFILEPATH = os.path.join('..', 'input', 'nyc-taxi-duration', 'test.csv')
# Unlike the training load, no index_col is given, so `id` stays a regular
# column; it is read again when the submission file is built.
test = pd.read_csv(TESTFILEPATH)

**b- Data transformation**

In [None]:
# Date type transformation
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
# Extract the same pickup date/time components as for the training set
test['month_pickup']=test['pickup_datetime'].dt.month
test['day_pickup']=test['pickup_datetime'].dt.day
test['hour_pickup']=test['pickup_datetime'].dt.hour
test['minute_pickup']=test['pickup_datetime'].dt.minute
test['second_pickup']=test['pickup_datetime'].dt.second
test = test.drop(columns=['pickup_datetime'])
# Add the trip distance column: the "crow flies" distance between the pickup
# and dropoff locations, computed row by row
test['trip_distance']=test.apply(distance, axis=1)

In [None]:
test.head()

In [None]:
# Test independent variables (features), in the same column order as the training matrix X
test_X = test[['trip_distance', 'month_pickup', 'day_pickup', 'hour_pickup', 'minute_pickup']]

**c- Predictions using the chosen model**

In [None]:
predicted_duration_log = rfr.predict(test_X) 

In [None]:
# The model was trained on log(trip_duration), so np.exp maps the predictions
# back to the original duration scale.
predicted_duration = np.exp(predicted_duration_log) # reverse the log predictions
predicted_duration

## 10. Prediction file submission

In [None]:
# Build the submission table (trip id + predicted duration) and write it to
# submission.csv without the DataFrame index.
my_submission = pd.DataFrame({'id': test['id'], 'trip_duration': predicted_duration})
my_submission.to_csv('submission.csv', index=False)