In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import warnings

In [5]:
# Suppress UserWarning
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Load the trip records (presumably a 1.5M-row sample of 2015 trips, judging by the file name).
# pandas infers the zip compression from the file extension.
df_trips = pd.read_csv('trips/trips2015_1.5M.csv.zip')
# Preview the first rows to check that the columns were parsed as expected.
df_trips.head()

# Memory usage optimization

In [None]:
# Baseline: column dtypes and memory footprint before any downcasting.
df_trips.info()

In [None]:
# Downcast columns to reduce memory usage while keeping the precision each column needs.

# Small integer codes: cast to int8 (assumes the values fit in [-128, 127] and
# that there are no missing values, otherwise astype raises).
code_columns = ['VendorID', 'passenger_count', 'RateCodeID', 'payment_type']
df_trips[code_columns] = df_trips[code_columns].astype('int8')

# Coordinates must NOT be float16: half precision has an 11-bit significand, so
# around |-73| the representable values are 0.0625 degrees apart and around 40
# they are 0.03125 degrees apart, i.e. several kilometres. That would collapse
# the pickup/dropoff positions (the only model features) onto a coarse grid.
# float32 resolves these magnitudes to roughly a metre and is still half the
# size of the float64 that read_csv produces.
coordinate_columns = ['dropoff_longitude', 'dropoff_latitude', 'pickup_longitude', 'pickup_latitude']
df_trips[coordinate_columns] = df_trips[coordinate_columns].astype('float32')

# Monetary components and the trip distance keep their original compact dtypes.
amount_columns = ['fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount']
df_trips[amount_columns] = df_trips[amount_columns].astype('float16')
df_trips['total_amount'] = df_trips['total_amount'].astype('float32')
df_trips['trip_distance'] = df_trips['trip_distance'].astype('float16')

# Show the dtypes and memory footprint after downcasting.
df_trips.info()

# Data cleaning

## Adding the trip distance length in time units

In [None]:
# The duration cost matrix built later needs each trip's length in time units.
# The dataset does not provide it directly, so derive it from the pickup and
# dropoff timestamps.

# Parse both timestamp columns into datetime values
pickup_times = pd.to_datetime(df_trips["tpep_pickup_datetime"])
dropoff_times = pd.to_datetime(df_trips["tpep_dropoff_datetime"])

# Trip duration in hours (seconds / 3600); once it is stored, the raw timestamp
# columns and the store-and-forward flag are no longer needed.
df_trips = (
    df_trips
    .assign(time_length=(dropoff_times - pickup_times).dt.total_seconds() / 3600)
    .drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime', 'store_and_fwd_flag'])
)

df_trips

## Limiting the variables' values

In [None]:
# Keep only plausible trips; the thresholds are explained in the report.
# Rows with a missing value in any tested column are dropped as well, because a
# comparison against NaN is always False.
print(f"Dataframe shape before cleaning: {df_trips.shape}")

# Trip distance strictly between 0.2 and 21.0
distance_ok = (df_trips['trip_distance'] > 0.2) & (df_trips['trip_distance'] < 21.0)

# Dropoff point inside the longitude (-75, -73] / latitude [40, 42) bounding box
dropoff_ok = (
    (df_trips['dropoff_longitude'] > -75) &
    (df_trips['dropoff_longitude'] <= -73) &
    (df_trips['dropoff_latitude'] >= 40) &
    (df_trips['dropoff_latitude'] < 42)
)

# Pickup point inside the same bounding box
pickup_ok = (
    (df_trips['pickup_longitude'] > -75) &
    (df_trips['pickup_longitude'] <= -73) &
    (df_trips['pickup_latitude'] >= 40) &
    (df_trips['pickup_latitude'] < 42)
)

# Duration above 0.02 hours (72 seconds)
duration_ok = df_trips['time_length'] > 0.02

df_trips_filtered = df_trips.loc[distance_ok & dropoff_ok & pickup_ok & duration_ok]

print(f"Dataframe shape after cleaning: {df_trips_filtered.shape}")

# Data splitting

In [12]:
# Model features: pickup and dropoff coordinates. `inputs` is reused by the
# duration and financial splits further down.
coordinate_features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
inputs = df_trips_filtered[coordinate_features]

# Target of the distance model
labelDistance = df_trips_filtered['trip_distance']

# Seeded split (default train/test proportions) so it is reproducible
(x_trainDistance, x_testDistance,
 y_trainDistance, y_testDistance) = train_test_split(inputs, labelDistance, random_state=69)

# Choice of regression model

## Comparison of possible regression models

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [None]:
# Candidate regressors for comparison, all with default hyper-parameters.
# Estimators with internal randomness get a fixed random_state so the MAE
# ranking printed below is reproducible across kernel restarts.
COMPARISON_SEED = 69  # same seed as the distance train/test split

models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest Regression', RandomForestRegressor(random_state=COMPARISON_SEED)),
    ('Decision Tree Regression', DecisionTreeRegressor(random_state=COMPARISON_SEED)),
    ('K-Nearest Neighbors Regression', KNeighborsRegressor()),
    ('Gradient Boosting Regression', GradientBoostingRegressor(random_state=COMPARISON_SEED)),
    ('AdaBoost Regression', AdaBoostRegressor(random_state=COMPARISON_SEED)),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Elastic Net Regression', ElasticNet()),
    ('Bayesian Regression', BayesianRidge()),
    ('Polynomial Regression', make_pipeline(PolynomialFeatures(degree=2), LinearRegression()))
]

# Fit every candidate on the distance training split and report its mean
# absolute error on the held-out test split (lower is better).
for name, model in models:
    model.fit(x_trainDistance, y_trainDistance)
    y_pred = model.predict(x_testDistance)
    mae = mean_absolute_error(y_testDistance, y_pred)
    print(f"{name} MAE: {mae}")

## Comparison Result
As the MAE values above show, the Random Forest (RF) regression model is the most accurate of the candidates. It is therefore the model used for the rest of this notebook.

# Predictive model - distance cost matrix

## Final model training

In [None]:
# Train the chosen model (random forest) on the distance training split.
# random_state pins the forest's internal randomness (e.g. bootstrap sampling),
# so the fitted model, and the cost matrix predicted from it, is identical on
# every run. The seed matches the one used for the distance split.
finModelDistance = RandomForestRegressor(random_state=69)
finModelDistance.fit(x_trainDistance, y_trainDistance)

## Predictions

### Importing depot and shop coordinates

In [None]:
# Depot location, wrapped in a one-row frame with id / lat / long columns
depot_long = -73.941868
depot_lat = 40.725516
df_depot = pd.DataFrame([{'id': 'Depot', 'lat': depot_lat, 'long': depot_long}])

# Shop locations; the demand and stage columns are not needed for the cost matrices.
# Assumes the file also provides id / lat / long columns, since the cost-matrix
# cell reads them from df_points.
df_shops = pd.read_csv('2015_shop_locations.csv').drop(columns=['demand(kg)', 'stage'])

# All points to route between: the depot first, then every shop
df_points = pd.concat([df_depot, df_shops], ignore_index=True)
df_points

### Distance cost matrix

In [None]:
# Distance cost matrix: predicted trip distance for every ordered
# (origin, destination) pair of points in df_points.
#
# All pairs are scored with a single batched predict() call instead of one call
# per pair, which avoids per-call overhead and repeated boolean-mask lookups.
# The features are passed as a DataFrame carrying the training column names, so
# scikit-learn does not emit its "X does not have valid feature names" warning.
n_points = len(df_points)
longitudes = df_points['long'].to_numpy()
latitudes = df_points['lat'].to_numpy()

# Row-major pair enumeration: the origin position changes slowest and the
# destination fastest, so reshaping the flat predictions to (n_points, n_points)
# puts origins on the rows and destinations on the columns.
origin_pos = np.repeat(np.arange(n_points), n_points)
destination_pos = np.tile(np.arange(n_points), n_points)

# Same column names and order as the training inputs
pair_features = pd.DataFrame({
    'pickup_longitude': longitudes[origin_pos],
    'pickup_latitude': latitudes[origin_pos],
    'dropoff_longitude': longitudes[destination_pos],
    'dropoff_latitude': latitudes[destination_pos],
})

# Numeric (float) matrix labelled by point id on both axes
costMatrixDistance = pd.DataFrame(
    finModelDistance.predict(pair_features).reshape(n_points, n_points),
    index=df_points['id'],
    columns=df_points['id'],
)

# NOTE(review): diagonal entries (a point to itself) are model predictions, not
# zeros - confirm that whatever consumes the matrix ignores or overrides them.
costMatrixDistance

In [55]:
# Persist the distance cost matrix as CSV; the index (the point ids) is written
# as the first column so every row stays labelled.
costMatrixDistance.to_csv('costMatrixDistance.csv')

# Predictive model - duration cost matrix

## Data splitting

In [42]:
# Target of the duration model: trip duration in hours
labelTime = df_trips_filtered['time_length']

# Seeded split over the same coordinate features as the distance model
(x_trainTime, x_testTime,
 y_trainTime, y_testTime) = train_test_split(inputs, labelTime, random_state=420)

## Final model training

In [None]:
# Train the chosen model (random forest) on the duration training split.
# random_state pins the forest's internal randomness (e.g. bootstrap sampling),
# so the fitted model, and the cost matrix predicted from it, is identical on
# every run. The seed matches the one used for the duration split.
finModelDuration = RandomForestRegressor(random_state=420)
finModelDuration.fit(x_trainTime, y_trainTime)

## Predictions

### Importing depot and shop coordinates

In [None]:
# Depot location, wrapped in a one-row frame with id / lat / long columns
depot_long = -73.941868
depot_lat = 40.725516
df_depot = pd.DataFrame([{'id': 'Depot', 'lat': depot_lat, 'long': depot_long}])

# Shop locations without the demand and stage columns, which the duration cost
# matrix does not use. Assumes the file also provides id / lat / long columns,
# since the cost-matrix cell reads them from df_points.
df_shops = pd.read_csv('2015_shop_locations.csv').drop(columns=['demand(kg)', 'stage'])

# All points to route between: the depot first, then every shop
df_points = pd.concat([df_depot, df_shops], ignore_index=True)
df_points

### Duration cost matrix

In [None]:
# Duration cost matrix: predicted trip duration (hours, like the time_length
# training target) for every ordered (origin, destination) pair in df_points.
#
# All pairs are scored with a single batched predict() call instead of one call
# per pair, which avoids per-call overhead and repeated boolean-mask lookups.
# The features are passed as a DataFrame carrying the training column names, so
# scikit-learn does not emit its "X does not have valid feature names" warning.
n_points = len(df_points)
longitudes = df_points['long'].to_numpy()
latitudes = df_points['lat'].to_numpy()

# Row-major pair enumeration: the origin position changes slowest and the
# destination fastest, so reshaping the flat predictions to (n_points, n_points)
# puts origins on the rows and destinations on the columns.
origin_pos = np.repeat(np.arange(n_points), n_points)
destination_pos = np.tile(np.arange(n_points), n_points)

# Same column names and order as the training inputs
pair_features = pd.DataFrame({
    'pickup_longitude': longitudes[origin_pos],
    'pickup_latitude': latitudes[origin_pos],
    'dropoff_longitude': longitudes[destination_pos],
    'dropoff_latitude': latitudes[destination_pos],
})

# Numeric (float) matrix labelled by point id on both axes
costMatrixDuration = pd.DataFrame(
    finModelDuration.predict(pair_features).reshape(n_points, n_points),
    index=df_points['id'],
    columns=df_points['id'],
)

# NOTE(review): diagonal entries (a point to itself) are model predictions, not
# zeros - confirm that whatever consumes the matrix ignores or overrides them.
costMatrixDuration

In [54]:
# Persist the duration cost matrix as CSV; the index (the point ids) is written
# as the first column so every row stays labelled.
costMatrixDuration.to_csv('costMatrixDuration.csv')

# Predictive model - financial cost matrix

## Data splitting

In [14]:
# Target of the financial model: the amount charged excluding the tip
labelFinancial = df_trips_filtered['total_amount'].sub(df_trips_filtered['tip_amount'])

# Seeded split over the same coordinate features as the other models
(x_trainFinancial, x_testFinancial,
 y_trainFinancial, y_testFinancial) = train_test_split(inputs, labelFinancial, random_state=420)

## Final model training

In [None]:
# Train the chosen model (random forest) on the financial training split.
# random_state pins the forest's internal randomness (e.g. bootstrap sampling),
# so the fitted model, and the cost matrix predicted from it, is identical on
# every run. The seed matches the one used for the financial split.
finModelFinancial = RandomForestRegressor(random_state=420)
finModelFinancial.fit(x_trainFinancial, y_trainFinancial)

## Predictions

### Importing depot and shop coordinates

In [None]:
# Depot location, wrapped in a one-row frame with id / lat / long columns
depot_long = -73.941868
depot_lat = 40.725516
df_depot = pd.DataFrame([{'id': 'Depot', 'lat': depot_lat, 'long': depot_long}])

# Shop locations without the demand and stage columns, which the financial cost
# matrix does not use. Assumes the file also provides id / lat / long columns,
# since the cost-matrix cell reads them from df_points.
df_shops = pd.read_csv('2015_shop_locations.csv').drop(columns=['demand(kg)', 'stage'])

# All points to route between: the depot first, then every shop
df_points = pd.concat([df_depot, df_shops], ignore_index=True)
df_points

### Financial cost matrix

In [None]:
# Financial cost matrix: predicted charge excluding tip (the training target)
# for every ordered (origin, destination) pair of points in df_points.
#
# All pairs are scored with a single batched predict() call instead of one call
# per pair, which avoids per-call overhead and repeated boolean-mask lookups.
# The features are passed as a DataFrame carrying the training column names, so
# scikit-learn does not emit its "X does not have valid feature names" warning.
n_points = len(df_points)
longitudes = df_points['long'].to_numpy()
latitudes = df_points['lat'].to_numpy()

# Row-major pair enumeration: the origin position changes slowest and the
# destination fastest, so reshaping the flat predictions to (n_points, n_points)
# puts origins on the rows and destinations on the columns.
origin_pos = np.repeat(np.arange(n_points), n_points)
destination_pos = np.tile(np.arange(n_points), n_points)

# Same column names and order as the training inputs
pair_features = pd.DataFrame({
    'pickup_longitude': longitudes[origin_pos],
    'pickup_latitude': latitudes[origin_pos],
    'dropoff_longitude': longitudes[destination_pos],
    'dropoff_latitude': latitudes[destination_pos],
})

# Numeric (float) matrix labelled by point id on both axes
costMatrixFinancial = pd.DataFrame(
    finModelFinancial.predict(pair_features).reshape(n_points, n_points),
    index=df_points['id'],
    columns=df_points['id'],
)

# NOTE(review): diagonal entries (a point to itself) are model predictions, not
# zeros - confirm that whatever consumes the matrix ignores or overrides them.
costMatrixFinancial

In [20]:
# Persist the financial cost matrix as CSV; the index (the point ids) is written
# as the first column so every row stays labelled.
costMatrixFinancial.to_csv('costMatrixFinancial.csv')