In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import warnings

In [None]:
# Ignore every warning of category UserWarning from this point on.
# NOTE(review): presumably added to hide scikit-learn's feature-name warning raised
# by the predict() calls on plain lists further down — confirm; a blanket filter
# also hides unrelated UserWarnings.
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Load the trip records from the zipped CSV (path is relative to the working directory)
df_trips = pd.read_csv('trips2020_1.5M.csv.zip')
# Preview the first rows to check that the file was parsed as expected
df_trips.head()

# Data cleaning

## Adding the trip duration (trip length in time units)

In [None]:
# The duration cost matrix built later needs each trip's length expressed in time
# units. The data only carries pickup and drop-off timestamps, so derive it from them.

# Parse both timestamp columns into datetime values
for timestamp_col in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df_trips[timestamp_col] = pd.to_datetime(df_trips[timestamp_col])

# Trip length in hours: seconds elapsed between pickup and drop-off, divided by 3600
elapsed = df_trips["tpep_dropoff_datetime"] - df_trips["tpep_pickup_datetime"]
df_trips['time_length'] = elapsed.dt.total_seconds() / 3600

df_trips

## Dropping useless columns

In [None]:
# Columns that the modelling steps below do not read
unused_columns = [
    'tpep_dropoff_datetime',
    'tpep_pickup_datetime',
    'store_and_fwd_flag',
    'VendorID',
    'RatecodeID',
    'passenger_count',
    'payment_type',
]
df_trips = df_trips.drop(columns=unused_columns)
df_trips

## Limiting the variables' values

In [None]:
# Clean the data by keeping only the rows that satisfy the conditions below; rows with
# missing values or outliers in these columns fail them (the thresholds are explained in the report).
# DOLocationID and PULocationID were checked manually with Excel's filter feature and
# contain no missing values or outliers, so they are not filtered here.
print(f"Dataframe shape before cleaning: {df_trips.shape}")

# time_length is in hours, so 0.02 corresponds to 72 seconds
distance = df_trips['trip_distance']
keep_row = distance.lt(21.0) & distance.gt(0.2) & df_trips['time_length'].gt(0.02)
df_trips_filtered = df_trips.loc[keep_row]

print(f"Dataframe shape after cleaning: {df_trips_filtered.shape}")

# Memory usage optimization

In [None]:
# Print the column dtypes and memory usage of df_trips as a baseline for the downcast below
df_trips.info()

In [None]:
# To reduce memory usage, downcast the monetary and distance columns.
# float32 is used rather than float16: half precision keeps only about three
# significant decimal digits and overflows to inf above 65504, so cent-level amounts
# would be rounded and extreme values turned into inf.
# NOTE(review): df_trips_filtered was created from df_trips before this cell, so it is
# not affected by this cast — confirm whether the cast should be applied to it instead.
downcast_columns = [
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'congestion_surcharge', 'trip_distance',
    'total_amount',
]
df_trips[downcast_columns] = df_trips[downcast_columns].astype('float32')
df_trips.info()

# Data splitting

In [None]:
# Features shared by all three models: the PULocationID / DOLocationID pair of each trip
feature_columns = ['PULocationID', 'DOLocationID']
inputs = df_trips_filtered[feature_columns]

# Target of the distance model
labelDistance = df_trips_filtered['trip_distance']

# Seeded split so the train/test partition is the same on every run
(
    x_trainDistance,
    x_testDistance,
    y_trainDistance,
    y_testDistance,
) = train_test_split(inputs, labelDistance, random_state=69)

# Predictive model - distance cost matrix

## Final model training

In [None]:
# Train the chosen model (random forest) on the distance training split.
# random_state is pinned so that re-running the notebook rebuilds the same forest and
# therefore the same distance cost matrix; without it the predictions change per run.
finModelDistance = RandomForestRegressor(random_state=69)
finModelDistance.fit(x_trainDistance, y_trainDistance)

## Predictions

### Importing depot and shop locations

In [None]:
# Depot: a single-row frame holding its id label and location id
depot_record = {'id':'Depot', 'location_id':112}
df_depot = pd.DataFrame(depot_record, index=[0])

# Shops: read the CSV and discard the columns listed below
shop_columns_to_drop = ['Unnamed: 0', 'demand(kg)', 'stage', 'lat', 'long', 'borought', 'zone']
df_shops = pd.read_csv('2020_shop_locations.csv').drop(columns=shop_columns_to_drop)

# All points in one frame, depot first, with a fresh 0..n-1 index
df_points = pd.concat([df_depot, df_shops], ignore_index=True)
df_points

### Distance cost matrix

In [None]:
# Build the distance cost matrix: one predicted trip distance for every ordered
# (origin, destination) pair of points in df_points.
point_ids = df_points['id']
location_ids = df_points['location_id'].to_numpy()
n_points = len(location_ids)

# Enumerate all ordered pairs in row-major order (the origin varies slowest). The
# columns carry the same names and order as the training features, so the model gets
# a frame with matching feature names and one predict() call covers the whole matrix
# instead of one call per pair.
pair_features = pd.DataFrame({
    'PULocationID': np.repeat(location_ids, n_points),
    'DOLocationID': np.tile(location_ids, n_points),
})
predicted_distance = finModelDistance.predict(pair_features)

# Rows are origins and columns are destinations, both labelled with the point ids;
# the values are stored as floats rather than generic objects.
costMatrixDistance = pd.DataFrame(
    predicted_distance.reshape(n_points, n_points),
    index=point_ids,
    columns=point_ids,
)

costMatrixDistance

In [None]:
# Write the distance cost matrix to 'costMatrixDistance.csv' (path relative to the
# working directory); the point ids are written as the row and column labels.
costMatrixDistance.to_csv('costMatrixDistance.csv')

# Predictive model - duration cost matrix

## Data splitting

In [None]:
# Target of the duration model: the trip length in time units. The feature frame
# `inputs` is the same one used for the distance model.
labelTime = df_trips_filtered['time_length']

# Seeded split so the train/test partition is the same on every run
split_time = train_test_split(inputs, labelTime, random_state=420)
x_trainTime, x_testTime, y_trainTime, y_testTime = split_time

## Final model training

In [None]:
# Train the chosen model (random forest) on the duration training split.
# random_state is pinned so that re-running the notebook rebuilds the same forest and
# therefore the same duration cost matrix; without it the predictions change per run.
finModelDuration = RandomForestRegressor(random_state=420)
finModelDuration.fit(x_trainTime, y_trainTime)

## Predictions

### Duration cost matrix

In [None]:
# Build the duration cost matrix: one predicted trip length (in the units of
# time_length) for every ordered (origin, destination) pair of points in df_points.
point_ids = df_points['id']
location_ids = df_points['location_id'].to_numpy()
n_points = len(location_ids)

# Enumerate all ordered pairs in row-major order (the origin varies slowest). The
# columns carry the same names and order as the training features, so the model gets
# a frame with matching feature names and one predict() call covers the whole matrix
# instead of one call per pair.
pair_features = pd.DataFrame({
    'PULocationID': np.repeat(location_ids, n_points),
    'DOLocationID': np.tile(location_ids, n_points),
})

# Predictions must come from finModelDuration, the forest fitted on time_length;
# predicting with the distance model here would just reproduce the distance matrix.
predicted_duration = finModelDuration.predict(pair_features)

# Rows are origins and columns are destinations, both labelled with the point ids;
# the values are stored as floats rather than generic objects.
costMatrixDuration = pd.DataFrame(
    predicted_duration.reshape(n_points, n_points),
    index=point_ids,
    columns=point_ids,
)

costMatrixDuration

In [None]:
# Write the duration cost matrix to 'costMatrixDuration.csv' (path relative to the
# working directory); the point ids are written as the row and column labels.
costMatrixDuration.to_csv('costMatrixDuration.csv')

# Predictive model - financial cost matrix

## Data splitting

In [None]:
# Target of the financial model: the amount charged for the trip excluding the tip.
# The feature frame `inputs` is the same one used for the other models.
amount_charged = df_trips_filtered['total_amount']
tip = df_trips_filtered['tip_amount']
labelFinancial = amount_charged.sub(tip)

# Seeded split so the train/test partition is the same on every run
split_financial = train_test_split(inputs, labelFinancial, random_state=420)
x_trainFinancial, x_testFinancial, y_trainFinancial, y_testFinancial = split_financial

## Final model training

In [None]:
# Train the chosen model (random forest) on the financial training split.
# random_state is pinned so that re-running the notebook rebuilds the same forest and
# therefore the same financial cost matrix; without it the predictions change per run.
finModelFinancial = RandomForestRegressor(random_state=420)
finModelFinancial.fit(x_trainFinancial, y_trainFinancial)

## Predictions

### Financial cost matrix

In [None]:
# Build the financial cost matrix: one predicted trip cost (total amount minus tip)
# for every ordered (origin, destination) pair of points in df_points.
point_ids = df_points['id']
location_ids = df_points['location_id'].to_numpy()
n_points = len(location_ids)

# Enumerate all ordered pairs in row-major order (the origin varies slowest). The
# columns carry the same names and order as the training features, so the model gets
# a frame with matching feature names and one predict() call covers the whole matrix
# instead of one call per pair.
pair_features = pd.DataFrame({
    'PULocationID': np.repeat(location_ids, n_points),
    'DOLocationID': np.tile(location_ids, n_points),
})

# Predictions must come from finModelFinancial, the forest fitted on the financial
# target; predicting with the distance model here would just reproduce the distance matrix.
predicted_cost = finModelFinancial.predict(pair_features)

# Rows are origins and columns are destinations, both labelled with the point ids;
# the values are stored as floats rather than generic objects.
costMatrixFinancial = pd.DataFrame(
    predicted_cost.reshape(n_points, n_points),
    index=point_ids,
    columns=point_ids,
)

costMatrixFinancial

In [None]:
# Write the financial cost matrix to 'costMatrixFinancial.csv' (path relative to the
# working directory); the point ids are written as the row and column labels.
costMatrixFinancial.to_csv('costMatrixFinancial.csv')