Imports and loading data

In [19]:
import pandas as pd
import pymongo
from pymongo import MongoClient
import datetime

taxitripdata = pd.read_csv(
    r'C:\Users\maraw\Desktop\Assignment 1\datasets\taxi_trip_data.csv', encoding='cp1252')
taxizonegeo = pd.read_csv(
    r'C:\Users\maraw\Desktop\Assignment 1\datasets\taxi_zone_geo.csv', encoding='cp1252')

if (taxitripdata is not None):
    print("taxitripdata loaded")

if (taxizonegeo is not None):
    print("taxizonegeo loaded")

taxitripdata loaded
taxizonegeo loaded


Data preprocessing

In [20]:
# a) Remove the columns “store_and_fwd_flag”, “rate_code” and “total_amount” from taxitripdata
taxitripdata = taxitripdata.drop(['store_and_fwd_flag', 'rate_code', 'total_amount'], axis=1)
# b) Drop rows with missing essential details that would be required to fulfill the upcoming queries
taxitripdata = taxitripdata.dropna(subset=['vendor_id', 'pickup_datetime', 'passenger_count', 'trip_distance', 'payment_type', 'fare_amount'])
# Convert date strings to datetime format
# Drop any rows where pickup or dropoff datetime is not a valid date
taxitripdata['pickup_datetime'] = pd.to_datetime(taxitripdata['pickup_datetime'], errors='coerce')
taxitripdata['dropoff_datetime'] = pd.to_datetime(taxitripdata['dropoff_datetime'], errors='coerce')
taxitripdata = taxitripdata.dropna(subset=['pickup_datetime', 'dropoff_datetime'])
# Remove any rows with negative values for fare_amount and tip_amount
taxitripdata = taxitripdata[(taxitripdata['fare_amount'] >= 0) & (taxitripdata['tip_amount'] >= 0)]
# Drop any rows where the payment type is not a valid value
taxitripdata = taxitripdata[(taxitripdata['payment_type'] == '1') | (taxitripdata['payment_type'] == '2') | (taxitripdata['payment_type'] == '3') | (taxitripdata['payment_type'] == '4') | (taxitripdata['payment_type'] == '5')]

Mongo connection

In [25]:
# c) Connect to the MongoDB server and create the database and collections
import urllib.parse

username = urllib.parse.quote_plus('marawan1805')
password = urllib.parse.quote_plus("Maroo@m12")

url = "mongodb+srv://marawan1805:Maroo%40m12@cluster0.iegm6mx.mongodb.net/test?authMechanism=DEFAULT".format(
    username, password)
cluster = MongoClient(url)
db = cluster["nyc_taxi_db"]
taxi_trip_col = db["taxi_trip_data"]
taxi_zone_col = db["taxi_zone_data"]

 Mongo insertion

In [3]:
from pymongo import InsertOne

# Function to insert data in batches
def insert_data_in_batches(data, collection, batch_size):
    total_records = len(data)
    num_batches = (total_records + batch_size - 1) // batch_size

    for i in range(num_batches):
        start_index = i * batch_size
        end_index = min((i + 1) * batch_size, total_records)
        current_batch = data[start_index:end_index]
        print(f"Inserting records {start_index + 1} to {end_index}...")

        try:
            requests = [InsertOne(record) for record in current_batch]
            collection.bulk_write(requests, ordered=False)
            print("Batch insertion complete.")
        except Exception as e:
            print(f"Error while inserting data in batch {i + 1}: {e}")

batch_size = 5000

data1 = taxitripdata.to_dict('records')[:20000]

insert_data_in_batches(data1, taxi_trip_col, batch_size)

taxi_zone_col.insert_many(taxizonegeo.to_dict("records"))

NameError: name 'taxitripdata' is not defined

Questions d-h

In [8]:
# d) Calculate the duration for each trip and add it as a new field in your database
taxi_trip_col.update_many(
    {},
    [
        {
            "$set": {
                "duration": {
                    "$divide": [
                        {
                            "$subtract": [
                                {"$toDate": "$dropoff_datetime"},
                                {"$toDate": "$pickup_datetime"}
                            ]
                        },
                        1000 * 60
                    ]
                }
            }
        }
    ]
)

# Fetch the first 5 records from the taxi_trip_col collection
cursor = taxi_trip_col.find().limit(5)

# Print the records with the newly added "duration" field
for record in cursor:
    print(record)

{'_id': ObjectId('641f0bdd711b726737505691'), 'vendor_id': 1, 'pickup_datetime': '2018-05-11 17:40:16', 'dropoff_datetime': '2018-05-11 17:55:35', 'passenger_count': 1, 'trip_distance': 1.6, 'payment_type': 1, 'fare_amount': 11.5, 'extra': 1.0, 'mta_tax': 0.5, 'tip_amount': 0.0, 'tolls_amount': 0.0, 'imp_surcharge': 0.3, 'pickup_location_id': 48, 'dropoff_location_id': 68, 'duration': 15.316666666666666}
{'_id': ObjectId('641f0bdd711b726737505692'), 'vendor_id': 2, 'pickup_datetime': '2018-03-22 23:01:41', 'dropoff_datetime': '2018-03-22 23:25:36', 'passenger_count': 1, 'trip_distance': 9.52, 'payment_type': 1, 'fare_amount': 28.5, 'extra': 0.5, 'mta_tax': 0.5, 'tip_amount': 5.96, 'tolls_amount': 0.0, 'imp_surcharge': 0.3, 'pickup_location_id': 138, 'dropoff_location_id': 230, 'duration': 23.916666666666668}
{'_id': ObjectId('641f0bdd711b726737505693'), 'vendor_id': 2, 'pickup_datetime': '2018-07-24 09:58:45', 'dropoff_datetime': '2018-07-24 10:22:37', 'passenger_count': 1, 'trip_dista

In [24]:
# e) Calculate the total trip cost and add it as a new field in your database
taxi_trip_col.update_many(
    {},
    [{"$set": {
        "total_trip_cost": {
            "$add": [
                "$fare_amount", "$extra", "$mta_tax", "$tip_amount", "$tolls_amount", "$imp_surcharge"
            ]
        }
    }}]
)

# Fetch the first 5 records from the taxi_trip_col collection
cursor = taxi_trip_col.find().limit(5)

# Print the records with the newly added "duration" field
for record in cursor:
    print(record)

In [18]:
# f) What is the most common payment type used per time of day?
f = [
    {"$addFields": {
        "pickup_datetime": {"$dateFromString": {"dateString": "$pickup_datetime"}},
        "time_of_day": {
            "$switch": {
                "branches": [
                    {
                        "case": {
                            "$and": [
                                {"$gte": [{"$hour": "$pickup_datetime"}, 6]},
                                {"$lt": [{"$hour": "$pickup_datetime"}, 12]}
                            ]
                        },
                        "then": "morning"
                    },
                    {
                        "case": {
                            "$and": [
                                {"$gte": [{"$hour": "$pickup_datetime"}, 12]},
                                {"$lt": [{"$hour": "$pickup_datetime"}, 18]}
                            ]
                        },
                        "then": "afternoon"
                    },
                    {
                        "case": {
                            "$and": [
                                {"$gte": [{"$hour": "$pickup_datetime"}, 18]},
                                {"$lt": [{"$hour": "$pickup_datetime"}, 24]}
                            ]
                        },
                        "then": "evening"
                    }
                ],
                "default": "night"
            }
        }
    }},
    {"$group": {
        "_id": "$time_of_day",
        "count": {"$sum": 1},
        "payment_type": {"$push": "$payment_type"}
    }},
    {"$addFields": {
        "most_common_payment_type": {
            "$arrayElemAt": [
                {
                    "$filter": {
                        "input": {"$setUnion": ["$payment_type"]},
                        "as": "type",
                        "cond": {"$eq": ["$$type", {"$min": "$payment_type"}]}
                    }
                },
                0
            ]
        }
    }},
    {"$project": {
        "_id": 1,
        "most_common_payment_type": 1
    }}
]

most_common_payment = list(taxi_trip_col.aggregate(f))
print("Most common payment type per time of day:", most_common_payment)


OperationFailure: PlanExecutor error during aggregation :: caused by :: can't convert from BSON type string to Date, full error: {'ok': 0.0, 'errmsg': "PlanExecutor error during aggregation :: caused by :: can't convert from BSON type string to Date", 'code': 16006, 'codeName': 'Location16006', '$clusterTime': {'clusterTime': Timestamp(1679758285, 1), 'signature': {'hash': b"H\xfd\xe4\xd9\x9f\x97w'\x19n\xa5\xb3\x10H\xa7\xca\x031\x1b?", 'keyId': 7150722443026366467}}, 'operationTime': Timestamp(1679758285, 1)}

In [None]:
# g) What is the average tip amount per passenger count?
g = [
{"$group": {
"_id": "$passenger_count",
"avg_tip_amount": {"$avg": "$tip_amount"}
}},
{"$sort": {"_id": 1}}
]

avg_tip_per_passenger = list(taxi_trip_col.aggregate(g))
print("Average tip amount per passenger count:", avg_tip_per_passenger)

In [None]:
# h) What are the best 5 locations for drivers to pick up passengers from?
h = [
{"$group": {
"_id": "$pickuplocationid",
"count": {"$sum": 1}
}},
{"$sort": {"count": -1}},
{"$limit": 5},
{"$lookup": {
"from": "taxi_zone_data",
"localField": "_id",
"foreignField": "zone_id",
"as": "zone_data"
}},
{"$unwind": "$zone_data"}
]

top_5_pickup_locations = list(taxi_trip_col.aggregate(h))
print("Top 5 pickup locations:", top_5_pickup_locations)

Bonus

In [None]:
# Correlation between trip distance and tip amount
bonus1 = [
    {"$group": {
        "_id": None,
        "correlation": {
            "$avg": {
                "$multiply": [{"$stdDevPop": "$trip_distance"}, {"$stdDevPop": "$tip_amount"}]
            }
        }
    }}
]

correlation = list(taxi_trip_col.aggregate(bonus1))
print("Correlation between trip distance and tip amount:", correlation)

Visualizations

In [None]:
import matplotlib.pyplot as plt

# f) Most common payment type used per time of day
time_of_day = [record["_id"] for record in most_common_payment]
payment_type = [record["payment_type"] for record in most_common_payment]

plt.bar(time_of_day, payment_type)
plt.xlabel("Time of Day")
plt.ylabel("Most Common Payment Type")
plt.title("Most Common Payment Type per Time of Day")
plt.show()

# g) Average tip amount per passenger count
passenger_counts = [record["_id"] for record in avg_tip_per_passenger]
avg_tip_amounts = [record["avg_tip_amount"] for record in avg_tip_per_passenger]

plt.plot(passenger_counts, avg_tip_amounts)
plt.xlabel("Passenger Count")
plt.ylabel("Average Tip Amount")
plt.title("Average Tip Amount per Passenger Count")
plt.show()

# h) Top 5 pickup locations
pickup_locations = [record["zone_data"]["zone_name"] for record in top_5_pickup_locations]
pickup_counts = [record["count"] for record in top_5_pickup_locations]

plt.bar(pickup_locations, pickup_counts)
plt.xlabel("Pickup Location")
plt.xticks(rotation=45, ha="right")
plt.ylabel("Number of Pickups")
plt.title("Top 5 Pickup Locations")
plt.show()