In [9]:
!pip install kaggle
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!mkdir -p visualizations

mv: cannot stat 'kaggle.json': No such file or directory


In [10]:
!pip install mpld3




In [11]:
!kaggle datasets download -d nagasai524/nyc-taxi-trip-records-from-jan-2023-to-jun-2023
!unzip nyc-taxi-trip-records-from-jan-2023-to-jun-2023.zip -d data/

Dataset URL: https://www.kaggle.com/datasets/nagasai524/nyc-taxi-trip-records-from-jan-2023-to-jun-2023
License(s): U.S. Government Works
nyc-taxi-trip-records-from-jan-2023-to-jun-2023.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  nyc-taxi-trip-records-from-jan-2023-to-jun-2023.zip
replace data/nyc_yellow_taxi_trip_records_from_Jan_to_Aug_2023.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: data/nyc_yellow_taxi_trip_records_from_Jan_to_Aug_2023.csv  A



In [None]:
import pandas as pd

# Only the columns used by the plots below are loaded.
columns = ['tpep_pickup_datetime', 'trip_distance', 'fare_amount', 'passenger_count', 'payment_type']

# Relative path: this is the same data/ directory the unzip step extracts into,
# so the notebook does not depend on being run from /content.
# NOTE(review): nrows reads the first 100,000 rows of the file, not a random
# sample; confirm that the head of the file is representative enough for the
# distribution plots further down.
df_sample = pd.read_csv('data/nyc_yellow_taxi_trip_records_from_Jan_to_Aug_2023.csv',
                        usecols=columns,
                        nrows=100000)

df_sample.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,payment_type,fare_amount
0,2023-06-30 23:59:59,2.0,17.62,1,70.0
1,2023-06-30 23:59:57,1.0,3.32,1,18.4
2,2023-06-30 23:59:55,1.0,2.8,1,14.9
3,2023-06-30 23:59:55,1.0,0.89,1,7.2
4,2023-06-30 23:59:55,4.0,1.56,1,10.0


In [None]:
# Keep only rows that have every field needed downstream and that report a
# strictly positive distance and fare.
required_fields = ['trip_distance', 'fare_amount', 'passenger_count', 'payment_type']

df_sample = (
    df_sample
    .dropna(subset=required_fields)
    .loc[lambda trips: (trips['trip_distance'] > 0) & (trips['fare_amount'] > 0)]
)

# Summary statistics of the cleaned sample.
df_sample.describe()


Unnamed: 0,passenger_count,trip_distance,payment_type,fare_amount
count,94188.0,94188.0,94188.0,94188.0
mean,1.395645,3.857354,1.223404,20.134561
std,0.919535,31.900032,0.47821,19.930086
min,0.0,0.01,1.0,0.01
25%,1.0,1.1,1.0,9.3
50%,1.0,1.85,1.0,13.5
75%,1.0,3.67,1.0,21.9
max,8.0,9673.23,4.0,887.8


In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import mpld3


df_filtered = df_sample[(df_sample['trip_distance'] <= 50) & (df_sample['fare_amount'] <= 500)]

fig, ax = plt.subplots(figsize=(10, 6))

scatter = ax.scatter(df_filtered['trip_distance'], df_filtered['fare_amount'],
                     c=df_filtered['passenger_count'], cmap='viridis', alpha=0.7, s=20)

ax.grid(True, linestyle='--', alpha=0.5)

cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label('Passenger Count')

ax.set_xlabel("Trip Distance (miles)", fontsize=14, color='white')
ax.set_ylabel("Fare Amount ($)", fontsize=14, color='white')
ax.set_title("NYC Taxi: Trip Distance vs Fare Amount (Filtered, Colored by Passenger Count)", fontsize=16, color='white')


mpld3.display(fig)
mpld3.save_html(fig, "visualizations/trip_distance_vs_fare_amount.html")


In [None]:

# Payment type codes kept for the chart; rows with any other code are ignored.
valid_payment_types = [1, 2, 3, 4, 5, 6]
df_filtered_payment = df_sample[df_sample['payment_type'].isin(valid_payment_types)]

# Number of trips per payment type code.
payment_type_counts = df_filtered_payment['payment_type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(payment_type_counts.index, payment_type_counts.values, color='teal', alpha=0.8)

ax.set_xlabel("Payment Type", fontsize=14, color='white')
ax.set_ylabel("Frequency", fontsize=14, color='white')
ax.set_title("NYC Taxi: Payment Method Distribution", fontsize=16, color='white')
ax.grid(True, linestyle='--', alpha=0.5)
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')

# Label every bar with its share of all charted trips.
total_count = payment_type_counts.sum()
for bar in bars:
    height = bar.get_height()
    percentage = (height / total_count) * 100
    ax.annotate(f'{percentage:.2f}%', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=10)

# Save first and finish the cell with mpld3.display(fig): the object it returns
# is only rendered by Jupyter when it is the last expression of the cell, so
# calling it before save_html silently discarded the interactive figure.
mpld3.save_html(fig, "visualizations/payment_method_distribution.html")
mpld3.display(fig)


In [None]:
# Build the frame with .assign instead of writing new columns onto the result
# of dropna(): assigning onto a frame derived from a filtered slice is the
# pattern that raises pandas' SettingWithCopyWarning. .assign returns a new
# frame with the same two extra columns ('pickup_datetime' and 'hour').
df_filtered_hour = (
    df_sample
    .dropna(subset=['tpep_pickup_datetime'])
    .assign(
        pickup_datetime=lambda trips: pd.to_datetime(trips['tpep_pickup_datetime']),
        hour=lambda trips: trips['pickup_datetime'].dt.hour,
    )
)

# Number of trips that started in each hour of the day.
hourly_rides = df_filtered_hour.groupby('hour').size()

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(hourly_rides.index, hourly_rides.values, marker='o', color='green', linestyle='-', linewidth=2)

ax.set_xlabel("Hour of Day", fontsize=14, color='white')
ax.set_ylabel("Number of Rides",fontsize=14, color='white')
ax.set_title("NYC Taxi: Hourly Ride Frequency",fontsize=16, color='white')
ax.grid(True, linestyle='--', alpha=0.5)

# Shade the two rush-hour windows named in the legend.
ax.axvspan(8, 10, color='yellow', alpha=0.3, label="Morning Rush (8-10 AM)")
ax.axvspan(17, 19, color='orange', alpha=0.3, label="Evening Rush (5-7 PM)")

ax.legend()

# Save first and finish the cell with mpld3.display(fig): the object it returns
# is only rendered by Jupyter when it is the last expression of the cell, so
# calling it before save_html silently discarded the interactive figure.
mpld3.save_html(fig, "visualizations/hourly_ride_frequency.html")
mpld3.display(fig)


