# Big Data Analytics – NYC Taxi Sample Dataset

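This notebook demonstrates basic analytics on a small sample dataset structured like real NYC taxi trip records. It covers a data overview with basic cleaning, descriptive statistics, average trip duration, revenue, tipping behaviour, and the correlation between trip distance and fare.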

In [7]:
import pandas as pd

# Read the taxi-trip sample; the path is relative to the notebook's working directory.
csv_path = 'nyc_taxi_sample.csv'
df = pd.read_csv(csv_path)

# Preview the first rows as a quick check that the columns parsed as expected.
df.head()


Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,fare_amount,tip_amount,total_amount
0,2023-01-01 08:00:00,2023-01-01 08:15:00,1,1.2,6.5,1.0,7.5
1,2023-01-01 09:00:00,2023-01-01 09:15:00,2,3.5,14.2,2.5,16.7
2,2023-01-01 10:00:00,2023-01-01 10:15:00,1,2.1,9.8,0.0,9.8
3,2023-01-01 11:00:00,2023-01-01 11:15:00,3,5.0,21.0,4.0,25.0
4,2023-01-01 12:00:00,2023-01-01 12:15:00,2,0.8,5.5,0.0,5.5


## Task 1 – Data Overview

In [17]:
# Report the size of the frame before any cleaning is applied.
print("Number of rows", len(df))
print("Number of columns", df.shape[1])

# Per-column count of null cells; in this sample every count is 0.
print("Number of missing values in each column")
print(df.isnull().sum())

print("\n")

# A trip is kept only when both its distance and its fare are strictly
# positive. Note that this drops zero values as well as negative ones.
is_valid_trip = (df['trip_distance'] > 0) & (df['fare_amount'] > 0)

# Build the cleaned frame in a single expression: `.loc[...]` selects the valid
# rows and `.assign(...)` returns a new frame with missing tips replaced by 0
# (a missing tip is treated as "no tip"). Nothing is written into a filtered
# slice, which is the pattern that can raise pandas' SettingWithCopyWarning.
df = df.loc[is_valid_trip].assign(
    tip_amount=lambda trips: trips['tip_amount'].fillna(0)
)
df.head()





Number of rows 10
Number of columns 7
Number of missing values in each column
pickup_datetime     0
dropoff_datetime    0
passenger_count     0
trip_distance       0
fare_amount         0
tip_amount          0
total_amount        0
dtype: int64




Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,fare_amount,tip_amount,total_amount
0,2023-01-01 08:00:00,2023-01-01 08:15:00,1,1.2,6.5,1.0,7.5
1,2023-01-01 09:00:00,2023-01-01 09:15:00,2,3.5,14.2,2.5,16.7
2,2023-01-01 10:00:00,2023-01-01 10:15:00,1,2.1,9.8,0.0,9.8
3,2023-01-01 11:00:00,2023-01-01 11:15:00,3,5.0,21.0,4.0,25.0
4,2023-01-01 12:00:00,2023-01-01 12:15:00,2,0.8,5.5,0.0,5.5


## Task 2 – Descriptive Statistics

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the three
# numeric trip measures.
stat_columns = ['trip_distance', 'fare_amount', 'total_amount']
df[stat_columns].describe()



Unnamed: 0,trip_distance,fare_amount,total_amount
count,10.0,10.0,10.0
mean,3.03,13.28,15.42
std,1.757555,6.816777,8.387928
min,0.8,5.5,5.5
25%,1.65,7.85,8.75
50%,2.85,12.25,14.5
75%,4.025,16.525,19.55
max,6.3,26.8,31.8


## Task 3 – Average Trip Duration

In [None]:

# Parse both timestamp columns so they can be subtracted from one another.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Trip length in minutes: elapsed time between drop-off and pickup, in seconds, / 60.
elapsed = df['dropoff_datetime'] - df['pickup_datetime']
df['trip_duration_min'] = elapsed.dt.total_seconds() / 60

# Mean trip duration (minutes) over all trips in the frame.
df['trip_duration_min'].mean()


## Task 4 – Revenue Analysis

In [None]:

# Revenue figures are taken from the `total_amount` column.
trip_totals = df['total_amount']

total_revenue = trip_totals.sum()   # summed over every trip
avg_revenue = trip_totals.mean()    # average per trip

# Display both numbers as a (total, average) pair.
(total_revenue, avg_revenue)


## Task 5 – Tip Analysis

In [None]:

tips = df['tip_amount']

# Mean tip across all trips, including those with no tip.
avg_tip = tips.mean()

# Share of trips with a tip of exactly 0: the mean of a boolean mask is the
# fraction of True values, scaled here to a percentage.
no_tip_mask = tips == 0
zero_tip_pct = no_tip_mask.mean() * 100

# Display both numbers as an (average tip, zero-tip percentage) pair.
(avg_tip, zero_tip_pct)


## Task 6 – Distance vs Fare Correlation

In [None]:
# Pairwise correlation matrix between trip distance and fare
# (`DataFrame.corr()` uses the Pearson method by default).
distance_fare = df[['trip_distance', 'fare_amount']]
distance_fare.corr()