### Instructions:
* Download the Yellow Taxi Data for April from the main source here https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
* Use pandas to inspect the data and perform some transformations
* Finally, write the output as Parquet file 

In [81]:
import pandas as pd

In [82]:
df = pd.read_parquet('yellow_tripdata_2024-04.parquet')

In [83]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2024-04-01 00:02:40,2024-04-01 00:30:42,0.0,5.2,1.0,N,161,7,1,29.6,3.5,0.5,8.65,0.0,1.0,43.25,2.5,0.0
1,2,2024-04-01 00:41:12,2024-04-01 00:55:29,1.0,5.6,1.0,N,264,264,1,25.4,1.0,0.5,10.0,0.0,1.0,37.9,0.0,0.0
2,2,2024-04-01 00:48:42,2024-04-01 01:05:30,1.0,3.55,1.0,N,186,236,1,20.5,1.0,0.5,5.1,0.0,1.0,30.6,2.5,0.0
3,2,2024-04-01 00:56:02,2024-04-01 01:05:09,1.0,1.06,1.0,N,137,164,2,10.0,1.0,0.5,0.0,0.0,1.0,15.0,2.5,0.0
4,1,2024-04-01 00:08:32,2024-04-01 00:10:24,1.0,0.7,1.0,N,236,263,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0


In [87]:
df['tpep_pickup_datetime'].to_csv('inspect.csv')

In [56]:
df.shape

(3514289, 19)

In [57]:
# inspect the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3514289 entries, 0 to 3514288
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [58]:
# calculate Trip Duration in Minutes 

df['Trip_Duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() // 60

In [59]:
# Calculate Total Trip Charge to inclide: fare amount, extra, mta_tax, tolls amount, improvement surcharge, congestion surcharge, airport fee, and tip amount
df['Total_Trip_Charge'] = df['fare_amount'] + df['extra'] + df['mta_tax'] + df['tolls_amount'] + df['improvement_surcharge'] + df['congestion_surcharge'] + df['Airport_fee'] + df['tip_amount']

In [60]:
df['passenger_count'].value_counts()

passenger_count
1.0    2399938
2.0     445306
3.0     104075
4.0      64286
0.0      39591
5.0      31597
6.0      20898
8.0         12
7.0          6
9.0          4
Name: count, dtype: int64

In [62]:
# Add Trip Date, Trip Month, Trip Day, and Trip Year

df['Trip_Date'] = pd.to_datetime(df['tpep_pickup_datetime'].dt.date)
df['Trip_Month'] = df['Trip_Date'].dt.month_name()
df['Trip_Day'] = df['Trip_Date'].dt.day_name()
df['Trip_Year'] = df['Trip_Date'].dt.year

In [63]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,Trip_Duration,Total_Trip_Charge,Trip_Date,Trip_Month,Trip_Day,Trip_Year
0,1,2024-04-01 00:02:40,2024-04-01 00:30:42,0.0,5.2,1.0,N,161,7,1,...,1.0,43.25,2.5,0.0,28.0,45.75,2024-04-01,April,Monday,2024
1,2,2024-04-01 00:41:12,2024-04-01 00:55:29,1.0,5.6,1.0,N,264,264,1,...,1.0,37.9,0.0,0.0,14.0,37.9,2024-04-01,April,Monday,2024
2,2,2024-04-01 00:48:42,2024-04-01 01:05:30,1.0,3.55,1.0,N,186,236,1,...,1.0,30.6,2.5,0.0,16.0,30.6,2024-04-01,April,Monday,2024
3,2,2024-04-01 00:56:02,2024-04-01 01:05:09,1.0,1.06,1.0,N,137,164,2,...,1.0,15.0,2.5,0.0,9.0,15.0,2024-04-01,April,Monday,2024
4,1,2024-04-01 00:08:32,2024-04-01 00:10:24,1.0,0.7,1.0,N,236,263,1,...,1.0,12.1,2.5,0.0,1.0,14.6,2024-04-01,April,Monday,2024


In [64]:
# memory used
df.memory_usage()

Index                         128
VendorID                 14057156
tpep_pickup_datetime     28114312
tpep_dropoff_datetime    28114312
passenger_count          28114312
trip_distance            28114312
RatecodeID               28114312
store_and_fwd_flag       28114312
PULocationID             14057156
DOLocationID             14057156
payment_type             28114312
fare_amount              28114312
extra                    28114312
mta_tax                  28114312
tip_amount               28114312
tolls_amount             28114312
improvement_surcharge    28114312
total_amount             28114312
congestion_surcharge     28114312
Airport_fee              28114312
Trip_Duration            28114312
Total_Trip_Charge        28114312
Trip_Date                28114312
Trip_Month               28114312
Trip_Day                 28114312
Trip_Year                14057156
dtype: int64

In [65]:
# inspect after
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3514289 entries, 0 to 3514288
Data columns (total 25 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [66]:
# list existing columns
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee', 'Trip_Duration',
       'Total_Trip_Charge', 'Trip_Date', 'Trip_Month', 'Trip_Day',
       'Trip_Year'],
      dtype='object')

In [69]:
# keep only specific columns 
# 'VendorID', 'passenger_count', 'trip_distance', 'store_and_fwd_flag', 'payment_type', 'Trip_Duration', 'Total_Trip_Charge', 'Trip_Date', 'Trip_Month', 'Trip_Year', 'Trip_Day'

# Reorder the columns, starting with Vendor ID, followed by all the different date/time columns example Date, Year, Month, and Day, then followed by the remaining columns 

cols = ['VendorID', 'Trip_Date', 'Trip_Year', 'Trip_Month', 'Trip_Day',
       'passenger_count', 'trip_distance', 'store_and_fwd_flag',
       'payment_type', 'Trip_Duration',
       'Total_Trip_Charge' ]

In [70]:
df = df[cols]
df.head()

Unnamed: 0,VendorID,Trip_Date,Trip_Year,Trip_Month,Trip_Day,passenger_count,trip_distance,store_and_fwd_flag,payment_type,Trip_Duration,Total_Trip_Charge
0,1,2024-04-01,2024,April,Monday,0.0,5.2,N,1,28.0,45.75
1,2,2024-04-01,2024,April,Monday,1.0,5.6,N,1,14.0,37.9
2,2,2024-04-01,2024,April,Monday,1.0,3.55,N,1,16.0,30.6
3,2,2024-04-01,2024,April,Monday,1.0,1.06,N,2,9.0,15.0
4,1,2024-04-01,2024,April,Monday,1.0,0.7,N,1,1.0,14.6


In [72]:
# Rename the columns 
df.rename(columns={
    'VendorID': 'Vendor_ID',
    'passenger_count': 'No_of_Passengers',
    'store_and_fwd_flag': 'SF_Flag',
    'payment_type': 'Payment_Type'
}, inplace=True)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={


Unnamed: 0,Vendor_ID,Trip_Date,Trip_Year,Trip_Month,Trip_Day,No_of_Passengers,trip_distance,SF_Flag,Payment_Type,Trip_Duration,Total_Trip_Charge
0,1,2024-04-01,2024,April,Monday,0.0,5.2,N,1,28.0,45.75
1,2,2024-04-01,2024,April,Monday,1.0,5.6,N,1,14.0,37.9
2,2,2024-04-01,2024,April,Monday,1.0,3.55,N,1,16.0,30.6
3,2,2024-04-01,2024,April,Monday,1.0,1.06,N,2,9.0,15.0
4,1,2024-04-01,2024,April,Monday,1.0,0.7,N,1,1.0,14.6


In [75]:
# Save as Parquet parition by Year, Month
df.to_parquet('Yellow Taxi Transformed.parquet', partition_cols=['Trip_Year', 'Trip_Month'])

**Challenge**

* Inspect the Parquet folder, look at the partitions. Is there someting strange? Yes/No
* If Yes. Find which rows are causing this and capture them
* What do you think happened?