In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the Uber trip records; the path is relative to the notebook's
# working directory.
df = pd.read_csv('uber.csv')

# Print the column names together with the dtypes pandas inferred for them.
print(df.dtypes)

# Display the first few rows. This has to be the final expression of the
# cell: a bare expression anywhere else is evaluated and silently discarded.
df.head()

Unnamed: 0             int64
key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object


### Handle missing values

In [3]:
# Count missing values per column BEFORE cleaning and report them, so the
# effect of the drop below is visible in the notebook output.
missing_values = df.isnull().sum()
rows_before = len(df)
print("Missing values per column before cleaning:")
print(missing_values[missing_values > 0])

# Drop every row that contains at least one missing value.
# (If imputation is ever preferred over dropping, use df.ffill(); the older
# fillna(method='ffill') spelling is deprecated in recent pandas releases.)
df = df.dropna()
print(f"Dropped {rows_before - len(df)} row(s) containing missing values.")

# Re-count to confirm that no missing values remain; shown as the cell output.
missing_values_after = df.isnull().sum()
missing_values_after

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [4]:
# Physically valid coordinate ranges in degrees, as inclusive (min, max) pairs.
valid_longitude = (-180, 180)
valid_latitude = (-90, 90)

# All four trip endpoints must lie inside the valid ranges
# (Series.between is inclusive on both ends by default).
coordinates_ok = (
    df['pickup_longitude'].between(*valid_longitude)
    & df['pickup_latitude'].between(*valid_latitude)
    & df['dropoff_longitude'].between(*valid_longitude)
    & df['dropoff_latitude'].between(*valid_latitude)
)

# Fares must be strictly positive and a trip must carry 1 to 6 passengers.
# NOTE(review): the upper bound of 6 is kept from the original filter;
# presumably it is the largest plausible vehicle capacity - confirm.
fare_ok = df['fare_amount'] > 0
passengers_ok = (df['passenger_count'] > 0) & (df['passenger_count'] <= 6)

# Apply every rule in a single pass. The explicit .copy() makes the result an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
df = df[coordinates_ok & fare_ok & passengers_ok].copy()

df.shape

(199256, 9)

## Data Transformation

In [5]:
# Parse the pickup timestamps. utc=True guarantees a timezone-aware (UTC)
# column whether or not the raw strings carry a zone suffix, so the
# tz_convert(None) call further down cannot fail on timezone-naive input.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True)

# Reuse one datetime accessor for all derived calendar features.
pickup = df['pickup_datetime'].dt

df['pickup_year'] = pickup.year
df['pickup_month'] = pickup.month
df['pickup_day'] = pickup.day
df['pickup_hour'] = pickup.hour
df['pickup_minute'] = pickup.minute
df['pickup_day_of_week'] = pickup.dayofweek  # Monday = 0 ... Sunday = 6

# Time-of-day bucket from the pickup hour, using left-closed bins:
#   0 - Night     [0, 4)
#   1 - Morning   [4, 12)
#   2 - Afternoon [12, 18)
#   3 - Evening   [18, 24)
df['pickup_time_of_day'] = pd.cut(
    pickup.hour, bins=[0, 4, 12, 18, 24], labels=[0, 1, 2, 3], right=False
)

# Timezone-naive calendar date of the pickup (time component set to
# midnight); used as the lookup key for the per-day tables joined later.
# NOTE(review): this is the UTC calendar date. If the daily tables are keyed
# by local (New York) dates, late-evening local trips land on the following
# day - confirm whether a conversion to America/New_York is intended.
df['pickup_datetime_naive'] = pickup.tz_convert(None).dt.normalize()

# Show the first few rows after the transformation.
print("First few rows after datetime transformation:")
print(df.head())

# Persist the intermediate result.
df.to_csv('updated.csv', index=False)

First few rows after datetime transformation:
   Unnamed: 0                            key  fare_amount  \
0    24238194    2015-05-07 19:52:06.0000003          7.5   
1    27835199    2009-07-17 20:04:56.0000002          7.7   
2    44984355   2009-08-24 21:45:00.00000061         12.9   
3    25894730    2009-06-26 08:22:21.0000001          5.3   
4    17610152  2014-08-28 17:47:00.000000188         16.0   

            pickup_datetime  pickup_longitude  pickup_latitude  \
0 2015-05-07 19:52:06+00:00        -73.999817        40.738354   
1 2009-07-17 20:04:56+00:00        -73.994355        40.728225   
2 2009-08-24 21:45:00+00:00        -74.005043        40.740770   
3 2009-06-26 08:22:21+00:00        -73.976124        40.790844   
4 2014-08-28 17:47:00+00:00        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  pickup_year  \
0         -73.999512         40.723217                1         2015   
1         -73.994710         40.750325        

## Data Extending
### Adding weather data to our dataset from [NOAA Climate Data Online (CDO)](https://www.ncei.noaa.gov/cdo-web/):

In [6]:
# Load the weather observations from 'nyc_weather.csv'.
pre_processed_weather_df = pd.read_csv('nyc_weather.csv')

# Per-column count of missing entries; displayed as the cell output.
missing_values = pre_processed_weather_df.isna().sum()
missing_values

STATION       0
DATE          0
AWND         35
EVAP       2922
RHAV         80
RHMN         78
RHMX         78
SNOW          0
SNWD          0
TAVG       2922
TMAX          0
TMIN          0
TOBS       2922
TSUN       2922
WDMV       2922
WESD       2922
WSFM       2922
WT01       2040
WT03       2922
WT07       2871
WT14       2921
WT16       2344
WT17       2914
dtype: int64

In [7]:
# Only the observation date and these four measurements are carried forward.
weather_columns = ['DATE', 'SNOW', 'SNWD', 'TMAX', 'TMIN']

# Select the columns, parse DATE into real timestamps and use it as the
# index so that rows can be looked up by date.
weather_df = (
    pre_processed_weather_df[weather_columns]
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
    .set_index('DATE')
)

weather_df.head()

Unnamed: 0_level_0,SNOW,SNWD,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-01-01,0,0,-3.3,-9.4
2009-01-02,0,0,1.1,-5.0
2009-01-03,0,0,3.3,-1.7
2009-01-04,0,0,5.6,-3.9
2009-01-05,0,0,6.1,3.3


In [8]:
# Attach each daily weather measurement to every trip by looking up the
# trip's pickup date in the date-indexed weather table. Dates that are absent
# from the weather table produce NaN.
for weather_column in ('SNOW', 'SNWD', 'TMAX', 'TMIN'):
    df[weather_column] = df['pickup_datetime_naive'].map(weather_df[weather_column])

# Persist the enriched dataset.
df.to_csv('updated.csv', index=False)

# Preview the result. Kept as the final expression so that it is actually
# rendered; a bare expression earlier in the cell would be discarded.
df.head()

### Holiday data from the Kaggle dataset [US Holiday Dates (2004-2021)](https://www.kaggle.com/datasets/donnetew/us-holiday-dates-2004-2021):


In [9]:
# Read the CSV file into a DataFrame
us_holidays_df = pd.read_csv('us_holidays.csv')

us_holidays_df['Date'] = pd.to_datetime(us_holidays_df['Date'])

# Display the DataFrame
us_holidays_df.dtypes

Date       datetime64[ns]
Holiday            object
WeekDay            object
Month               int64
Day                 int64
Year                int64
dtype: object

In [10]:
# Independent copy of just the holiday dates (a Series, despite the _df suffix).
holidays_dates_df = us_holidays_df.loc[:, 'Date'].copy()
holidays_dates_df.head(n=5)

0   2004-07-04
1   2005-07-04
2   2006-07-04
3   2007-07-04
4   2008-07-04
Name: Date, dtype: datetime64[ns]

In [11]:
# Flag trips whose pickup date is a listed holiday (1 = holiday, 0 = not).
#
# A membership test is used instead of a left merge on the date: if the
# holiday list contains the same date more than once (for example two
# holidays falling on the same day), a merge emits one output row per match
# and silently duplicates trips. isin() can never change the number of rows.
#
# The index is reset first so the frame ends up with a fresh 0..n-1 index
# before sorting, which is what the merge-based version produced.
df = df.reset_index(drop=True)
df['is_holiday'] = df['pickup_datetime_naive'].isin(holidays_dates_df).astype(int)

# Order the trips chronologically and persist the final dataset.
df = df.sort_values(by='pickup_datetime')
df.to_csv('uber_updated.csv', index=False)

In [12]:
# Preview the first five rows of the fully enriched, time-sorted dataset.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,...,pickup_hour,pickup_minute,pickup_day_of_week,pickup_time_of_day,pickup_datetime_naive,SNOW,SNWD,TMAX,TMIN,is_holiday
100584,55072468,2009-01-01 01:15:22.0000006,8.5,2009-01-01 01:15:22+00:00,-73.981918,40.779456,-73.957685,40.771043,2,2009,...,1,15,3,0,2009-01-01,0,0,-3.3,-9.4,1
43861,30813112,2009-01-01 01:59:17.0000001,13.0,2009-01-01 01:59:17+00:00,-73.983759,40.721389,-73.994833,40.687179,2,2009,...,1,59,3,0,2009-01-01,0,0,-3.3,-9.4,1
7616,8688426,2009-01-01 02:05:03.0000003,10.6,2009-01-01 02:05:03+00:00,-73.956635,40.771254,-73.991528,40.749778,2,2009,...,2,5,3,0,2009-01-01,0,0,-3.3,-9.4,1
118451,13190369,2009-01-01 02:09:13.0000003,12.2,2009-01-01 02:09:13+00:00,-73.984605,40.72802,-73.955746,40.77683,1,2009,...,2,9,3,0,2009-01-01,0,0,-3.3,-9.4,1
89530,45716268,2009-01-01 02:13:41.0000001,11.0,2009-01-01 02:13:41+00:00,-73.980127,40.737425,-74.009544,40.726025,4,2009,...,2,13,3,0,2009-01-01,0,0,-3.3,-9.4,1
