In [27]:
import pandas as pd

# load the dataset
df = pd.read_csv("uber_data.csv")

df.info()
df.describe()
df.head()
df.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


0

In [28]:
# checking for missing values
missing = df.isnull().sum()
print(missing[missing > 0])

dropoff_longitude    1
dropoff_latitude     1
dtype: int64


In [29]:
# drop columns from above
df = df.dropna(subset=['dropoff_longitude', 'dropoff_latitude'])

# drop unnamed column
df = df.drop(columns=['Unnamed: 0'])

# Check for missing values again
print(df.isnull().sum())



key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


In [30]:
# Ranging fares between $1 and $100
df = df[(df['fare_amount'] >= 1) & (df['fare_amount'] <= 100)]

# remove trips with identical pickup & dropoff locations
df = df[~((df['pickup_latitude'] == df['dropoff_latitude']) &
          (df['dropoff_longitude'] == df['dropoff_longitude']))]

# checking df shape
print(df.shape)


(194231, 8)


In [31]:
# Remove trips outside of NYC
df = df[(df['pickup_latitude'].between(40.5, 41.0)) &
        (df['pickup_longitude'].between(-74.3, -73.7)) &
        (df['dropoff_latitude'].between(40.5, 41.0)) &
        df['dropoff_longitude'].between(-74.3, -73.7)]

# Checking df shape
print(df.shape)

(193515, 8)


In [32]:
# Create trip_distance variable

#imports
from math import radians, cos, sin, asin, sqrt

# Defining haversine distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371 # Earth's Radius

    # coverting degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))

    return R * c

# Applying function to create trip_distance column
df['trip_distance'] = df.apply(lambda row: haversine(
    row['pickup_latitude'], row['pickup_longitude'],
    row['dropoff_latitude'], row['dropoff_longitude']), axis=1
)

# Checking if column exists
print(df['trip_distance'].describe())

count    193515.000000
mean          3.342294
std           3.561226
min           0.000111
25%           1.280925
50%           2.179727
75%           3.934569
max          36.687406
Name: trip_distance, dtype: float64


In [33]:
# Converting pickup_datetime to datetime
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

# Extracting hour of the day
df['pickup_hour'] = df['pickup_datetime'].dt.hour

# Extracting day of the week
df['pickup_dayofweek'] = df['pickup_datetime'].dt.day_name()

# Check
print(df[['pickup_hour', 'pickup_dayofweek']].head())

   pickup_hour pickup_dayofweek
0           19         Thursday
1           20           Friday
2           21           Monday
3            8           Friday
4           17         Thursday


In [None]:
# Save cleaned dataset as a csv
try:
    df.to_csv("uber_cleaned.csv", index=False)
    print("CSV saved successfully!")
except Exception as e:
    print("Failed to save CSV:", e)



CSV saved successfully!


: 