In [15]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt

In [2]:
# Location of the ride records CSV (hard-coded path).
DATA_PATH = '/content/drive/MyDrive/uber_rides_data.xlsx - sample_train.csv'
df = pd.read_csv(DATA_PATH)


In [3]:
# Report the dimensions of the loaded frame.
shape = df.shape
n_rows, n_cols = shape
print("Number of Rows:", n_rows)
print("Number of Columns:", n_cols)


Number of Rows: 200000
Number of Columns: 8


In [4]:
# Restrict the frame to its int64 columns, then count them.
int64_frame = df.select_dtypes(include=['int64'])
integer_columns = int64_frame.columns
num_integer_columns = len(integer_columns)

In [5]:
# Show how many columns have dtype int64 (count computed in the preceding cell).
print("Number of Integer Columns:", num_integer_columns)

Number of Integer Columns (by default): 2


In [6]:
# Count the null entries in the 'dropoff_longitude' column.
dropoff_longitude_is_missing = df['dropoff_longitude'].isna()
missing_values = dropoff_longitude_is_missing.sum()

In [7]:
# Show the null count for 'dropoff_longitude' computed in the preceding cell.
print("Number of missing values in the 'dropoff_longitude' column:", missing_values)

Number of missing values in the 'dropoff_longitude' column: 1


In [9]:
# Look up the dtype of 'pickup_datetime' from the frame's per-column dtype table.
column_dtypes = df.dtypes
pickup_datetime_dtype = column_dtypes['pickup_datetime']

In [10]:
# Show the dtype of 'pickup_datetime' as it was read from the CSV.
print("Data type of 'pickup_datetime' feature:", pickup_datetime_dtype)

Data type of 'pickup_datetime' feature: object


In [11]:
# Remove every row that has at least one missing value.
# NOTE: this mutates `df` in place, so all later cells see the reduced frame.
df.dropna(axis=0, how='any', inplace=True)

In [12]:
# Mean fare over the rows currently in `df`.
fare_amounts = df['fare_amount']
average_fare_amount = fare_amounts.mean()

In [13]:
# Show the mean of 'fare_amount' computed in the preceding cell.
print("Average Fare Amount:", average_fare_amount)

Average Fare Amount: 11.359891549457748


In [34]:
def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371, the Earth's radius in kilometers.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in the unit of ``radius``. Scalar inputs give a scalar;
        array inputs are combined with normal NumPy broadcasting, so scalars
        and arrays may be mixed.

    Notes
    -----
    Coordinates are not range-checked: out-of-range latitudes/longitudes are
    fed through the formula as-is, and NaN inputs yield NaN.
    """
    # Convert each argument on its own (instead of stacking all four into one
    # array) so that inputs of different shapes can broadcast against each other.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would make sqrt(1 - a) NaN; clamp it back into the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

In [36]:
# Calculate the Haversine distance for every ride in one vectorized call.
# `haversine` is built from NumPy ufuncs, so whole coordinate columns can be
# passed at once; this avoids the per-row Python call overhead of
# `df.apply(..., axis=1)`. The result is assigned positionally, one value per row.
df['haversine_distance'] = haversine(
    df['pickup_latitude'].to_numpy(),
    df['pickup_longitude'].to_numpy(),
    df['dropoff_latitude'].to_numpy(),
    df['dropoff_longitude'].to_numpy(),
)

In [32]:
# Median of the per-ride Haversine distances (kilometers).
ride_distances = df['haversine_distance']
median_haversine_distance = ride_distances.median()

In [27]:
# Show the median of the 'haversine_distance' column.
print("Median Haversine Distance (in kilometers):", median_haversine_distance)

Median Haversine Distance (in kilometers): 2.1209923961833708


In [31]:
# Largest per-ride Haversine distance (kilometers).
ride_distances = df['haversine_distance']
max_haversine_distance = ride_distances.max()

In [29]:
# Show the maximum of the 'haversine_distance' column.
print("Maximum Haversine Distance (in kilometers):", max_haversine_distance)

Maximum Haversine Distance (in kilometers): 16409.239135313168


In [30]:
# Select the rides whose Haversine distance is exactly 0.0 and count them.
is_zero_distance = df['haversine_distance'] == 0.0
rides_with_zero_distance = df.loc[is_zero_distance]
num_rides_with_zero_distance = len(rides_with_zero_distance)

In [37]:
# Show how many rides have a Haversine distance of exactly 0.0.
print("Number of rides with 0.0 Haversine distance:", num_rides_with_zero_distance)

Number of rides with 0.0 Haversine distance: 5632


In [38]:
# Keep only the rides whose Haversine distance is exactly 0.0.
is_zero_distance = df['haversine_distance'].eq(0.0)
rides_with_zero_distance = df.loc[is_zero_distance]

In [39]:
# Average fare charged on the zero-distance rides.
zero_distance_fares = rides_with_zero_distance['fare_amount']
mean_fare_amount_zero_distance = zero_distance_fares.mean()

In [40]:
# Show the mean fare over the rides with a Haversine distance of exactly 0.0.
print("Mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_amount_zero_distance)

Mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704546


In [41]:
# Highest fare among the rides in `df`.
fare_amounts = df['fare_amount']
max_fare_amount = fare_amounts.max()

In [42]:
# Show the maximum of the 'fare_amount' column.
print("Maximum 'fare_amount' for a ride:", max_fare_amount)

Maximum 'fare_amount' for a ride: 499.0


In [43]:
# Select every row whose fare equals the overall maximum fare.
highest_fare = df['fare_amount'].max()
is_costliest = df['fare_amount'] == highest_fare
costliest_ride = df.loc[is_costliest]

In [44]:
# Pull the pickup/dropoff coordinates of the first row in `costliest_ride`.
pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = (
    costliest_ride[coordinate_column].iloc[0]
    for coordinate_column in (
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    )
)

In [45]:
# Distance between the pickup and dropoff points of the costliest ride.
haversine_distance = haversine(
    lat1=pickup_lat,
    lon1=pickup_lon,
    lat2=dropoff_lat,
    lon2=dropoff_lon,
)

In [46]:
# Show the Haversine distance computed from the costliest ride's coordinates.
print("Haversine Distance for the Costliest Ride (in kilometers):", haversine_distance)

Haversine Distance for the Costliest Ride (in kilometers): 0.0007899213191009993


In [49]:
# Parse the 'pickup_datetime' column into pandas datetimes, replacing the original column.
raw_pickup_timestamps = df['pickup_datetime']
df['pickup_datetime'] = pd.to_datetime(raw_pickup_timestamps)

In [56]:
# Derive calendar features (year, quarter, month) from the parsed pickup timestamps.
pickup_dt = df['pickup_datetime'].dt

df['pickup_year'] = pickup_dt.year
df['pickup_quarter'] = pickup_dt.quarter
df['pickup_month'] = pickup_dt.month

In [54]:
# Number of rides picked up in 2014 during quarter 1.
picked_up_in_2014 = df['pickup_year'] == 2014
picked_up_in_q1 = df['pickup_quarter'] == 1
rides_in_q1_2014 = len(df[picked_up_in_2014 & picked_up_in_q1])

In [51]:
# Number of rides picked up in 2014.
picked_up_in_2014 = df['pickup_year'].eq(2014)
rides_in_2014 = len(df.loc[picked_up_in_2014])

In [52]:
# Show the number of rows whose 'pickup_year' is 2014.
print("Number of rides recorded in the year 2014:", rides_in_2014)

Number of rides recorded in the year 2014: 29968


In [55]:
# Show the number of rows with 'pickup_year' 2014 and 'pickup_quarter' 1.
print("Number of rides recorded in the first quarter of 2014:", rides_in_q1_2014)

Number of rides recorded in the first quarter of 2014: 7687


In [57]:
# Keep only the rides picked up in month 9 of 2010.
picked_up_in_2010 = df['pickup_year'].eq(2010)
picked_up_in_september = df['pickup_month'].eq(9)
september_2010_rides = df.loc[picked_up_in_2010 & picked_up_in_september]

In [58]:
# Map each September 2010 pickup to its weekday name, then tally rides per weekday.
september_weekday_names = september_2010_rides['pickup_datetime'].dt.day_name()
day_of_week_counts = september_weekday_names.value_counts()

In [60]:
# Weekday with the most rides, and the ride count recorded for that weekday.
max_rides_day = day_of_week_counts.idxmax()
max_rides_count = day_of_week_counts.loc[max_rides_day]

In [61]:
# Show the busiest weekday of September 2010 together with its ride count.
print(f"On {max_rides_day} of September 2010, the maximum rides were recorded with {max_rides_count} rides.")

On Thursday of September 2010, the maximum rides were recorded with 457 rides.


In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [79]:
# Parse the pickup timestamps (again) and derive the weekday-name feature 'ride_week_day'.
pickup_timestamps = pd.to_datetime(df['pickup_datetime'])
df['pickup_datetime'] = pickup_timestamps
df['ride_week_day'] = pickup_timestamps.dt.day_name()

In [94]:
# Duplicate 'passenger_count' under the plural name 'passenger_counts',
# which is the column name selected as a model feature.
passenger_count_values = df['passenger_count']
df['passenger_counts'] = passenger_count_values

In [96]:
# Feature matrix (X) and regression target (y).
feature_columns = ['passenger_counts', 'haversine_distance', 'ride_week_day']
target_column = 'fare_amount'

X = df[feature_columns]
y = df[target_column]

In [97]:
# Creating a ColumnTransformer that one-hot encodes 'ride_week_day' and passes
# every other feature column through unchanged.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a weekday
# label that was not present when it was fitted, instead of raising at
# transform/predict time; encoding of labels seen during fit is unaffected.
preprocessor = ColumnTransformer(
    transformers=[
        ('ride_week_day', OneHotEncoder(handle_unknown='ignore'), ['ride_week_day'])
    ],
    remainder='passthrough'
)

In [98]:
# Chain the column preprocessing with a linear regression model.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [99]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [100]:
# Fit the preprocessing step and the regression model on the training split.
pipeline.fit(X=X_train, y=y_train)

In [106]:
# Predict fares for the held-out test rows.
y_pred = pipeline.predict(X=X_test)

In [108]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

In [112]:
# Error metrics comparing the test-set predictions with the true fares.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the MSE.
rmse = np.sqrt(mse)

In [113]:
# Show the test-set error metrics, each rounded to two decimal places.
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Mean Absolute Error (MAE): 6.05
Mean Squared Error (MSE): 102.76
Root Mean Squared Error (RMSE): 10.14
