In [27]:
import pandas as pd
import numpy as np

# Load the CSV file into a pandas DataFrame (update the file path)
uber_df = pd.read_csv("uber.csv")

# Function to calculate the Haversine distance between two points on the Earth
def haversine(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    # Radius of Earth in kilometers
    r = 6371
    return c * r  # Distance in kilometers

# Step 1: Calculate the distance between pickup and dropoff points
uber_df['distance_km'] = haversine(
    uber_df['pickup_latitude'], 
    uber_df['pickup_longitude'], 
    uber_df['dropoff_latitude'], 
    uber_df['dropoff_longitude']
)

# Function to remove outliers using the IQR method
def remove_outliers(df, column):
    # Calculate the Interquartile Range (IQR)
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter the DataFrame to remove outliers
    df_no_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    return df_no_outliers

# Apply the function to remove outliers from both 'fare_amount' and 'distance_km'
uber_df_cleaned_no_outliers = remove_outliers(uber_df, 'fare_amount')
uber_df_cleaned_no_outliers = remove_outliers(uber_df_cleaned_no_outliers, 'distance_km')

# Print the resulting DataFrame
print(uber_df_cleaned_no_outliers)
# Step 2: Drop the unnamed first column and coordinate columns
uber_df_cleaned = uber_df.drop(columns=['Unnamed: 0', 'key', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'passenger_count'])

# Step 3: Modify the 'pickup_datetime' column to only display the time (remove date)
uber_df_cleaned['pickup_time'] = pd.to_datetime(uber_df_cleaned['pickup_datetime']).dt.time

# Drop the original 'pickup_datetime' column
uber_df_cleaned = uber_df_cleaned.drop(columns=['pickup_datetime'])

# Step 4: Calculate the average distance
average_distance = uber_df_cleaned['distance_km'].mean()

# Step 5: Calculate the average price (if 'price' column exists)
if 'fare_amount' in uber_df_cleaned.columns:
    average_price = uber_df_cleaned['fare_amount'].mean()
else:
    average_price = "Price column not found in the dataset."

# Print the averages
print(f"Average Distance (km): {average_distance}")
print(f"Average Price: {average_price}")
print(uber_df_cleaned)  

# Optionally, save the cleaned DataFrame to a new CSV file
# uber_df_cleaned.to_csv('uber_cleaned_with_distance_and_time.csv', index=False)


        Unnamed: 0                            key  fare_amount  \
0         24238194    2015-05-07 19:52:06.0000003          7.5   
1         27835199    2009-07-17 20:04:56.0000002          7.7   
2         44984355   2009-08-24 21:45:00.00000061         12.9   
3         25894730    2009-06-26 08:22:21.0000001          5.3   
4         17610152  2014-08-28 17:47:00.000000188         16.0   
...            ...                            ...          ...   
199994     3189201  2014-01-31 14:42:00.000000181         12.0   
199995    42598914   2012-10-28 10:49:00.00000053          3.0   
199996    16382965    2014-03-14 01:09:00.0000008          7.5   
199998    20259894    2015-05-20 14:56:25.0000004         14.5   
199999    11951496   2010-05-15 04:08:00.00000076         14.1   

                pickup_datetime  pickup_longitude  pickup_latitude  \
0       2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1       2009-07-17 20:04:56 UTC        -73.994355        40.728225 

# 2. Using various models to predict a linear combination of distance and the pickup time to predict the fare amount of the Uber

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Assuming that uber_df_cleaned is already available from your preprocessing steps
# Preprocessing: Extract hour from pickup_time and add it as a new feature
uber_df_cleaned['pickup_hour'] = pd.to_datetime(uber_df_cleaned['pickup_time'], format='%H:%M:%S').dt.hour

# Step 1: Prepare features (distance_km and pickup_hour) and target (fare_amount)
X = uber_df_cleaned[['distance_km', 'pickup_hour']]
y = uber_df_cleaned['fare_amount']

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train models

# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Random Forest Regressor
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

# Gradient Boosting Regressor
gb_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_reg.fit(X_train, y_train)

# Step 4: Make predictions
y_pred_lin = lin_reg.predict(X_test)
y_pred_rf = rf_reg.predict(X_test)
y_pred_gb = gb_reg.predict(X_test)

# Step 5: Evaluate models

def evaluate_model(y_test, y_pred, model_name):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print(f"{model_name} Performance:")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print("-" * 40)

# Evaluate Linear Regression
evaluate_model(y_test, y_pred_lin, "Linear Regression")

# Evaluate Random Forest
evaluate_model(y_test, y_pred_rf, "Random Forest")

# Evaluate Gradient Boosting
evaluate_model(y_test, y_pred_gb, "Gradient Boosting")

ModuleNotFoundError: No module named 'sklearn'