In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, preprocessing, model_selection, metrics, ensemble
import warnings
import zipfile
import os

# Ignore warnings
warnings.filterwarnings('ignore')

# Extract the specific files from the outer ZIP archive
with zipfile.ZipFile('/content/drive/MyDrive/nyc-taxi-trip-duration.zip', 'r') as outer_zip:
    outer_zip.extractall('/content/temp_extracted')

# Extract the train.zip file
with zipfile.ZipFile('/content/temp_extracted/train.zip', 'r') as train_zip:
    train_zip.extractall('/content/temp_extracted/train')

# Load training data
train_file_path = os.path.join('/content/temp_extracted/train', 'train.csv')
taxi_data = pd.read_csv(train_file_path)
print('Train data shape: {}'.format(taxi_data.shape))
taxi_data.head()

# Convert datetime features to the correct format
taxi_data['pickup_datetime'] = pd.to_datetime(taxi_data['pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
taxi_data['dropoff_datetime'] = pd.to_datetime(taxi_data['dropoff_datetime'], format='%Y-%m-%d %H:%M:%S')

# Function to add datetime features
def add_datetime_features(data):
    data['pickup_date'] = data['pickup_datetime'].dt.date
    data['pickup_hour'] = data['pickup_datetime'].dt.hour
    data['pickup_day_of_week'] = data['pickup_datetime'].dt.dayofweek
    return data

# Apply the function
add_datetime_features(taxi_data)

# Check the transformation
print(taxi_data[['pickup_date', 'pickup_hour', 'pickup_day_of_week']].head())
print('Quantity of trips daily:', np.round(taxi_data['pickup_date'].count()/taxi_data['pickup_date'].nunique()))

# Function to calculate haversine distance
def get_haversine_distance(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    EARTH_RADIUS = 6371
    lat_delta = lat2 - lat1
    lng_delta = lng2 - lng1
    d = np.sin(lat_delta * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng_delta * 0.5) ** 2
    h = 2 * EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

# Function to calculate angle direction
def get_angle_direction(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    lng_delta_rad = lng2 - lng1
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    alpha = np.degrees(np.arctan2(y, x))
    return alpha

# Function to add geographical features
def add_geographical_features(data):
    data['haversine_distance'] = get_haversine_distance(data['pickup_latitude'], data['pickup_longitude'], data['dropoff_latitude'], data['dropoff_longitude'])
    data['direction'] = get_angle_direction(data['pickup_latitude'], data['pickup_longitude'], data['dropoff_latitude'], data['dropoff_longitude'])
    return data

# Apply the function
add_geographical_features(taxi_data)

# Check new features
print(taxi_data[['haversine_distance', 'direction']].head())

# Encode 'store_and_fwd_flag' column
taxi_data['store_and_fwd_flag'] = taxi_data['store_and_fwd_flag'].map({'N': 0, 'Y': 1})

# Log transform trip duration
taxi_data['trip_duration_log'] = np.log(taxi_data['trip_duration'] + 1)

# Drop unnecessary columns
taxi_data.drop(['id', 'pickup_datetime', 'dropoff_datetime', 'pickup_date'], axis=1, inplace=True)
print('Shape of data after dropping columns:  {}'.format(taxi_data.shape))

# Define features and target
X = taxi_data.drop(['trip_duration', 'trip_duration_log'], axis=1)
y = taxi_data['trip_duration']
y_log = taxi_data['trip_duration_log']

# Split the data into training and validation sets
X_train, X_valid, y_train_log, y_valid_log = model_selection.train_test_split(
    X, y_log,
    test_size=0.33,
    random_state=42
)

# Normalize the data
scaler = preprocessing.MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

# Train Linear Regression model
lr_model = linear_model.LinearRegression()
lr_model.fit(X_train, y_train_log)
y_train_pred = lr_model.predict(X_train)
y_valid_pred = lr_model.predict(X_valid)
print('Linear Regression Train RMSLE: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_train_log, y_train_pred))))
print('Linear Regression Valid RMSLE: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_valid_log, y_valid_pred))))

# Train Random Forest model
rf_model = ensemble.RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42)
rf_model.fit(X_train, y_train_log)
y_train_pred = rf_model.predict(X_train)
y_valid_pred = rf_model.predict(X_valid)
print('Random Forest Train RMSLE: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_train_log, y_train_pred))))
print('Random Forest Valid RMSLE: {:.2f}'.format(np.sqrt(metrics.mean_squared_error(y_valid_log, y_valid_pred))))

# Convert predictions back to original scale
y_valid = np.exp(y_valid_log) - 1
y_lr_pred = np.exp(y_valid_pred) - 1
y_rf_pred = np.exp(y_valid_pred) - 1

# Calculate evaluation metrics
def evaluate(y_true, y_pred):
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    return mse, rmse, mae, r2

lr_mse, lr_rmse, lr_mae, lr_r2 = evaluate(y_valid, y_lr_pred)
rf_mse, rf_rmse, rf_mae, rf_r2 = evaluate(y_valid, y_rf_pred)

print(f'Linear Regression MSE: {lr_mse:.2f}, RMSE: {lr_rmse:.2f}, MAE: {lr_mae:.2f}, R2: {lr_r2:.2f}')
print(f'Random Forest MSE: {rf_mse:.2f}, RMSE: {rf_rmse:.2f}, MAE: {rf_mae:.2f}, R2: {rf_r2:.2f}')

Train data shape: (1458644, 11)
  pickup_date  pickup_hour  pickup_day_of_week
0  2016-03-14           17                   0
1  2016-06-12            0                   6
2  2016-01-19           11                   1
3  2016-04-06           19                   2
4  2016-03-26           13                   5
Quantity of trips daily: 8015.0
   haversine_distance   direction
0            1.498521   99.970196
1            1.805507 -117.153768
2            6.385098 -159.680165
3            1.485498 -172.737700
4            1.188588  179.473585
Shape of data after dropping columns:  (1458644, 13)
Linear Regression Train RMSLE: 0.65
Linear Regression Valid RMSLE: 0.64
Random Forest Train RMSLE: 0.41
Random Forest Valid RMSLE: 0.43
Linear Regression MSE: 9720032.95, RMSE: 3117.70, MAE: 317.26, R2: 0.03
Random Forest MSE: 9720032.95, RMSE: 3117.70, MAE: 317.26, R2: 0.03


In [4]:
def calculate_accuracy(y_true, y_pred, tolerance=0.1):
    """
    Calculate the custom accuracy for regression where a prediction is
    considered accurate if it falls within a certain percentage (tolerance) of the actual value.

    Parameters:
    - y_true: Actual values
    - y_pred: Predicted values
    - tolerance: Tolerance percentage (default is 10%)

    Returns:
    - accuracy: The custom accuracy metric
    """
    accurate_predictions = np.abs(y_true - y_pred) <= (tolerance * y_true)
    accuracy = np.mean(accurate_predictions)
    return accuracy

# Example usage:
# Assuming y_valid contains the actual values and y_pred contains the predicted values
# Custom accuracy with a 10% tolerance
accuracy_lr = calculate_accuracy(y_valid, y_lr_pred, tolerance=0.1)
accuracy_rf = calculate_accuracy(y_valid, y_rf_pred, tolerance=0.1)

print(f'Linear Regression Accuracy: {accuracy_lr:.2f}')
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')

Linear Regression Accuracy: 0.26
Random Forest Accuracy: 0.26
