In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.multicomp as mc
from statsmodels.formula.api import ols
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,ridge_regression,Ridge,Lasso,LassoCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_squared_log_error
import xgboost as xgb

In [None]:
# Mount Google Drive at /content/drive so the CSV read further down is reachable.
# Colab-only: `google.colab` is not importable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned trip data from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/data/data_cleaned_v3.csv'
df = pd.read_csv(csv_path)

In [None]:
# Work on a coordinate-free frame; `drop` already returns a new DataFrame,
# so `df` itself is left untouched.
data = df.drop(
    columns=[
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    ]
)

In [None]:
# Remove exact duplicate rows and renumber the index 0..n-1 in one step.
data = data.drop_duplicates(ignore_index=True)

In [None]:
# Column names, dtypes and memory footprint of the de-duplicated frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249335 entries, 0 to 2249334
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   pickup_datetime         object 
 1   Trip Seconds            float64
 2   Company                 int64  
 3   Pickup Community Area   float64
 4   Dropoff Community Area  float64
 5   month                   int64  
 6   hour                    int64  
 7   day_of_week             int64  
 8   day                     int64  
 9   distance                float64
 10  tempearture             float64
 11  precipitation           float64
 12  humidity                float64
 13  wind_speed              float64
 14  is_rush_hour            int64  
 15  is_work_day             int64  
dtypes: float64(8), int64(7), object(1)
memory usage: 274.6+ MB


In [None]:
# (rows, columns) of the de-duplicated frame.
data.shape

(2249335, 16)

In [None]:
# Target: trip duration in seconds. Predictors: every remaining column.
y = data['Trip Seconds']
X = data.drop(columns='Trip Seconds')

In [None]:
# Log-transform distance, humidity and precipitation with log(1 + x).
# `np.log1p` already adds the 1, so no extra `+ 1` is applied (that would
# compute log(x + 2) instead).
# The values are read from `data` (untransformed) rather than from `X`, so
# re-running this cell does not apply the logarithm a second time.
log_scaled_columns = ['distance', 'humidity', 'precipitation']
X[log_scaled_columns] = np.log1p(data[log_scaled_columns])

In [None]:
# The raw timestamp string is not used as a model input.
# Rebinding (instead of `inplace=True`) plus `errors='ignore'` keeps the cell
# safe to re-run: a second run no longer raises KeyError for the missing column.
X = X.drop(columns=['pickup_datetime'], errors='ignore')

In [None]:
# Column groups that are switched on/off to build the four feature sets.
# NOTE: 'tempearture' is the column's actual (misspelled) name in the data.
weather_cols = ['humidity', 'precipitation', 'tempearture', 'wind_speed']
congestion_cols = ['is_work_day', 'is_rush_hour']

# Baseline: neither weather nor congestion columns.
X_original = X.drop(columns=weather_cols + congestion_cols)
# Baseline + weather (congestion flags removed).
X_original_weather = X.drop(columns=congestion_cols)
# Baseline + congestion flags (weather removed).
X_original_congestion = X.drop(columns=weather_cols)
# Everything.
X_original_weather_congestion = X.copy()

In [None]:
# Estimators to compare, keyed by the label used in the results table.
models = {
    # Optional, currently disabled:
    # 'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=6, learning_rate=0.1),
    'Ridge': Ridge(alpha=0.1),
    'Linear Regression': LinearRegression(),
}

# Feature sets to compare, keyed by the label used in the results table.
# The frames are passed as-is; a full `.iloc[:, :]` slice selects every row and
# column and therefore adds nothing.
feature_sets = {
    'Original Dataset': X_original,
    'Original + Weather Features': X_original_weather,
    'Original + Congestion Features': X_original_congestion,
    'Original + Weather + Congestion Features': X_original_weather_congestion,
}


In [None]:
# Inspect the full feature matrix (all weather and congestion columns included).
X_original_weather_congestion

Unnamed: 0,Company,Pickup Community Area,Dropoff Community Area,month,hour,day_of_week,day,distance,tempearture,precipitation,humidity,wind_speed,is_rush_hour,is_work_day
0,32,6.0,32.0,1,0,6,1,2.469426,2.05,0.928219,4.583742,5.17,0,0
1,29,24.0,8.0,1,0,6,1,1.747667,2.05,0.928219,4.583742,5.17,0,0
2,29,32.0,8.0,1,0,6,1,1.604972,2.05,0.928219,4.583742,5.17,0,0
3,19,23.0,8.0,1,0,6,1,2.230328,2.05,0.928219,4.583742,5.17,0,0
4,35,8.0,32.0,1,0,6,1,1.604928,2.05,0.928219,4.583742,5.17,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249330,30,33.0,35.0,8,0,1,1,1.524061,21.22,0.693147,4.347823,3.65,0,1
2249331,13,8.0,21.0,8,0,1,1,2.548763,21.22,0.693147,4.347823,3.65,0,1
2249332,30,28.0,41.0,8,0,1,1,2.820300,21.22,0.693147,4.347823,3.65,0,1
2249333,13,28.0,5.0,8,0,1,1,2.475435,21.22,0.693147,4.347823,3.65,0,1


In [None]:
# Train on log1p(target). RMSE measured in this log space is, by definition,
# the RMSLE of the back-transformed predictions, so no expm1/log1p round trip
# is needed to score a fold.
y_transformed = np.log1p(y)

# A fixed seed gives the same 10 folds for every model / feature-set pair,
# which keeps the per-fold scores directly comparable.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

results = {}
for model_name, model in models.items():
    # Loop variable is `features`, not `X`, so the notebook-level `X` is not overwritten.
    for set_name, features in feature_sets.items():
        print(f'running {model_name} with {set_name}')
        feature_matrix = features.to_numpy()

        fold_rmse = []
        for train_index, test_index in kf.split(feature_matrix):
            # Fit the scaler on the training fold only and reuse its statistics
            # for the held-out fold; fitting on all rows would leak test-fold
            # means/variances into training.
            std_scl = StandardScaler()
            X_train = std_scl.fit_transform(feature_matrix[train_index])
            X_test = std_scl.transform(feature_matrix[test_index])

            # KFold yields positional indices, so select by position explicitly.
            y_train = y_transformed.iloc[train_index]
            y_test = y_transformed.iloc[test_index]

            model.fit(X_train, y_train)
            y_pred_transformed = model.predict(X_test)

            # RMSLE on the original scale == RMSE on the log1p scale.
            rmse = np.sqrt(mean_squared_error(y_test, y_pred_transformed))
            fold_rmse.append(rmse)

        results[f'{model_name} - {set_name}'] = fold_rmse

# Print and analyze results
for key, values in results.items():
    print(f'{key}: Mean RMSLE = {np.mean(values):.4f}, Std RMSLE = {np.std(values):.4f}')

running Ridge with Original Dataset
running Ridge with Original + Weather Features
running Ridge with Original + Congestion Features
running Ridge with Original + Weather + Congestion Features
running Linear Regression with Original Dataset
running Linear Regression with Original + Weather Features
running Linear Regression with Original + Congestion Features
running Linear Regression with Original + Weather + Congestion Features
Ridge - Original Dataset: Mean RMSLE = 0.3729, Std RMSLE = 0.0007
Ridge - Original + Weather Features: Mean RMSLE = 0.3713, Std RMSLE = 0.0007
Ridge - Original + Congestion Features: Mean RMSLE = 0.3695, Std RMSLE = 0.0007
Ridge - Original + Weather + Congestion Features: Mean RMSLE = 0.3674, Std RMSLE = 0.0007
Linear Regression - Original Dataset: Mean RMSLE = 0.3729, Std RMSLE = 0.0007
Linear Regression - Original + Weather Features: Mean RMSLE = 0.3713, Std RMSLE = 0.0007
Linear Regression - Original + Congestion Features: Mean RMSLE = 0.3695, Std RMSLE = 0

In [None]:
# One column per "<model> - <feature set>" key, one row per CV fold.
result = pd.DataFrame(data=results)

In [None]:
# Per-fold RMSLE for every model / feature-set combination.
result

Unnamed: 0,Ridge - Original Dataset,Ridge - Original + Weather Features,Ridge - Original + Congestion Features,Ridge - Original + Weather + Congestion Features,Linear Regression - Original Dataset,Linear Regression - Original + Weather Features,Linear Regression - Original + Congestion Features,Linear Regression - Original + Weather + Congestion Features
0,0.373556,0.37187,0.370051,0.367778,0.373556,0.37187,0.370051,0.367778
1,0.373504,0.371878,0.370164,0.367953,0.373504,0.371878,0.370164,0.367953
2,0.37232,0.370751,0.368973,0.366839,0.37232,0.370751,0.368973,0.366839
3,0.372931,0.371339,0.369448,0.36727,0.372931,0.371339,0.369448,0.36727
4,0.372736,0.371221,0.369252,0.367178,0.372736,0.371221,0.369252,0.367178
5,0.373085,0.371746,0.369647,0.367743,0.373085,0.371746,0.369647,0.367743
6,0.374029,0.3724,0.370666,0.368442,0.374029,0.3724,0.370666,0.368442
7,0.372071,0.370445,0.368778,0.366583,0.372071,0.370445,0.368778,0.366583
8,0.373079,0.371531,0.369824,0.367741,0.373079,0.371531,0.369824,0.367741
9,0.371679,0.37017,0.36819,0.366109,0.371679,0.37017,0.36819,0.366109


In [None]:
# Exact column labels, needed to select columns by name below.
result.columns

Index(['Ridge - Original Dataset', 'Ridge - Original + Weather Features',
       'Ridge - Original + Congestion Features',
       'Ridge - Original + Weather + Congestion Features',
       'Linear Regression - Original Dataset',
       'Linear Regression - Original + Weather Features',
       'Linear Regression - Original + Congestion Features',
       'Linear Regression - Original + Weather + Congestion Features'],
      dtype='object')

In [None]:
# Same table as displayed earlier; `result` has not been modified in between.
result

Unnamed: 0,Ridge - Original Dataset,Ridge - Original + Weather Features,Ridge - Original + Congestion Features,Ridge - Original + Weather + Congestion Features,Linear Regression - Original Dataset,Linear Regression - Original + Weather Features,Linear Regression - Original + Congestion Features,Linear Regression - Original + Weather + Congestion Features
0,0.373556,0.37187,0.370051,0.367778,0.373556,0.37187,0.370051,0.367778
1,0.373504,0.371878,0.370164,0.367953,0.373504,0.371878,0.370164,0.367953
2,0.37232,0.370751,0.368973,0.366839,0.37232,0.370751,0.368973,0.366839
3,0.372931,0.371339,0.369448,0.36727,0.372931,0.371339,0.369448,0.36727
4,0.372736,0.371221,0.369252,0.367178,0.372736,0.371221,0.369252,0.367178
5,0.373085,0.371746,0.369647,0.367743,0.373085,0.371746,0.369647,0.367743
6,0.374029,0.3724,0.370666,0.368442,0.374029,0.3724,0.370666,0.368442
7,0.372071,0.370445,0.368778,0.366583,0.372071,0.370445,0.368778,0.366583
8,0.373079,0.371531,0.369824,0.367741,0.373079,0.371531,0.369824,0.367741
9,0.371679,0.37017,0.36819,0.366109,0.371679,0.37017,0.36819,0.366109


In [None]:
# Empty frame that the next cell fills with one column per feature set.
result_df = pd.DataFrame()

In [None]:
# Collapse the two linear models into one score per fold and feature set by
# averaging their RMSLE row-wise.
# NOTE(review): in the recorded output the Ridge and Linear Regression columns
# are identical to the printed precision, so this mean equals either column.
feature_set_labels = (
    ('original_dataset', 'Original Dataset'),
    ('original_dataset+weather_features', 'Original + Weather Features'),
    ('original_dataset+congestion_features', 'Original + Congestion Features'),
    ('original_dataset+weather+congestion_features', 'Original + Weather + Congestion Features'),
)
for target_column, set_label in feature_set_labels:
    paired_columns = [f'Linear Regression - {set_label}', f'Ridge - {set_label}']
    result_df[target_column] = result[paired_columns].mean(axis=1)

In [None]:
# Per-fold RMSLE averaged over the two models, one column per feature set.
result_df

Unnamed: 0,original_dataset,original_dataset+weather_features,original_dataset+congestion_features,original_dataset+weather+congestion_features
0,0.373556,0.37187,0.370051,0.367778
1,0.373504,0.371878,0.370164,0.367953
2,0.37232,0.370751,0.368973,0.366839
3,0.372931,0.371339,0.369448,0.36727
4,0.372736,0.371221,0.369252,0.367178
5,0.373085,0.371746,0.369647,0.367743
6,0.374029,0.3724,0.370666,0.368442
7,0.372071,0.370445,0.368778,0.366583
8,0.373079,0.371531,0.369824,0.367741
9,0.371679,0.37017,0.36819,0.366109


In [None]:
# Mean RMSLE across the folds for each feature set.
for column, mean_rmsle in result_df.mean(axis=0).items():
    print(f'{column}: Mean RMSLE = {mean_rmsle:.4f}')

original_dataset: Mean RMSLE = 0.3729
original_dataset+weather_features: Mean RMSLE = 0.3713
original_dataset+congestion_features: Mean RMSLE = 0.3695
original_dataset+weather+congestion_features: Mean RMSLE = 0.3674


In [None]:
# Long format: one row per (feature set, fold) with the fold's RMSLE as 'Score'.
results_melted = result_df.melt(var_name='Model', value_name='Score')

# One-way ANOVA: does the mean fold score differ between feature sets?
# NOTE(review): every feature set is scored on the same 10 folds, so the groups
# are paired rather than independent; a repeated-measures test may be a better
# fit -- confirm before relying on these p-values.
anova_model = ols('Score ~ C(Model)', data=results_melted).fit()
anova_results = sm.stats.anova_lm(anova_model, typ=2)
print(anova_results)

# Look the p-value up by row label: `anova_results['PR(>F)'][0]` relies on
# positional integer access to a label-indexed Series, which pandas deprecates.
anova_p_value = anova_results.loc['C(Model)', 'PR(>F)']

# If the ANOVA is significant, perform post-hoc testing
if anova_p_value < 0.05:
    print("ANOVA is significant, performing post-hoc tests...")
    # Tukey HSD compares every pair of feature sets while controlling the
    # family-wise error rate.
    comp = mc.MultiComparison(results_melted['Score'], results_melted['Model'])
    post_hoc_res = comp.tukeyhsd()
    print(post_hoc_res)
else:
    print("ANOVA is not significant, no need for post-hoc tests.")

            sum_sq    df           F        PR(>F)
C(Model)  0.000171   3.0  111.517945  2.767761e-18
Residual  0.000018  36.0         NaN           NaN
ANOVA is significant, performing post-hoc tests...
                                      Multiple Comparison of Means - Tukey HSD, FWER=0.05                                       
                   group1                                       group2                    meandiff p-adj   lower   upper  reject
--------------------------------------------------------------------------------------------------------------------------------
                            original_dataset         original_dataset+congestion_features  -0.0034    0.0 -0.0043 -0.0025   True
                            original_dataset original_dataset+weather+congestion_features  -0.0055    0.0 -0.0064 -0.0047   True
                            original_dataset            original_dataset+weather_features  -0.0016 0.0001 -0.0024 -0.0007   True
        original_datas

In [None]:
# log1p of 'Trip Seconds', i.e. the target actually fitted by the models.
y_transformed

0          6.945051
1          6.739337
2          5.726848
3          7.074117
4          6.293419
             ...   
2249330    5.484797
2249331    6.803505
2249332    7.186144
2249333    6.660575
2249334    6.928538
Name: Trip Seconds, Length: 2249335, dtype: float64