In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from utils import *

In [20]:
df = pd.read_csv("rides_data_condense.csv")

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036418 entries, 0 to 1036417
Data columns (total 9 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   rider_rating                1019444 non-null  float64
 1   requested_car_category      1036418 non-null  object 
 2   surge_factor                1036418 non-null  float64
 3   rider_id                    1036418 non-null  int64  
 4   base_fare                   1036418 non-null  float64
 5   rate_per_mile               1036418 non-null  float64
 6   rate_per_minute             1036418 non-null  float64
 7   travel_distance_miles       1036418 non-null  float64
 8   customer_wait_time_seconds  1036418 non-null  int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 71.2+ MB


In [22]:
df['rider_rating'] = df['rider_rating'].fillna(0)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1036418 entries, 0 to 1036417
Data columns (total 9 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   rider_rating                1036418 non-null  float64
 1   requested_car_category      1036418 non-null  object 
 2   surge_factor                1036418 non-null  float64
 3   rider_id                    1036418 non-null  int64  
 4   base_fare                   1036418 non-null  float64
 5   rate_per_mile               1036418 non-null  float64
 6   rate_per_minute             1036418 non-null  float64
 7   travel_distance_miles       1036418 non-null  float64
 8   customer_wait_time_seconds  1036418 non-null  int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 71.2+ MB


In [24]:
# Define X and y
X = df.loc[:, ~df.columns.isin(['customer_wait_time_seconds'])]
y = df.customer_wait_time_seconds

In [25]:
### First preprocess the data
# Separate numerical and categorical features
X_num, X_cat = separate(X)

# Convert categorical features to labels
X_cat = X_cat.apply(LabelEncoder().fit_transform)

In [26]:
# First combine the numerical and categorical features
fullX = pd.concat([X_num,X_cat], axis = 1)

In [27]:
fullX.head()

Unnamed: 0,rider_rating,surge_factor,rider_id,base_fare,rate_per_mile,rate_per_minute,travel_distance_miles,requested_car_category
0,5.0,0.0,0,1.5,1.5,0.25,2.255262,3
1,5.0,0.0,0,1.5,1.5,0.25,0.214196,3
2,5.0,0.0,0,1.5,1.5,0.25,0.212807,3
3,5.0,0.0,1,1.5,1.5,0.25,2.115222,3
4,5.0,0.0,2,1.5,1.5,0.25,0.119355,3


In [28]:
# Define training and testing
X_train, X_valid, y_train, y_valid = train_test_split(fullX, y, test_size = .2, random_state = 0)

### linear reg

In [29]:
# Create linear regression model
lr_reg = LinearRegression()

In [30]:
# Train linear regression model
lr_y_pred = lr_reg.fit(X_train, y_train).predict(X_valid)
print("r2 = ", r2_score(y_valid, lr_y_pred))
print("rmse = ", root_mean_squared_error(y_valid, lr_y_pred))
print("mae = ", mean_absolute_error(y_valid, lr_y_pred))

r2 =  0.061288406452270516
rmse =  205.10487635489056
mae =  145.79643512744414


### DT

In [31]:
# Create decision tree regressor
dt_reg = DecisionTreeRegressor()

In [32]:
# Train DT regressor
dt_y_pred = dt_reg.fit(X_train, y_train).predict(X_valid)
print("r2 = ", r2_score(y_valid, dt_y_pred))
print("rmse = ", root_mean_squared_error(y_valid, dt_y_pred))
print("mae = ", mean_absolute_error(y_valid, dt_y_pred))

r2 =  -0.7311038186645524
rmse =  278.5296620306463
mae =  194.55272114399233
