## Modeling - Random Forest Regressor

The goal of this notebook is to try out the Random Forest Regressor to see if it can be a valuable predictor of taxi trip duration based on the features that we engineered.

In [35]:
import pandas as pd 
import numpy as np
from sklearn.metrics import r2_score

In [5]:
df = pd.read_csv('data/final_data_1.csv')

In [6]:
#kernel keeps dying , trying with 10% of dataset
df = df.sample(frac=0.1, replace=True, random_state=1)

In [7]:
#df = df.drop(['Unnamed: 0', 'X'], axis =1 )

In [8]:
df.columns

Index(['Unnamed: 0', 'X', 'id', 'vendor_id', 'pickup_datetime',
       'dropoff_datetime', 'passenger_count', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'store_and_fwd_flag', 'trip_duration', 'distance_km', 'speed_km_sec',
       'day', 'month', 'hour', 'geometry', 'pickup_date', 'pickup_hour',
       'dropoff_date', 'dropoff_hour', 'pickup_dayofmonth',
       'pickup_dateinnumbers', 'pickup_monthinnumbers', 'average_temperature',
       'precipitation', 'snowfall', 'snowdepth', 'kmeanscluster',
       'neighborhood'],
      dtype='object')

In [9]:
#excluding pickup and dropoff date as that doesn't seem to be relevant, also excluding passenger count
dfu = df[['distance_km', 'speed_km_sec', 'hour', 'average_temperature', 'precipitation', 'snowfall',
         'snowdepth', 'kmeanscluster', 'neighborhood', 'trip_duration']]

In [10]:
dfu.head()

Unnamed: 0,distance_km,speed_km_sec,hour,average_temperature,precipitation,snowfall,snowdepth,kmeanscluster,neighborhood,trip_duration
128037,1.293588,0.003839,16,62.5,0.16,0.0,0,1,Lower Manhattan,337
491755,1.295411,0.003511,23,37.0,0.47,T,0,1,Lower Manhattan,369
470924,3.449869,0.004417,21,46.5,0.0,0.0,0,1,Lower Manhattan,781
491263,1.058453,0.004485,10,33.5,0.05,0.1,0,1,Lower Manhattan,236
836489,1.573762,0.004396,19,43.0,0.0,0.0,17,0,Lower Manhattan,358


In [11]:
#engineer dummy variables for the categorical variables
#categorical features list
#
dfu.head()

Unnamed: 0,distance_km,speed_km_sec,hour,average_temperature,precipitation,snowfall,snowdepth,kmeanscluster,neighborhood,trip_duration
128037,1.293588,0.003839,16,62.5,0.16,0.0,0,1,Lower Manhattan,337
491755,1.295411,0.003511,23,37.0,0.47,T,0,1,Lower Manhattan,369
470924,3.449869,0.004417,21,46.5,0.0,0.0,0,1,Lower Manhattan,781
491263,1.058453,0.004485,10,33.5,0.05,0.1,0,1,Lower Manhattan,236
836489,1.573762,0.004396,19,43.0,0.0,0.0,17,0,Lower Manhattan,358


In [12]:
dfd = pd.get_dummies(dfu, columns=['hour', 'kmeanscluster', 'neighborhood'])
#dfu = dfu.drop(['hour', 'kmeanscluster', 'neighborhood'], axis=1)
#dfa = pd.concat([dfu, dfd], axis=1)

In [13]:
dfd.head()

Unnamed: 0,distance_km,speed_km_sec,average_temperature,precipitation,snowfall,snowdepth,trip_duration,hour_0,hour_1,hour_2,...,kmeanscluster_0,kmeanscluster_1,kmeanscluster_2,kmeanscluster_3,neighborhood_Brooklyn,neighborhood_Harlem,neighborhood_Hempstead,neighborhood_Lower Manhattan,neighborhood_Midtown Manhattan,neighborhood_Washington Heights
128037,1.293588,0.003839,62.5,0.16,0.0,0,337,0,0,0,...,0,1,0,0,0,0,0,1,0,0
491755,1.295411,0.003511,37.0,0.47,T,0,369,0,0,0,...,0,1,0,0,0,0,0,1,0,0
470924,3.449869,0.004417,46.5,0.0,0.0,0,781,0,0,0,...,0,1,0,0,0,0,0,1,0,0
491263,1.058453,0.004485,33.5,0.05,0.1,0,236,0,0,0,...,0,1,0,0,0,0,0,1,0,0
836489,1.573762,0.004396,43.0,0.0,0.0,17,358,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [14]:
#hot fix for snowfall, precipitation, snowdepth
#we assumed that trace precipitation, snowdepth and snowfall was equivalent to 0
dfd.loc[df['precipitation'] == 'T'] = 0.00
dfd.loc[df['snowfall'] == 'T'] = 0.00
dfd.loc[df['snowdepth'] == 'T'] = 0.00

In [15]:
dfd['precipitation'].unique()

array(['0.16', 0.0, '0.00', '0.05', '0.02', '1.65', '0.04', '0.01',
       '0.12', '1.80', '0.18', '0.09', '0.03', '0.24', '1.01', '0.45',
       '0.06', '0.25', '0.22', '0.73', '0.14', '0.54', '0.61', '0.44',
       '0.53', '1.22', '0.20', '0.38', '0.91', '0.40', '0.29', '2.31'],
      dtype=object)

In [16]:
dfd[['precipitation', 'snowfall', 'snowdepth']] = dfd[['precipitation', 'snowfall', 'snowdepth']].apply(pd.to_numeric)

In [17]:
dfd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108745 entries, 128037 to 413158
Data columns (total 41 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   distance_km                      108745 non-null  float64
 1   speed_km_sec                     108745 non-null  float64
 2   average_temperature              108745 non-null  float64
 3   precipitation                    108745 non-null  float64
 4   snowfall                         108745 non-null  float64
 5   snowdepth                        108745 non-null  float64
 6   trip_duration                    108745 non-null  float64
 7   hour_0                           108745 non-null  float64
 8   hour_1                           108745 non-null  float64
 9   hour_2                           108745 non-null  float64
 10  hour_3                           108745 non-null  float64
 11  hour_4                           108745 non-null  float64
 1

In [18]:
#dfa['snowfall'].info()

In [19]:
#now that we have converted all of the categorical columns to dummy variables, we can move on with modeling

In [20]:
#load the test data


In [21]:
df_test = pd.read_csv('data/test.csv')

In [22]:
#need to engineer the same exact steps for the test data ??
df_test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


## Scaling

In [23]:
from sklearn.model_selection import train_test_split

#Split into x and y 
X = np.array(dfd.drop(['trip_duration'], axis =1))
y = np.array(dfd[['trip_duration']])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 123)

In [24]:
from sklearn import preprocessing
import numpy as np
# build scaler based on training data and apply it to test data to then also scale the test data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [25]:
print((X_train_scaled.shape))
print((X_test_scaled.shape))

(81558, 40)
(27187, 40)


## Modeling

We wanted to continue investigating feature selection methods

In [26]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 10, random_state = 42)

rf.fit(X_train_scaled, y_train)

  after removing the cwd from sys.path.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [38]:
model = rf.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

  """Entry point for launching an IPython kernel.


In [40]:
predictions = rf.predict(X_test_scaled)
# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))
print('Mean Squared Error: ', round(np.mean(np.sqrt(errors))))
print('R squared: ', round(r2_score(y_pred, y_test)))

Mean Absolute Error: 351.78
Mean Squared Error:  17.0
R squared:  1.0


In [41]:
feature_list = list(dfd.columns)

In [42]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:10} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: distance_km Importance: 0.77
Variable: speed_km_sec Importance: 0.23
Variable: average_temperature Importance: 0.0
Variable: precipitation Importance: 0.0
Variable: snowfall   Importance: 0.0
Variable: snowdepth  Importance: 0.0
Variable: trip_duration Importance: 0.0
Variable: hour_0     Importance: 0.0
Variable: hour_1     Importance: 0.0
Variable: hour_2     Importance: 0.0
Variable: hour_3     Importance: 0.0
Variable: hour_4     Importance: 0.0
Variable: hour_5     Importance: 0.0
Variable: hour_6     Importance: 0.0
Variable: hour_7     Importance: 0.0
Variable: hour_8     Importance: 0.0
Variable: hour_9     Importance: 0.0
Variable: hour_10    Importance: 0.0
Variable: hour_11    Importance: 0.0
Variable: hour_12    Importance: 0.0
Variable: hour_13    Importance: 0.0
Variable: hour_14    Importance: 0.0
Variable: hour_15    Importance: 0.0
Variable: hour_16    Importance: 0.0
Variable: hour_17    Importance: 0.0
Variable: hour_18    Importance: 0.0
Variable: hour_19 