In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
df_train_set = pd.read_csv('train_aggregated.csv', low_memory=False)

In [3]:
df_train_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [4]:
df_train_set.drop(['ride_id'], axis=1, inplace=True) #ride_id is unnecessary in training set

In [5]:
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"],infer_datetime_format=True)
df_train_set["travel_date"] = df_train_set["travel_date"].dt.dayofweek #change the full date to day of week

In [6]:
df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
car_type_categories = df_train_set.car_type.cat.categories
df_train_set["car_type"] = df_train_set.car_type.cat.codes

In [7]:
df_train_set["travel_from"] = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = df_train_set.travel_from.cat.categories
df_train_set["travel_from"] = df_train_set.travel_from.cat.codes

In [8]:
#express travel time in minutes
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [9]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1,435,9,0,49,1.0
1,6,432,9,0,49,1.0
2,6,425,4,0,49,1.0
3,0,430,1,0,49,5.0
4,0,432,9,0,49,31.0


# Random forest model

In [10]:
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets

In [11]:
model = RandomForestRegressor(n_estimators=100, criterion="mae", n_jobs=-1)

In [12]:
model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [14]:
preds_train_set = model.predict(X)

In [15]:
print (mean_absolute_error(preds_train_set,y))

3.1712946071371415


# Predictions for test set

In [16]:
df_test_set = pd.read_csv('test_questions.csv', low_memory=False)

Let's first format the data as we did for the training set.

In [17]:
df_test_set.drop(['travel_to'], axis=1, inplace=True)

In [18]:
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"],infer_datetime_format=True)
df_test_set["travel_date"] = df_test_set["travel_date"].dt.dayofweek

In [19]:
df_test_set["car_type"] = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = df_test_set.car_type.cat.codes

In [20]:
df_test_set["travel_from"] = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = df_test_set.travel_from.cat.codes

In [21]:
df_test_set["travel_time"] = df_test_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

Now let's calculate predictions using the random forest model we trained.

In [22]:
X_test = df_test_set.drop(['ride_id'], axis=1)
test_set_predictions = model.predict(X_test)

And finally let's create a csv file with predictions. 

In [23]:
d = {'ride_id': df_test_set["ride_id"], 'number_of_ticket': test_set_predictions}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['ride_id','number_of_ticket']]

In [24]:
df_predictions.head()

Unnamed: 0,ride_id,number_of_ticket
0,247,5.935
1,256,5.96
2,275,1.01
3,285,9.085
4,286,10.495


In [25]:
df_predictions.to_csv('preds_test_set.csv', index=False) #save to csv file