In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.metrics import roc_auc_score
import datetime as dt
import matplotlib.pyplot as plt

In [30]:
df_train_set = pd.read_csv('train_aggregated.csv', low_memory=False)

In [31]:
df_train_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1442,17-10-17,7:15,Migori,Bus,49,1.0
1,5437,19-11-17,7:12,Migori,Bus,49,1.0
2,5710,26-11-17,7:05,Keroka,Bus,49,1.0
3,5777,27-11-17,7:10,Homa Bay,Bus,49,5.0
4,5778,27-11-17,7:12,Migori,Bus,49,31.0


In [32]:
df_train_set.describe()

Unnamed: 0,ride_id,max_capacity,number_of_tickets
count,6249.0,6249.0,6249.0
mean,9963.644583,30.392223,8.264522
std,2296.304872,18.997471,8.632968
min,1442.0,11.0,1.0
25%,7989.0,11.0,2.0
50%,10024.0,49.0,7.0
75%,11917.0,49.0,11.0
max,20117.0,49.0,50.0


In [33]:
df_train_set.drop(['ride_id'], axis=1, inplace=True) #ride_id is unnecessary in training set

In [34]:
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"],infer_datetime_format=True)
df_train_set['travel_month'] = df_train_set.travel_date.dt.month
df_train_set['travel_dayof_year'] = df_train_set.travel_date.dt.dayofyear
df_train_set["travel_date"] = df_train_set["travel_date"].dt.dayofweek 
#remember change the full date to day of week
#add travel season eg rainny,dry etc, school going season

In [35]:
df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
car_type_categories = df_train_set.car_type.cat.categories
df_train_set["car_type"] = df_train_set.car_type.cat.codes

In [36]:
df_train_set["travel_from"] = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = df_train_set.travel_from.cat.categories
df_train_set["travel_from"] = df_train_set.travel_from.cat.codes

In [37]:
#express travel time in minutes
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [38]:
# pairwise correlation
df_train_set.corr()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_month,travel_dayof_year
travel_date,1.0,0.022232,0.00731,-0.030175,0.030175,0.001035,0.031918,0.028249
travel_time,0.022232,1.0,0.145781,-0.019499,0.019499,0.205832,0.041282,0.040311
travel_from,0.00731,0.145781,1.0,-0.203815,0.203815,-0.027875,-0.024963,-0.025947
car_type,-0.030175,-0.019499,-0.203815,1.0,-1.0,-0.208762,0.085932,0.086896
max_capacity,0.030175,0.019499,0.203815,-1.0,1.0,0.208762,-0.085932,-0.086896
number_of_tickets,0.001035,0.205832,-0.027875,-0.208762,0.208762,1.0,0.030389,0.028571
travel_month,0.031918,0.041282,-0.024963,0.085932,-0.085932,0.030389,1.0,0.99715
travel_dayof_year,0.028249,0.040311,-0.025947,0.086896,-0.086896,0.028571,0.99715,1.0


In [39]:
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets

In [40]:
model = RandomForestRegressor() 

In [41]:
model.fit(X,y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [42]:
preds_train_set = model.predict(X)

In [43]:
print (explained_variance_score(y,preds_train_set)) #best is 1.0

0.9227594376136058


In [44]:
# print (mean_absolute_error(y,preds_train_set)) #best is 0.0

In [45]:
df_test_set = pd.read_csv('test_questions.csv', low_memory=False)

In [46]:
df_test_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,247,2018-05-07,07:06,Kisii,Nairobi,Bus,49
1,256,2018-05-06,11:08,Kisii,Nairobi,shuttle,11
2,275,2018-05-04,05:00,Kisii,Nairobi,shuttle,11
3,285,2018-05-04,09:10,Kisii,Nairobi,shuttle,11
4,286,2018-05-04,09:20,Kisii,Nairobi,shuttle,11


In [47]:
#df_test_set["travel_to"].value_counts()
df_test_set.drop(['travel_to'], axis=1, inplace=True)

In [48]:
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"],infer_datetime_format=True)
df_test_set['travel_month'] = df_test_set.travel_date.dt.month
df_test_set['travel_dayof_year'] = df_test_set.travel_date.dt.dayofyear
df_test_set["travel_date"] = df_test_set["travel_date"].dt.dayofweek

In [49]:
df_test_set["car_type"] = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = df_test_set.car_type.cat.codes

In [50]:
df_test_set["travel_from"] = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = df_test_set.travel_from.cat.codes

In [51]:
df_test_set["travel_time"] = df_test_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [52]:
X_test = df_test_set.drop(['ride_id'], axis=1)
test_set_predictions = model.predict(X_test)

In [53]:
d = {'ride_id': df_test_set["ride_id"], 'number_of_ticket': test_set_predictions}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['ride_id','number_of_ticket']]

In [54]:
df_predictions.number_of_ticket = df_predictions.number_of_ticket.astype(int)
df_predictions["number_of_ticket"] = df_predictions["number_of_ticket"].apply(lambda x: int(round(x)))

In [55]:
df_predictions.head()

Unnamed: 0,ride_id,number_of_ticket
0,247,12
1,256,8
2,275,1
3,285,8
4,286,8


In [56]:
df_predictions.to_csv('prediction.csv', index=False) #save to csv file