In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp311-cp311-win_amd64.whl (101.0 MB)
     ---------------------------------------- 0.0/101.0 MB ? eta -:--:--
     - ------------------------------------ 3.7/101.0 MB 114.9 MB/s eta 0:00:01
     --- ---------------------------------- 9.2/101.0 MB 116.1 MB/s eta 0:00:01
     ----- ------------------------------- 16.0/101.0 MB 131.2 MB/s eta 0:00:01
     ------ ------------------------------ 18.9/101.0 MB 110.0 MB/s eta 0:00:01
     --------- ---------------------------- 24.6/101.0 MB 93.9 MB/s eta 0:00:01
     ----------- ------------------------- 31.3/101.0 MB 131.2 MB/s eta 0:00:01
     -------------- ---------------------- 38.5/101.0 MB 162.4 MB/s eta 0:00:01
     ---------------- -------------------- 46.1/101.0 MB 131.2 MB/s eta 0:00:01
     ------------------ ------------------ 51.8/101.0 MB 131.2 MB/s eta 0:00:01
     -------------------- ---------------- 57.3/101.0 MB 108.8 MB/s eta 0:00:01
     ----------------------- ------------- 

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor

In [None]:
# Load training data, reshape, add departure time as an integer number of seconds and add day of week:
df = pd.read_csv('../TotalEnergies/data/Train.csv', parse_dates=['travel_date'], dayfirst=True)
train = df.groupby(['ride_id', 'travel_date', 'travel_time', 'travel_from', 'max_capacity']).size().reset_index(name='Count') #sort=False if needed?
train["travel_time"] = train["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
train['day'] = train['travel_date'].dt.dayofweek

In [4]:
# The same for the test data
test = pd.read_csv('../TotalEnergies/data/Test.csv', parse_dates=['travel_date'], dayfirst=True).drop(['car_type', 'travel_to'], axis=1)
test["travel_time"] = test["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
test['day'] = test['travel_date'].dt.dayofweek

In [10]:
# The sample submission file
sample = pd.read_csv('../TotalEnergies/submissions/SampleSubmission.csv')

In [11]:
# Combine training and test data for now, so that we can add uber movement data all in one go
train['t'] = 0
test['t'] = 1
X = pd.concat([train, test], sort=False)

In [12]:
# Load travel times from Uber movement data ( 3 x 3month periods)
t1 = pd.read_csv('../TotalEnergies/data/Travel_Times_Daily_1.csv',parse_dates=['Date'])
t2 = pd.read_csv('../TotalEnergies/data/Travel_Times_Daily_2.csv',parse_dates=['Date'])
t3 = pd.read_csv('../TotalEnergies/data/Travel_Times_Daily_3.csv',parse_dates=['Date'])
travel_times = pd.concat([t1, t2, t3], ignore_index=True)
travel_times = travel_times.fillna(method='ffill')[['Daily Mean Travel Time (Seconds)', 'Date']]
travel_times['Date'] = pd.to_datetime(travel_times['Date'])
travel_times.head(1)

Unnamed: 0,Daily Mean Travel Time (Seconds),Date
0,2926.0,2017-12-15


In [13]:
# Merge with our contest data
X['Date'] = X['travel_date']
X.set_index('travel_date', inplace=True)
X = X.merge(travel_times, how='left', on='Date')
X.head(1)

Unnamed: 0,ride_id,travel_time,travel_from,max_capacity,Count,day,t,Date,Daily Mean Travel Time (Seconds)
0,1442,435,Migori,49,1.0,1,0,2017-10-17,2698.0


In [20]:
# Create the model
model = CatBoostRegressor(iterations=200, 
                          depth=4, 
                          learning_rate=0.5, 
                          loss_function='MAE', verbose=False)

In [22]:
in_cols = ['travel_time', 'travel_from', 'max_capacity', 'day'] #'Daily Mean Travel Time (Seconds)' as an option

In [23]:
# Train model
tr = X.loc[X.t == 0]
model.fit(tr[in_cols], tr['Count'], cat_features=['travel_from', 'max_capacity', 'day'])

<catboost.core.CatBoostRegressor at 0x25b0d142310>

In [24]:
# Score model
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(model.predict(tr[in_cols]), tr['Count']))

3.499389595249243


In [25]:
# Make predictions and append to the sample submission data, and save as csv
te = X.loc[X.t == 1]
te[in_cols].head()
te = X.loc[X.t == 1]
sample['number_of_ticket'][5:] = model.predict(te[in_cols])[5:] # Ignore the warning
sample.to_csv('catboost_predictions.csv', index=False)
sample.head(10)

Unnamed: 0,ride_id,number_of_ticket
0,4446,0.0
1,13962,0.0
2,5569,0.0
3,1675,0.0
4,5711,0.0
5,2417,9.77877
6,15010,14.936669
7,1823,9.622466
8,15191,9.856025
9,14402,1.651639
