# Testing code during development 

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Reading data

In [2]:
from demand_forecasting.read import read_data

# Load both raw inputs in order: the Voiholm ride data first, then the weather data.
rides_raw_df, weather_raw_df = (
    read_data(csv_path)
    for csv_path in ('../input/voiholm.csv', '../input/weather_data.csv')
)

INFO:root:Read dataframe from file path ../input/voiholm.csv
INFO:root:Read dataframe from file path ../input/weather_data.csv


In [3]:
from demand_forecasting.verify import verify_ride_df, verify_weather_df

# Validate each raw frame with its dedicated verifier (rides first, weather second);
# the validated frames are what the rest of the notebook works with.
rides_df, weather_df = verify_ride_df(rides_raw_df), verify_weather_df(weather_raw_df)

INFO:root:Validated Voiholm ride dataframe
INFO:root:Validated weather dataframe


### Preprocessing data

In [4]:
from demand_forecasting.preprocess import preprocessing

# Combine the validated ride and weather frames. Two frames come back:
#   df                - displayed below; judging by the output it holds num_of_rides
#                       per h3index_small and start_date, together with that day's
#                       weather columns and the weekday name
#   unique_h3index_df - used later in this notebook for its 'h3index_small' column
df, unique_h3index_df = preprocessing(rides_df=rides_df, weather_df=weather_df)
df

INFO:root:Preprossed dataframe


Unnamed: 0,h3index_small,start_date,num_of_rides,date,temperature,max_temperature,min_temperature,precipitation,start_weekday
0,891f2506107ffff,2020-08-01,0.0,2020-08-01,17.6,19.3,15.6,0.69,Saturday
1,891f2506107ffff,2020-08-02,0.0,2020-08-02,16.7,19.1,14.8,0.00,Sunday
2,891f2506107ffff,2020-08-03,1.0,2020-08-03,16.8,19.6,14.3,0.00,Monday
3,891f2506107ffff,2020-08-04,2.0,2020-08-04,15.0,17.1,12.8,11.44,Tuesday
4,891f2506107ffff,2020-08-05,2.0,2020-08-05,21.5,26.5,17.0,0.50,Wednesday
...,...,...,...,...,...,...,...,...,...
9775,891f25a9ecbffff,2020-08-26,0.0,2020-08-26,16.7,20.8,13.5,0.00,Wednesday
9776,891f25a9ecbffff,2020-08-27,0.0,2020-08-27,13.9,18.5,9.1,0.60,Thursday
9777,891f25a9ecbffff,2020-08-28,0.0,2020-08-28,14.5,17.8,12.1,0.10,Friday
9778,891f25a9ecbffff,2020-08-29,1.0,2020-08-29,14.7,19.8,8.9,0.00,Saturday


### Generating features

In [5]:
from demand_forecasting.features import generate_features

# Derive the model features from the preprocessed frame. Judging by the displayed
# output, this adds the column avg_num_rides_in_area_one_week_before, which is
# empty for the first displayed rows (the earliest dates of an area).
df_with_features = generate_features(df)
df_with_features

INFO:root:Generating features


Unnamed: 0,h3index_small,start_date,num_of_rides,date,temperature,max_temperature,min_temperature,precipitation,start_weekday,avg_num_rides_in_area_one_week_before
0,891f2506107ffff,2020-08-01,0.0,2020-08-01,17.6,19.3,15.6,0.69,Saturday,
1,891f2506107ffff,2020-08-02,0.0,2020-08-02,16.7,19.1,14.8,0.00,Sunday,
2,891f2506107ffff,2020-08-03,1.0,2020-08-03,16.8,19.6,14.3,0.00,Monday,
3,891f2506107ffff,2020-08-04,2.0,2020-08-04,15.0,17.1,12.8,11.44,Tuesday,
4,891f2506107ffff,2020-08-05,2.0,2020-08-05,21.5,26.5,17.0,0.50,Wednesday,
...,...,...,...,...,...,...,...,...,...,...
9775,891f25a9ecbffff,2020-08-26,0.0,2020-08-26,16.7,20.8,13.5,0.00,Wednesday,0.285714
9776,891f25a9ecbffff,2020-08-27,0.0,2020-08-27,13.9,18.5,9.1,0.60,Thursday,0.285714
9777,891f25a9ecbffff,2020-08-28,0.0,2020-08-28,14.5,17.8,12.1,0.10,Friday,0.285714
9778,891f25a9ecbffff,2020-08-29,1.0,2020-08-29,14.7,19.8,8.9,0.00,Saturday,0.142857


# Modelling

In [6]:
# Column the model is trained to predict.
target_variable = 'num_of_rides'

# Input columns handed to the model, both for training and for prediction.
features = [
    'precipitation',
    'start_weekday',
    'max_temperature',
    'avg_num_rides_in_area_one_week_before',
]

### XGBoost

In [7]:
from demand_forecasting.train import training_xgb_model

# Hyperparameters forwarded unchanged to the training helper.
xgb_hyperparameters = {
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
}

# Fit the model on the feature frame using the target/feature columns chosen above.
model = training_xgb_model(
    df_with_features,
    target_variable=target_variable,
    features=features,
    **xgb_hyperparameters,
)

model

# Inference

In [32]:
from dateutil.relativedelta import relativedelta

# The prediction date is injected through the PREDICTION_DATE environment variable
# so the notebook can be run for any day without editing code.
prediction_date_raw = os.environ.get('PREDICTION_DATE')
if not prediction_date_raw:
    # Without this guard pd.to_datetime(None) returns None and the subtraction
    # below fails with an unrelated-looking TypeError.
    raise ValueError(
        "Environment variable PREDICTION_DATE is not set; "
        "expected a date such as '2020-08-31'."
    )

# Normalise to midnight: the ride and weather frames are keyed by calendar day,
# so a stray time component would make the later equality matches come up empty.
date_of_prediction = pd.to_datetime(prediction_date_raw).normalize()

# First day of the 7-day look-back window that ends the day before the prediction date.
date_one_week_before = date_of_prediction - relativedelta(days=7)
date_one_week_before

Timestamp('2020-08-24 00:00:00')

In [17]:
# # #  Get feature data for the date of prediction  # # #
# Hand-written prototype of the inference features: one row per H3 area for
# date_of_prediction. The predict cell further down uses inference_with_features_df
# (built by generate_features_for_inference), not this frame.

inference_df = unique_h3index_df[['h3index_small']].copy()
inference_df['start_date'] = date_of_prediction
# NOTE(review): a categorical built from a single date contains only one category;
# confirm the model encodes weekdays the same way it did during training.
inference_df['start_weekday'] = inference_df['start_date'].dt.day_name().astype("category")

# # #  Extracting avg num rides in each area one week before  # # #
# Half-open window [date_one_week_before, date_of_prediction): the upper bound keeps
# rows dated on or after the prediction date out of the "week before" average.
is_in_lookback_window = (df.start_date >= date_one_week_before) & (df.start_date < date_of_prediction)
df_one_week_before_df = df[is_in_lookback_window]

# Mean rides per area over the window, renamed to the feature name used in training.
avg_num_of_rides_per_area_df = (
    df_one_week_before_df
    .groupby('h3index_small')
    .agg({'num_of_rides': 'mean'})
    .reset_index()
    .rename(columns={'num_of_rides': 'avg_num_rides_in_area_one_week_before'})
)

# Adding avg num rides in each area one week before to dataframe
# (left join: areas without rides in the window keep a missing value).
inference_df = pd.merge(inference_df, avg_num_of_rides_per_area_df, on='h3index_small', how='left')

# # #   Get weather forecast for next day  # # #
# This should call an external API during the night to get the weather forecast for
# the area for the next day. Since this case doesn't provide the location of Voiholm,
# that can't be implemented now.

# Here we assume the forecast for the next day is accurate and use the row in
# weather_data.csv for the prediction date (e.g. 2020-08-31).

weather_at_prediciton_date_df = weather_df[weather_df.date == date_of_prediction]

# validate='many_to_one' makes pandas raise if the weather data holds more than one
# row for the prediction date, instead of silently duplicating every area.
inference_df = pd.merge(
    inference_df,
    weather_at_prediciton_date_df,
    left_on='start_date',
    right_on='date',
    how='left',
    validate='many_to_one',
)
inference_df

Unnamed: 0,h3index_small,start_date,start_weekday,avg_num_rides_in_area_one_week_before,date,temperature,max_temperature,min_temperature,precipitation
0,891f2506107ffff,2020-08-31,Monday,1.000000,2020-08-31,14.7,21.5,7.1,0.0
1,891f2506123ffff,2020-08-31,Monday,10.285714,2020-08-31,14.7,21.5,7.1,0.0
2,891f2506127ffff,2020-08-31,Monday,6.714286,2020-08-31,14.7,21.5,7.1,0.0
3,891f250612bffff,2020-08-31,Monday,1.714286,2020-08-31,14.7,21.5,7.1,0.0
4,891f250612fffff,2020-08-31,Monday,14.142857,2020-08-31,14.7,21.5,7.1,0.0
...,...,...,...,...,...,...,...,...,...
321,891f25a9eafffff,2020-08-31,Monday,20.857143,2020-08-31,14.7,21.5,7.1,0.0
322,891f25a9eb3ffff,2020-08-31,Monday,1.857143,2020-08-31,14.7,21.5,7.1,0.0
323,891f25a9eb7ffff,2020-08-31,Monday,19.428571,2020-08-31,14.7,21.5,7.1,0.0
324,891f25a9ebbffff,2020-08-31,Monday,9.857143,2020-08-31,14.7,21.5,7.1,0.0


In [10]:
from demand_forecasting.features import generate_features_for_inference

# Packaged counterpart of the hand-written inference cell above; the displayed
# output shows the same columns and values as inference_df.
# NOTE(review): no prediction date is passed in — presumably the function obtains it
# itself (e.g. from PREDICTION_DATE); confirm against its implementation.
inference_with_features_df = generate_features_for_inference(unique_h3index_df, df, weather_df) 
inference_with_features_df

Unnamed: 0,h3index_small,start_date,start_weekday,avg_num_rides_in_area_one_week_before,date,temperature,max_temperature,min_temperature,precipitation
0,891f2506107ffff,2020-08-31,Monday,1.000000,2020-08-31,14.7,21.5,7.1,0.0
1,891f2506123ffff,2020-08-31,Monday,10.285714,2020-08-31,14.7,21.5,7.1,0.0
2,891f2506127ffff,2020-08-31,Monday,6.714286,2020-08-31,14.7,21.5,7.1,0.0
3,891f250612bffff,2020-08-31,Monday,1.714286,2020-08-31,14.7,21.5,7.1,0.0
4,891f250612fffff,2020-08-31,Monday,14.142857,2020-08-31,14.7,21.5,7.1,0.0
...,...,...,...,...,...,...,...,...,...
321,891f25a9eafffff,2020-08-31,Monday,20.857143,2020-08-31,14.7,21.5,7.1,0.0
322,891f25a9eb3ffff,2020-08-31,Monday,1.857143,2020-08-31,14.7,21.5,7.1,0.0
323,891f25a9eb7ffff,2020-08-31,Monday,19.428571,2020-08-31,14.7,21.5,7.1,0.0
324,891f25a9ebbffff,2020-08-31,Monday,9.857143,2020-08-31,14.7,21.5,7.1,0.0


In [19]:
from demand_forecasting.predict import predict

# Score the inference frame with the trained model, using the same feature list
# that was passed to training. The displayed result has the columns h3index_small,
# date and prediced_num_of_rides (column name spelled as produced by predict()).
final_df = predict(inference_with_features_df, features, model)

final_df

Unnamed: 0,h3index_small,date,prediced_num_of_rides
0,891f2506107ffff,2020-08-31,1.0
1,891f2506123ffff,2020-08-31,11.0
2,891f2506127ffff,2020-08-31,8.0
3,891f250612bffff,2020-08-31,2.0
4,891f250612fffff,2020-08-31,15.0
...,...,...,...
321,891f25a9eafffff,2020-08-31,25.0
322,891f25a9eb3ffff,2020-08-31,2.0
323,891f25a9eb7ffff,2020-08-31,21.0
324,891f25a9ebbffff,2020-08-31,11.0


In [None]:
# TODO: continuously evaluate predicted values against actual outcomes and alert when they are off by more than an agreed threshold