# Testing code during development 

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Reading data

In [2]:
from demand_forecasting.read import read_data

# Load the two raw inputs in order: the Voiholm ride data first, then the weather data.
rides_raw_df, weather_raw_df = (
    read_data(csv_path)
    for csv_path in ('../input/voiholm.csv', '../input/weather_data.csv')
)

INFO:root:Read dataframe from file path ../input/voiholm.csv
INFO:root:Read dataframe from file path ../input/weather_data.csv


In [4]:
from demand_forecasting.verify import verify_ride_df, verify_weather_df

# Pass each raw frame through its matching verifier (rides first, then weather)
# and keep the frames the verifiers return for the rest of the notebook.
rides_df, weather_df = verify_ride_df(rides_raw_df), verify_weather_df(weather_raw_df)

INFO:root:Validated Voiholm ride dataframe
INFO:root:Validated weather dataframe


Unnamed: 0,ride_id,city_name,start_time,start_lon,start_lat,h3index_big,h3index_small
0,97341db7-f89f-45db-9dd1-bcc2620fc1db,voiholm,2020-08-05 04:27:40.425797 UTC,11.908587,57.687122,881f250613fffff,891f250612bffff
1,646c160f-bbb7-4c78-b2ca-bdd8644a8400,voiholm,2020-08-05 07:47:29.060615 UTC,11.908669,57.687050,881f250613fffff,891f250612bffff
2,6082a591-03ca-4a6d-a7f4-af9ea5a30366,voiholm,2020-08-05 07:42:37.61098 UTC,11.908693,57.687042,881f250613fffff,891f250612bffff
3,eea5165f-c8e5-4e61-b604-c219226dbc1e,voiholm,2020-08-05 13:38:59.425146 UTC,11.912271,57.689289,881f250613fffff,891f2506123ffff
4,ef26200a-6d0f-40cf-b1fb-91b2a5b6270c,voiholm,2020-08-05 03:29:27.859222 UTC,11.913318,57.690361,881f250613fffff,891f2506123ffff
...,...,...,...,...,...,...,...
164361,818eb4eb-03b1-4836-80dd-eedf24245e8f,voiholm,2020-08-21 13:59:31.111663 UTC,12.000498,57.717979,881f25a9ebfffff,891f25a9ebbffff
164362,e3c47612-6c61-4acc-a456-cb4292fa96f4,voiholm,2020-08-21 08:18:33.740151 UTC,12.000905,57.714794,881f25a9ebfffff,891f25a9eabffff
164363,cc117fa0-458b-409a-a847-6b2675bbef0b,voiholm,2020-08-21 06:57:10.500826 UTC,12.005732,57.721436,881f25a9ebfffff,891f25a9eb7ffff
164364,fc79fa02-c231-48d2-9caf-81e8ece06cdc,voiholm,2020-08-21 10:29:34.908873 UTC,12.005229,57.720001,881f25a9ebfffff,891f25a9eb7ffff


### Preprocessing data

In [5]:
from demand_forecasting.preprocess import preprocessing

# Build the raw preprocessed frame from the verified ride and weather frames.
preprocessing_inputs = {'rides_df': rides_df, 'weather_df': weather_df}
preprocess_raw_df = preprocessing(**preprocessing_inputs)

INFO:root:Preprossed dataframe


Unnamed: 0,h3index_small,start_date,num_of_rides,date,temperature,max_temperature,min_temperature,precipitation,start_weekday
0,891f2506107ffff,2020-08-01,0.0,2020-08-01,17.6,19.3,15.6,0.69,Saturday
1,891f2506107ffff,2020-08-02,0.0,2020-08-02,16.7,19.1,14.8,0.00,Sunday
2,891f2506107ffff,2020-08-03,1.0,2020-08-03,16.8,19.6,14.3,0.00,Monday
3,891f2506107ffff,2020-08-04,2.0,2020-08-04,15.0,17.1,12.8,11.44,Tuesday
4,891f2506107ffff,2020-08-05,2.0,2020-08-05,21.5,26.5,17.0,0.50,Wednesday
...,...,...,...,...,...,...,...,...,...
9775,891f25a9ecbffff,2020-08-26,0.0,2020-08-26,16.7,20.8,13.5,0.00,Wednesday
9776,891f25a9ecbffff,2020-08-27,0.0,2020-08-27,13.9,18.5,9.1,0.60,Thursday
9777,891f25a9ecbffff,2020-08-28,0.0,2020-08-28,14.5,17.8,12.1,0.10,Friday
9778,891f25a9ecbffff,2020-08-29,1.0,2020-08-29,14.7,19.8,8.9,0.00,Saturday


In [5]:
from demand_forecasting.verify import verify_preprocessed_df
preprocess_df = verify_preprocessed_df(preprocess_raw_df)  # verified frame; feature generation below reads this, not preprocess_raw_df
preprocess_df

INFO:root:Validated preprocessed dataframe


Unnamed: 0,h3index_small,start_date,num_of_rides,date,temperature,max_temperature,min_temperature,precipitation,start_weekday
0,891f2506107ffff,2020-08-01,0,2020-08-01,17.6,19.3,15.6,0.69,Saturday
1,891f2506107ffff,2020-08-02,0,2020-08-02,16.7,19.1,14.8,0.00,Sunday
2,891f2506107ffff,2020-08-03,1,2020-08-03,16.8,19.6,14.3,0.00,Monday
3,891f2506107ffff,2020-08-04,2,2020-08-04,15.0,17.1,12.8,11.44,Tuesday
4,891f2506107ffff,2020-08-05,2,2020-08-05,21.5,26.5,17.0,0.50,Wednesday
...,...,...,...,...,...,...,...,...,...
9420,891f25a9ecbffff,2020-08-25,0,2020-08-25,14.1,16.1,11.6,3.88,Tuesday
9421,891f25a9ecbffff,2020-08-26,0,2020-08-26,16.7,20.8,13.5,0.00,Wednesday
9422,891f25a9ecbffff,2020-08-27,0,2020-08-27,13.9,18.5,9.1,0.60,Thursday
9423,891f25a9ecbffff,2020-08-28,0,2020-08-28,14.5,17.8,12.1,0.10,Friday


In [6]:
from demand_forecasting.preprocess import extract_all_unique_h3index_small_areas

# Derive the frame of h3index_small areas from the verified rides. It is passed to
# generate_features_for_inference further down, so inference covers these areas.
unique_h3index_df = extract_all_unique_h3index_small_areas(rides_df)
unique_h3index_df

Unnamed: 0,h3index_small
0,891f2506107ffff
1,891f2506123ffff
2,891f2506127ffff
3,891f250612bffff
4,891f250612fffff
...,...
321,891f25a9eafffff
322,891f25a9eb3ffff
323,891f25a9eb7ffff
324,891f25a9ebbffff


### Generating features

In [7]:
from demand_forecasting.features import generate_features_for_training

# Build the training feature frame from the verified preprocessed data.
# NOTE(review): in the displayed output the added avg_num_rides_in_area_one_week_before
# column is empty for the first rows of an area; the linear-regression section
# filters those rows out with notna() before modelling.
df_with_features = generate_features_for_training(preprocess_df)
df_with_features

INFO:root:Generated features for training dataset


Unnamed: 0,h3index_small,start_date,num_of_rides,date,temperature,max_temperature,min_temperature,precipitation,start_weekday,avg_num_rides_in_area_one_week_before
0,891f2506107ffff,2020-08-01,0,2020-08-01,17.6,19.3,15.6,0.69,Saturday,
1,891f2506107ffff,2020-08-02,0,2020-08-02,16.7,19.1,14.8,0.00,Sunday,
2,891f2506107ffff,2020-08-03,1,2020-08-03,16.8,19.6,14.3,0.00,Monday,
3,891f2506107ffff,2020-08-04,2,2020-08-04,15.0,17.1,12.8,11.44,Tuesday,
4,891f2506107ffff,2020-08-05,2,2020-08-05,21.5,26.5,17.0,0.50,Wednesday,
...,...,...,...,...,...,...,...,...,...,...
9420,891f25a9ecbffff,2020-08-25,0,2020-08-25,14.1,16.1,11.6,3.88,Tuesday,0.285714
9421,891f25a9ecbffff,2020-08-26,0,2020-08-26,16.7,20.8,13.5,0.00,Wednesday,0.285714
9422,891f25a9ecbffff,2020-08-27,0,2020-08-27,13.9,18.5,9.1,0.60,Thursday,0.285714
9423,891f25a9ecbffff,2020-08-28,0,2020-08-28,14.5,17.8,12.1,0.10,Friday,0.285714


In [8]:
# Spot-check one specific area: show every feature row whose h3index_small matches it.
df_with_features.loc[df_with_features['h3index_small'].eq('891f25a9807ffff')]

Unnamed: 0,h3index_small,start_date,num_of_rides,date,temperature,max_temperature,min_temperature,precipitation,start_weekday,avg_num_rides_in_area_one_week_before


# Modelling

In [9]:
# Column the model predicts, followed by the input columns it is trained on.
target_variable = 'num_of_rides'
features = [
    'precipitation',
    'start_weekday',
    'max_temperature',
    'avg_num_rides_in_area_one_week_before',
]

### XGBoost

In [10]:
from demand_forecasting.train import training_xgb_model

# Hyperparameters forwarded to the training helper as keyword arguments.
xgb_hyperparameters = {
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100,
}

# Train on the feature frame using the target and feature columns chosen above.
model = training_xgb_model(
    df_with_features,
    target_variable=target_variable,
    features=features,
    **xgb_hyperparameters,
)

model

INFO:root:Trained XGB model


# Inference

In [13]:
from dateutil.relativedelta import relativedelta

# Read the prediction date from the PREDICTION_DATE environment variable.
# Nothing is raised here when the variable is unset: the missing value is
# handed straight to pd.to_datetime.
prediction_date_setting = os.getenv('PREDICTION_DATE')
date_of_prediction = pd.to_datetime(prediction_date_setting)

In [14]:
from demand_forecasting.features import generate_features_for_inference

# Build one feature row per area for inference from the area list, the verified
# preprocessed history and the verified weather data.
# NOTE(review): date_of_prediction is not passed to this call - confirm how the
# helper picks the date it generates features for.
inference_with_features_df = generate_features_for_inference(
    unique_h3index_df,
    preprocess_df,
    weather_df,
)
inference_with_features_df

INFO:root:Generated features for interference


Unnamed: 0,h3index_small,start_date,start_weekday,avg_num_rides_in_area_one_week_before,date,temperature,max_temperature,min_temperature,precipitation
0,891f2506107ffff,2020-08-30,Sunday,1.285714,2020-08-30,14.3,19.0,10.0,0.0
1,891f2506123ffff,2020-08-30,Sunday,11.142857,2020-08-30,14.3,19.0,10.0,0.0
2,891f2506127ffff,2020-08-30,Sunday,7.142857,2020-08-30,14.3,19.0,10.0,0.0
3,891f250612bffff,2020-08-30,Sunday,1.857143,2020-08-30,14.3,19.0,10.0,0.0
4,891f250612fffff,2020-08-30,Sunday,13.285714,2020-08-30,14.3,19.0,10.0,0.0
...,...,...,...,...,...,...,...,...,...
321,891f25a9eafffff,2020-08-30,Sunday,19.000000,2020-08-30,14.3,19.0,10.0,0.0
322,891f25a9eb3ffff,2020-08-30,Sunday,1.714286,2020-08-30,14.3,19.0,10.0,0.0
323,891f25a9eb7ffff,2020-08-30,Sunday,18.857143,2020-08-30,14.3,19.0,10.0,0.0
324,891f25a9ebbffff,2020-08-30,Sunday,10.714286,2020-08-30,14.3,19.0,10.0,0.0


In [16]:
from demand_forecasting.predict import predict

# Score the inference feature frame with the trained model, using the same
# `features` list that was passed to training.
final_df = predict(inference_with_features_df, features, model)

final_df

INFO:root:Made predictions


Unnamed: 0,h3index_small,date,predicted_num_of_rides
0,891f2506107ffff,2020-08-30,1
1,891f2506123ffff,2020-08-30,13
2,891f2506127ffff,2020-08-30,8
3,891f250612bffff,2020-08-30,2
4,891f250612fffff,2020-08-30,15
...,...,...,...
321,891f25a9eafffff,2020-08-30,21
322,891f25a9eb3ffff,2020-08-30,2
323,891f25a9eb7ffff,2020-08-30,21
324,891f25a9ebbffff,2020-08-30,11


# Linear regression (base model)



In [None]:
from app import config

# Take the target and feature names from the app's model configuration; this
# rebinds the target_variable and features names assigned earlier in the notebook.
model_config = config.MODEL_CONFIG
target_variable, features = model_config['target'], model_config['features']
features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# Keep only the rows where the one-week-before average is present.
all_feature_df = df_with_features.dropna(subset=['avg_num_rides_in_area_one_week_before'])

# Columns used for modelling: the configured features followed by the target.
columns = [*features, target_variable]

# Replace start_weekday with one indicator column per weekday value; the indicator
# columns are appended after the remaining feature/target columns.
dummy_df = pd.get_dummies(all_feature_df[columns]['start_weekday'])
model_df = pd.concat(
    [all_feature_df[columns].drop(columns=['start_weekday']), dummy_df],
    axis=1,
)
model_df

In [None]:
# Set aside evaluation data that is not used when training or tuning hyperparameters.
# The fixed random_state keeps the split the same on every run.
training_df, test_df = train_test_split(model_df, random_state=43, test_size=0.15)

# Inputs are every column except the target; targets stay single-column DataFrames.
X_train, y_train = training_df.drop(columns=target_variable), training_df[[target_variable]]
X_test, y_test = test_df.drop(columns=target_variable), test_df[[target_variable]]

X_test

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it on the training split in one step
# (fit returns the estimator itself).
lin_model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = lin_model.predict(X_test)
y_pred

In [None]:
# Ride counts cannot be negative: floor the predictions at zero, then cast to
# int (the cast drops the fractional part rather than rounding).
y_predctions = np.clip(y_pred, 0, None).astype('int')
y_predctions

In [None]:
# Plot the training data against the one-week-before average, with the clamped
# test-set predictions overlaid as a red line.
x_feature = 'avg_num_rides_in_area_one_week_before'

# The test rows are in shuffled order after train_test_split. Drawing a line
# through them as-is jumps back and forth across the plot, so the predictions
# are ordered by ascending x first.
# NOTE(review): the model uses several features, so the line can still move up
# and down between neighbouring x values; it is not a single fitted straight line.
x_test_values = X_test[x_feature].to_numpy()
plot_order = np.argsort(x_test_values, kind='stable')
y_pred_sorted = np.asarray(y_predctions)[plot_order]

plt.scatter(X_train[x_feature], y_train, alpha=0.8, label='Original Data')
plt.plot(x_test_values[plot_order], y_pred_sorted, 'r-', label='Linear Regression')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.show()

In [None]:
# R^2 of the fitted model. Note that it is computed on the training split,
# not on the held-out test split.
r_sq = lin_model.score(X=X_train, y=y_train)

# Report the fit quality together with the learned parameters.
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {lin_model.intercept_}")
print(f"coefficients: {lin_model.coef_}")

In [None]:
# Error metrics on the held-out test split, using the clamped integer predictions.
# The true values are selected via target_variable so this cell follows whatever
# target the configuration names, like the split cell above it does.
y_true = y_test[target_variable]

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root is taken explicitly; for this single-target case the value is the same.
root_mse = np.sqrt(mean_squared_error(y_true, y_predctions))
mae = mean_absolute_error(y_true, y_predctions)

print('root_mse', root_mse)
print('mae', mae)