<a href="https://colab.research.google.com/github/sophiabj/03-presidential-election/blob/master/Road_Safety_Time_Series_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Problem statement: Can we predict the dangerous and safest times to travel in United Kingdom (UK)?

Data: The data contains information on road accidents that occurred in UK in 2016. It includes details of casualties and vehicles involved in the accidents.



In [41]:
#Output directory
output_dir = "."

In [42]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp
import os

In [43]:
#Preprocessing component
def preprocess (data_path):
  import pickle
  import os
  import sys, subprocess;
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'pip==20.2.4'])
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
  #subprocess.run([sys.executable, '-m', '!pip', 'install', 'scikit-learn==0.22'])
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'numpy==1.17.1'])
    
  import pandas as pd
  import numpy as np

 

  main = pd.read_csv("https://raw.githubusercontent.com/sophiabj/stage-f-01-road-safety/master/data/dftRoadSafety_Accidents_2016.csv")
  
  pd.set_option("display.max_columns", 32)
 
  main.fillna(method='ffill',inplace=True)
 
  main.to_csv('Time_series', index=False)
 
  Time_series = pd.read_csv('Time_series', infer_datetime_format=True, parse_dates={'datetime':[9,11]},
                 index_col=['datetime'], header = 0,)
 
  Time_series_2 = Time_series[['Accident_Index','Number_of_Casualties']]
  
  Time_series_2.isna().sum()
  
  Time_series_2.index = pd.to_datetime(Time_series.index)
 
  df_daily = Time_series_2.resample('D').mean()
    
  df_daily = df_daily.reset_index()

  df_daily = df_daily.rename(columns={'datetime': 'ds', 'Number_of_Casualties': 'y'})
  
  train2 = df_daily[(df_daily['ds'] > '2016-01-01') & (df_daily['ds'] <= '2016-12-01')]
  
  test = df_daily[(df_daily['ds'] > '2016-12-01')]
    
  #Save data              
  with open(f'{data_path}/preprocessed_data', 'wb') as f:
        pickle.dump((train2), f)
        
  (print("Preprocessing Done"))
  return pd.DataFrame(train2).shape  


In [44]:
preprocess(output_dir)

  if (await self.run_code(code, result,  async_=asy)):


Preprocessing Done


(335, 2)

Training the Model

In [45]:
#Training component
def train(data_path):
  import pickle
  import sys 
  import os
  import subprocess;
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'pip==20.2.4'])
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'numpy==1.16.1'])
  subprocess.run([sys.executable, '-m', 'pip', 'install','fbprophet', 'Prophet', 'plotly', ])
  
  import pandas as pd
  import numpy as np
  from fbprophet import Prophet

  #load saved preprocessed data 
  with open(f'{data_path}/preprocessed_data', 'rb') as f:
       preprocessed_data = pickle.load(f)
    
  train2 = preprocessed_data
 
  model = Prophet(weekly_seasonality=True, yearly_seasonality=True,seasonality_mode='multiplicative',
                daily_seasonality=True)

  train3 = model.fit(train2)

  #Save the model
  with open(f'{data_path}/model', 'wb') as file:
    pickle.dump((train3), file)

  return (print("Model Trained"))
  
  


In [46]:
train(output_dir)

Model Trained


In [47]:
#Prediction component
def predict (data_path):
  import pickle
  import sys, subprocess;
  subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas', 'scikit-learn'])
  subprocess.run([sys.executable, '-m', 'pip', 'install','fbprophet', 'Prophet', 'plotly', ])
  from sklearn.metrics import mean_absolute_error as MAE
  from sklearn.metrics import mean_squared_error as MSE
  from sklearn.metrics import r2_score
  from fbprophet import Prophet

  #Load saved model
  with open(f'{data_path}/model','rb') as file:
    model = pickle.load(file)

  #Evaluate model and print results
  future_date = model.make_future_dataframe(periods=150, freq='D')

  forecast = model.predict(future_date)
  #forecast[['ds','yhat', 'trend', 'yhat_lower','yhat_upper', 'trend_lower', 'trend_upper']].head()

  print('Model \nr2_score = {} \nMAE = {} \nMSE = {}' .format(r2_score(forecast.trend, forecast.yhat),
        MAE (forecast.trend, forecast.yhat), MSE (forecast.trend, forecast.yhat) ))
  
  #save result
  with open(f'{data_path}/model_result.txt', 'wb') as result:
    pickle.dump(forecast, result)
    
  
  print("Prediction saved!")


In [48]:
predict(output_dir)

Model 
r2_score = -62.10180036111323 
MAE = 0.039647263929478836 
MSE = 0.0024535282891507445
Prediction saved!


In [49]:
#Packaging components
preprocess_op = comp.func_to_container_op(preprocess , base_image = "tensorflow/tensorflow:latest-gpu-py3")
train_op = comp.func_to_container_op(train , base_image = "tensorflow/tensorflow:latest-gpu-py3")
predict_op = comp.func_to_container_op(predict , base_image = "tensorflow/tensorflow:latest-gpu-py3")

In [50]:
#connecting to kfp
import kfp
client = kfp.Client(host='51d2cc85769d1448-dot-us-central2.pipelines.googleusercontent.com')


In [51]:
# Define the pipeline
@dsl.pipeline(
   name='Road Safety Pipeline',
   description='An ML pipeline that predicts safest and dangerous times to travel.'
)

# Define parameters to be fed into pipeline
def road_safety_container_pipeline(
    data_path: str,
    model_file: str
):
    
    # Define volume to share data between components.
    vop = dsl.VolumeOp(
    name="create_volume",
    resource_name="data-volume", 
    size="1Gi", 
    modes=dsl.VOLUME_MODE_RWO)
    
    # Create road safety training component.
    road_safety_preprocess_container = preprocess_op(data_path).add_pvolumes({data_path: vop.volume})
    
    road_safety_training_container = train_op(data_path) \
                                    .add_pvolumes({data_path: road_safety_preprocess_container.pvolume})

    # Create road safety prediction component.
    road_safety_predict_container = predict_op(data_path) \
                                    .add_pvolumes({data_path: road_safety_training_container.pvolume})
    
    # Print the result of the prediction
    road_safety_result_container = dsl.ContainerOp(
        name="print_prediction",
        image='library/bash:4.4.23',
        pvolumes={data_path: road_safety_predict_container.pvolume},
        arguments=['head', f'{data_path}/model_result.txt']
    )

In [52]:
DATA_PATH = '/mnt'
MODEL_PATH='road_safety_model.h5'

In [53]:
pipeline_func = road_safety_container_pipeline

In [54]:
experiment_name = 'road_safety_kubeflow'
run_name = pipeline_func.__name__ + ' run'

arguments = {"data_path":DATA_PATH,
             "model_file":MODEL_PATH}

# Compile pipeline to generate compressed YAML definition of the pipeline.
kfp.compiler.Compiler().compile(pipeline_func,  
  '{}.zip'.format(experiment_name))

# Submit pipeline directly from pipeline function
run_result = client.create_run_from_pipeline_func(pipeline_func, 
                                                  experiment_name=experiment_name, 
                                                  run_name=run_name, 
                                                  arguments=arguments)

