In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta, date
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [2]:
df = pd.read_csv('D://stuff//uni//ai//project//data//traffic.csv', parse_dates=True, index_col='DateTime')
df.head()

Unnamed: 0_level_0,Junction,Vehicles,ID
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-11-01 00:00:00,1,15,20151101001
2015-11-01 01:00:00,1,13,20151101011
2015-11-01 02:00:00,1,10,20151101021
2015-11-01 03:00:00,1,7,20151101031
2015-11-01 04:00:00,1,9,20151101041


Dropping the ID column and splitting the date column into seprate columns

In [3]:
df.drop('ID', axis=1, inplace=True)

df['Year'] = pd.Series(df.index).apply(lambda x: x.year).to_list()
df['Month'] = pd.Series(df.index).apply(lambda x: x.month).to_list()
df['Day'] = pd.Series(df.index).apply(lambda x: x.day).to_list()
df['Hour'] = pd.Series(df.index).apply(lambda x: x.hour).to_list()

df.head()

Unnamed: 0_level_0,Junction,Vehicles,Year,Month,Day,Hour
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-01 00:00:00,1,15,2015,11,1,0
2015-11-01 01:00:00,1,13,2015,11,1,1
2015-11-01 02:00:00,1,10,2015,11,1,2
2015-11-01 03:00:00,1,7,2015,11,1,3
2015-11-01 04:00:00,1,9,2015,11,1,4


In [4]:
lag_df = df.copy()
for i in range(1, 3):
    lag_df[f'Vehicles_lag_{i}'] = df.Vehicles.shift(i)

lag_df.dropna(inplace=True)
lag_df.head()

Unnamed: 0_level_0,Junction,Vehicles,Year,Month,Day,Hour,Vehicles_lag_1,Vehicles_lag_2
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-11-01 02:00:00,1,10,2015,11,1,2,13.0,15.0
2015-11-01 03:00:00,1,7,2015,11,1,3,10.0,13.0
2015-11-01 04:00:00,1,9,2015,11,1,4,7.0,10.0
2015-11-01 05:00:00,1,6,2015,11,1,5,9.0,7.0
2015-11-01 06:00:00,1,9,2015,11,1,6,6.0,9.0


In [5]:
def get_list_data(dataf, drop=[]):
  dataf = [dataf[dataf.Junction == i].drop('Junction', axis=1) for i in range(5)]
  return dataf

In [6]:
data = get_list_data(df)
for i in data:
    print(i.head(2))

Empty DataFrame
Columns: [Vehicles, Year, Month, Day, Hour]
Index: []
                     Vehicles  Year  Month  Day  Hour
DateTime                                             
2015-11-01 00:00:00        15  2015     11    1     0
2015-11-01 01:00:00        13  2015     11    1     1
                     Vehicles  Year  Month  Day  Hour
DateTime                                             
2015-11-01 00:00:00         6  2015     11    1     0
2015-11-01 01:00:00         6  2015     11    1     1
                     Vehicles  Year  Month  Day  Hour
DateTime                                             
2015-11-01 00:00:00         9  2015     11    1     0
2015-11-01 01:00:00         7  2015     11    1     1
                     Vehicles  Year  Month  Day  Hour
DateTime                                             
2017-01-01 00:00:00         3  2017      1    1     0
2017-01-01 01:00:00         1  2017      1    1     1


In [7]:
lag_data = get_list_data(lag_df,  drop=['Year'])
for i in lag_data:
    print(i.head(2))

Empty DataFrame
Columns: [Vehicles, Year, Month, Day, Hour, Vehicles_lag_1, Vehicles_lag_2]
Index: []
                     Vehicles  Year  Month  Day  Hour  Vehicles_lag_1  \
DateTime                                                                
2015-11-01 02:00:00        10  2015     11    1     2            13.0   
2015-11-01 03:00:00         7  2015     11    1     3            10.0   

                     Vehicles_lag_2  
DateTime                             
2015-11-01 02:00:00            15.0  
2015-11-01 03:00:00            13.0  
                     Vehicles  Year  Month  Day  Hour  Vehicles_lag_1  \
DateTime                                                                
2015-11-01 00:00:00         6  2015     11    1     0            78.0   
2015-11-01 01:00:00         6  2015     11    1     1             6.0   

                     Vehicles_lag_2  
DateTime                             
2015-11-01 00:00:00            84.0  
2015-11-01 01:00:00            78.0  
        

Creating a ML model

In [8]:
class Model:
  def __init__(self, name, data, predict_features, test_size, ml_model):
    self.name = name
    self.data = data
    self.predict_features = predict_features
    self.is_trained = False
    self.test_size = test_size
    self.ml_model = ml_model
    self.do_things()
    
  def cal_mae(self):
    self.mae = mean_absolute_error(self.ytest, self.ypredict)
    return self.mae

  def cal_mse(self):
    self.mse = mean_squared_error(self.ytest, self.ypredict)
    return self.mse

  def cal_rmse(self):
    self.rmse = mean_squared_error(self.ytest, self.ypredict, squared=False)
    return self.rmse

  def cal_mape(self):
    self.mape = mean_absolute_percentage_error(self.ytest, self.ypredict)
    return self.mape


  def split_data(self, test_size):
    self.features = [i for i in self.data.columns if i != self.predict_features]
    self.X = self.data[self.features].values
    self.y = self.data[self.predict_features].values
    self.Xtrain, self.Xtest, self.ytrain, self.ytest = train_test_split(self.X, self.y, test_size=test_size)
    return None

  def fit(self):
    self.is_trained = True
    self.ml_model.fit(self.Xtrain, self.ytrain)
    self.ypredict = self.ml_model.predict(self.Xtest)
    return self.ml_model

  def do_things(self) -> None:
    self.split_data(self.test_size)
    self.fit()
    self.cal_mse()
    self.cal_rmse()
    self.cal_mae()
    self.cal_mape()
    return None

  def __repr__(self) -> str:
    if not self.is_trained:
      return f'<{self.name}> (is not trained yet)>'
    return f'<({self.name}: [MAE: {self.mae}], [MSE: {self.mse}], [RMSE: {self.rmse}], [MAPE: {self.mape}] )>'

In [9]:
def make_metrics(models):
    data = {
        'name': [model.name for model in models[1:]],
        'mae': [model.mae for model in models[1:]],
        'mse': [model.mse for model in models[1:]],
        'rmse': [model.rmse for model in models[1:]],
        'mape': [model.mape for model in models[1:]]
    }
    return pd.DataFrame(data)

In [10]:
models = [None]
for i in range(1, 5):
    models += [
        Model(
            ml_model=SVR(),
            name=f'Dataset of junction {i}',
            data=data[i],
            predict_features='Vehicles',
            test_size=1/4
        )
    ]
    
make_metrics(models)

Unnamed: 0,name,mae,mse,rmse,mape
0,Dataset of junction 1,18.164989,557.012676,23.601116,0.500544
1,Dataset of junction 2,5.518492,59.219394,7.695414,0.462457
2,Dataset of junction 3,6.938161,114.204462,10.686649,0.716537
3,Dataset of junction 4,2.675906,13.710346,3.702748,0.523129


In [None]:
lag_models = [None]
for i in range(1, 5):
    lag_models += [
        Model(
            ml_model=SVR(),
            name=f'Dataset of junction {i} with lag data',
            data=lag_data[i],
            predict_features='Vehicles',
            test_size=1/3
        )
    ]

make_metrics(lag_models)

Prediction for the next year

In [None]:
for junction in range(1, 5):
    cur_time = lag_data[junction].tail(1).index[0] 
    end_time = pd.Timestamp(2018, 7, 1, 0, 0, 0)
    new_data = lag_data[junction].copy()
    features = lag_models[junction].features 
    while cur_time != end_time:
        last = new_data.tail(1).copy() 
        new_data = pd.concat([new_data, last]) 
        for i in range(1, 3):
            new_data[f'Vehicles_lag_{i}'] = new_data.Vehicles.shift(i) 
        new_data.iloc[len(new_data) - 1, [1, 2, 3]] = [cur_time.month, cur_time.day, cur_time.hour] 
        last = new_data[features].tail(1).values 
        new_data.iloc[len(new_data) - 1, 0] = round(lag_models[junction].ml_model.predict(last)[0]) 
        cur_time += timedelta(hours=1) 
    new_data.index = pd.date_range(
        start=lag_data[junction].head(1).index.values[0],
        end=pd.Timestamp(2018, 7, 1, 0, 0, 0),
        freq='H'
    )
    new_data.to_csv(f'peridiction_of_vehicals_for_the_next_year_for_junction_{junction}.csv') 
    print(f'|==Predicted for Junction {junction}==|')