In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import gc
from math import floor, ceil

from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error, f1_score, roc_auc_score,  r2_score, mean_absolute_error,mean_squared_error, recall_score, precision_score


from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, DMatrix
from sklearn.ensemble import AdaBoostRegressor

from prophet import Prophet


In [32]:
!pip install Catboost

Collecting Catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Catboost
Successfully installed Catboost-1.2.3


In [4]:
df = pd.read_csv('traffic_clean.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,DateTime,Junction,Vehicles,ID,Year,Month,day_of_month,day_of_week,Date,Time,day_of_year,Seconds
0,0,2015-11-01 00:00:00+00:00,1,15,20151101001,2015,11,1,6,2015-11-01,0,305,0
1,1,2015-11-01 01:00:00+00:00,1,13,20151101011,2015,11,1,6,2015-11-01,1,305,3600
2,2,2015-11-01 02:00:00+00:00,1,10,20151101021,2015,11,1,6,2015-11-01,2,305,7200
3,3,2015-11-01 03:00:00+00:00,1,7,20151101031,2015,11,1,6,2015-11-01,3,305,10800
4,4,2015-11-01 04:00:00+00:00,1,9,20151101041,2015,11,1,6,2015-11-01,4,305,14400


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48120 entries, 0 to 48119
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    48120 non-null  int64 
 1   DateTime      48120 non-null  object
 2   Junction      48120 non-null  int64 
 3   Vehicles      48120 non-null  int64 
 4   ID            48120 non-null  int64 
 5   Year          48120 non-null  int64 
 6   Month         48120 non-null  int64 
 7   day_of_month  48120 non-null  int64 
 8   day_of_week   48120 non-null  int64 
 9   Date          48120 non-null  object
 10  Time          48120 non-null  int64 
 11  day_of_year   48120 non-null  int64 
 12  Seconds       48120 non-null  int64 
dtypes: int64(11), object(2)
memory usage: 4.8+ MB


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,DateTime,Junction,Vehicles,ID,Year,Month,day_of_month,day_of_week,Date,Time,day_of_year,Seconds
0,0,2015-11-01 00:00:00+00:00,1,15,20151101001,2015,11,1,6,2015-11-01,0,305,0
1,1,2015-11-01 01:00:00+00:00,1,13,20151101011,2015,11,1,6,2015-11-01,1,305,3600
2,2,2015-11-01 02:00:00+00:00,1,10,20151101021,2015,11,1,6,2015-11-01,2,305,7200
3,3,2015-11-01 03:00:00+00:00,1,7,20151101031,2015,11,1,6,2015-11-01,3,305,10800
4,4,2015-11-01 04:00:00+00:00,1,9,20151101041,2015,11,1,6,2015-11-01,4,305,14400


In [10]:
#Encoding the Date Column
encoder =LabelEncoder()
df['Date'] = encoder.fit_transform(df['Date'])
import pickle
output = open('Labelencoder.pkl', 'wb')
pickle.dump(encoder, output)
output.close()

In [12]:
df1 = df.copy()
df['DateTime'] = pd.to_datetime(df['DateTime'])
# Converting the DateTime Column into integer
df['DateTime'] = df['DateTime'].values.astype(np.int64) / 10 ** 9

### cross validation

In [18]:
def cross_validation(X, y, model, splits=10, seed=42):
  tscv = TimeSeriesSplit(n_splits=splits)
  RMSE_scores = []  # List to store RMSE scores
  MAPE_scores = []  # List to store MAPE scores
  MAE_scores = []   # List to store MAE scores
  predictions = []  # List to store predictions per split
  model =  model     # Initialize the model

  i = 0
  for train, test in tscv.split(X):
      x_train, x_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
      model.fit(x_train, y_train)
      RMSE = mean_squared_error(y_test, model.predict(x_test), squared=False)
      MAPE = mean_absolute_percentage_error(y_test, model.predict(x_test)) * 100
      MAE = mean_absolute_error(y_test, model.predict(x_test))

      print(f'Splits Trained: {i} \t RMSE:  {RMSE}  \t MAPE: {MAPE}  \t MAE: {MAE}')
      RMSE_scores.append(RMSE)
      MAPE_scores.append(MAPE)
      MAE_scores.append(MAE)
      i += 1

  # Calculate the mean of validation scores
  print('Average RMSE: ', np.mean(RMSE_scores))
  print('Average MAPE: ', np.mean(MAPE_scores))
  print('Average MAE: ', np.mean(MAE_scores))

  return predictions


In [22]:
#Use Target Encoding to create new features  with aggregates : Std, Max,Min, Mean and Median
def generate_encoded_features(dataframe):
    selected_features = ['Month', 'Quarter', 'day_of_month', 'day_of_week', 'Time', 'day_of_year']
    for feature in tqdm(selected_features):
    # Aggregate vehicles by selected feature
      aggregate_data = dataframe.groupby(feature)['Vehicles'].agg(['std', 'max', 'min', 'mean', 'median'])
      # Rename columns
      aggregate_data.columns = [feature + '_' + c + '_vehicles' for c in aggregate_data.columns]
      # Convert data types
      aggregate_data = aggregate_data.astype({c: np.float32 for c in aggregate_data.columns})
      # Reset index
      aggregate_data.reset_index(inplace=True)
      # Merge aggregated data with the original dataframe
      dataframe = dataframe.merge(aggregate_data, on=feature, how='left')
      gc.collect()
      return dataframe

df2 = generate_encoded_features(df)

  0%|          | 0/6 [00:00<?, ?it/s]


In [None]:
df2.isnull().sum()

In [23]:
len(df2)

48120

In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,DateTime,Junction,Vehicles,ID,Year,Month,day_of_month,day_of_week,Date,Time,day_of_year,Seconds
0,0,1446336000.0,1,15,20151101001,2015,11,1,6,0,0,305,0
1,1,1446340000.0,1,13,20151101011,2015,11,1,6,0,1,305,3600
2,2,1446343000.0,1,10,20151101021,2015,11,1,6,0,2,305,7200
3,3,1446347000.0,1,7,20151101031,2015,11,1,6,0,3,305,10800
4,4,1446350000.0,1,9,20151101041,2015,11,1,6,0,4,305,14400


In [43]:
df_junction1= df2.loc[df2['Junction']==1]
df_junction2= df2.loc[df2['Junction']==2]
df_junction3= df2.loc[df2['Junction']==3]
df_junction4= df2.loc[df2['Junction']==4]

df_junction1.reset_index(inplace=True, drop=True)
df_junction2.reset_index(inplace=True, drop=True)
df_junction3.reset_index(inplace=True, drop=True)
df_junction4.reset_index(inplace=True, drop=True)

## Junction1 Model

In [44]:
#Declaring Input and Output Variables
X = df_junction1.drop(['Vehicles','Seconds', 'ID', 'Junction'], axis=1)
y = df_junction1['Vehicles']

In [29]:
a= ceil(0.8 * len(df_junction1))
X_train=X.iloc[:a]
y_train= y.iloc[:a]
X_valid=X.iloc[a:]
y_valid=y.iloc[a:]

X_valid.head()

Unnamed: 0.1,Unnamed: 0,DateTime,Year,Month,day_of_month,day_of_week,Date,Time,day_of_year,Month_std_vehicles,Month_max_vehicles,Month_min_vehicles,Month_mean_vehicles,Month_median_vehicles
11674,11674,1488362000.0,2017,3,1,2,486,10,60,20.499683,110.0,1.0,23.073349,16.0
11675,11675,1488366000.0,2017,3,1,2,486,11,60,20.499683,110.0,1.0,23.073349,16.0
11676,11676,1488370000.0,2017,3,1,2,486,12,60,20.499683,110.0,1.0,23.073349,16.0
11677,11677,1488373000.0,2017,3,1,2,486,13,60,20.499683,110.0,1.0,23.073349,16.0
11678,11678,1488377000.0,2017,3,1,2,486,14,60,20.499683,110.0,1.0,23.073349,16.0


In [36]:
#LinearRegression
lg =  LinearRegression()
lg.fit(X_train, y_train)
lg_pred = lg.predict(X_valid)
lg_RMSE = mean_squared_error(y_valid,lg_pred, squared=False)
lg_MAPE= mean_absolute_percentage_error(y_valid,lg_pred)*100

print("LinearRegression RMSE score on validation set is : ",lg_RMSE)
print("LinearRegression MAPE score on validation set is : ",lg_MAPE, '%')

print('--------'*20)

#LassoRegression
l =  Lasso()
l.fit(X_train, y_train)
l_pred = l.predict(X_valid)
l_RMSE = mean_squared_error(y_valid,l_pred, squared=False)
l_MAPE= mean_absolute_percentage_error(y_valid,l_pred)*100

print("LassoRegression RMSE score on validation set is : ",l_RMSE)
print("LassoRegression MAPE score on validation set is : ",l_MAPE, '%')

print('--------'*20)

#RidgeRegression
rr =  Ridge()
rr.fit(X_train, y_train)
rr_pred = rr.predict(X_valid)
rr_RMSE = mean_squared_error(y_valid,rr_pred, squared=False)
rr_MAPE= mean_absolute_percentage_error(y_valid,rr_pred)*100

print("RidgeRegression RMSE score on validation set is : ",rr_RMSE)
print("RidgeRegression MAPE score on validation set is : ",rr_MAPE, '%')

print('--------'*20)


# Catboost
cat = CatBoostRegressor(verbose=False, iterations=100)
cat.fit(X_train, y_train)
cat_pred = cat.predict(X_valid)
cat_RMSE = mean_squared_error(y_valid,cat_pred, squared=False)
cat_MAPE= mean_absolute_percentage_error(y_valid,cat_pred)*100

print("CatBoost RMSE score on validation set is : ",cat_RMSE)
print("CatBoost MAPE score on validation set is : ",cat_MAPE, '%')

print('--------'*20)

# LGBM
# lgb = LGBMRegressor()#num_iterations=1000)
# lgb.fit(X_train, y_train)
# lgb_pred = lgb.predict(X_valid)
# lgb_RMSE = mean_squared_error(y_valid,lgb_pred, squared=False)
# lgb_MAPE= mean_absolute_percentage_error(y_valid,lgb_pred)*100

# print("LGBM RMSE score on validation set is : ",lgb_RMSE)
# print("LGBM MAPE score on validation set is : ",lgb_MAPE, '%')

# print('--------'*20)

# XGBoost
xg = XGBRegressor()#n_estimators=1000)
xg.fit(X_train, y_train)
xg_pred = xg.predict(X_valid)
xg_RMSE = mean_squared_error(y_valid,xg_pred, squared=False)
xg_MAPE= mean_absolute_percentage_error(y_valid,xg_pred)*100

print("XGBoost RMSE score on validation set is : ",xg_RMSE)
print("XGBoost MAPE score on validation set is : ",xg_MAPE, '%')

print('--------'*20)


#AdaBoost
ab =  AdaBoostRegressor()
ab.fit(X_train, y_train)
ab_pred = ab.predict(X_valid)
ab_RMSE = mean_squared_error(y_valid,ab_pred, squared=False)
ab_MAPE= mean_absolute_percentage_error(y_valid,ab_pred)*100

print("AdaBoost RMSE score on validation set is : ",ab_RMSE)
print("AdaBoost MAPE score on validation set is : ",ab_MAPE, '%')

print('--------'*20)



#Naive Bayes
nb =  GradientBoostingRegressor()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_valid)
nb_RMSE = mean_squared_error(y_valid,nb_pred, squared=False)
nb_MAPE= mean_absolute_percentage_error(y_valid,nb_pred)*100

print("Gradient Boosting RMSE score on validation set is : ",nb_RMSE)
print("Gradient Boosting MAPE score on validation set is : ",nb_MAPE, '%')

print('--------'*20)

#DecisionTree
dt =  DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_valid)
dt_RMSE = mean_squared_error(y_valid,dt_pred, squared=False)
dt_MAPE= mean_absolute_percentage_error(y_valid,dt_pred)*100

print("DecisionTree RMSE score on validation set is : ",dt_RMSE)
print("DecisionTree MAPE score on validation set is : ",dt_MAPE, '%')

print('--------'*20)

#RandomForest
rf =  RandomForestRegressor()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_valid)
rf_RMSE = mean_squared_error(y_valid,rf_pred, squared=False)
rf_MAPE= mean_absolute_percentage_error(y_valid,rf_pred)*100

print("RandomForest RMSE score on validation set is : ",rf_RMSE)
print("RandomForest MAPE score on validation set is : ",rf_MAPE, '%')

print('--------'*20)



#SVC
svc =  SVR()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_valid)
svc_RMSE = mean_squared_error(y_valid,svc_pred, squared=False)
svc_MAPE= mean_absolute_percentage_error(y_valid,svc_pred)*100

print("SVC RMSE score on validation set is : ",svc_RMSE)
print("SVC MAPE score on validation set is : ",svc_MAPE, '%')

print('--------'*20)

#LinearSVC
lsvc =  LinearSVR()
lsvc.fit(X_train, y_train)
lsvc_pred = lsvc.predict(X_valid)
lsvc_RMSE = mean_squared_error(y_valid,lsvc_pred, squared=False)
lsvc_MAPE= mean_absolute_percentage_error(y_valid,lsvc_pred)*100
print("LinearSVC RMSE score on validation set is : ",lsvc_RMSE)
print("LinearSVC MAPE score on validation set is : ",lsvc_MAPE, '%')

print('--------'*20)

LinearRegression RMSE score on validation set is :  18.207954742562094
LinearRegression MAPE score on validation set is :  24.903737620038665 %
----------------------------------------------------------------------------------------------------------------------------------------------------------------
LassoRegression RMSE score on validation set is :  18.08470215910074
LassoRegression MAPE score on validation set is :  26.88775762394399 %
----------------------------------------------------------------------------------------------------------------------------------------------------------------
RidgeRegression RMSE score on validation set is :  18.13771542117801
RidgeRegression MAPE score on validation set is :  25.02262167029578 %
----------------------------------------------------------------------------------------------------------------------------------------------------------------


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


CatBoost RMSE score on validation set is :  11.294949094707631
CatBoost MAPE score on validation set is :  11.351950852444665 %
----------------------------------------------------------------------------------------------------------------------------------------------------------------
XGBoost RMSE score on validation set is :  10.498097673240125
XGBoost MAPE score on validation set is :  10.146681975589727 %
----------------------------------------------------------------------------------------------------------------------------------------------------------------
AdaBoost RMSE score on validation set is :  13.517557055082046
AdaBoost MAPE score on validation set is :  18.94595422504585 %
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Gradient Boosting RMSE score on validation set is :  10.892593327173664
Gradient Boosting MAPE score on validation set is :  10.80542636



### Prophet model1

In [45]:
a= ceil(0.8 * len(df_junction1))
a
train=df_junction1.iloc[:a]
test=df_junction1.iloc[a:]

train['DateTime'] =list(df1.loc[df1['Junction']==1]['DateTime'].iloc[:a])
test['DateTime'] =list(df1.loc[df1['Junction']==1]['DateTime'].iloc[a:])

train.drop(['Seconds', 'ID', 'Junction'], axis=1, inplace=True)
test.drop(['Seconds', 'ID', 'Junction'], axis=1, inplace=True)
train = train.rename(columns = {"DateTime" : "ds", "Vehicles" : "y"})
test = test.rename(columns = {"DateTime" : "ds", "Vehicles" : "y"})

train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['DateTime'] =list(df1.loc[df1['Junction']==1]['DateTime'].iloc[:a])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['DateTime'] =list(df1.loc[df1['Junction']==1]['DateTime'].iloc[a:])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['Seconds', 'ID', 'Junction'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice fr

In [46]:
model = Prophet(yearly_seasonality=True)
for col in train.columns:
    if col != 'y' and col != 'ds':
        model.add_regressor(col)
model.fit(train)

ValueError: Column ds has timezone specified, which is not supported. Remove timezone.

In [None]:
# making predictions for the next 10 months
X_test = test.drop('y', axis=1)
y_test= test['y']

forecast = model.predict(X_test)
pred = forecast['yhat']
forecast[[ 'ds', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'trend_lower', 'trend_upper' ]].head()

In [None]:
prof_MAPE= mean_absolute_percentage_error(y_test,pred)*100
print("Prophet Model MAPE score on validation set is : {:.2f} ".format(prof_MAPE))

In [None]:
prof_RMSE= mean_squared_error(y_test,pred, squared=False)
print("Prophet Model RMSE score on validation set is : {:.2f} ".format(prof_RMSE))