In [1]:
import pandas as pd
import pickle
import numpy as np
import os
import csv
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz
#import matplotlib.pyplot as plt
#import matplotlib.patches as mpatches

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open('random_forest_results_optimised_final.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Route", "Direction", "Mean_Absolute_Error", "Mean_Squared_Error", "Root_Mean_Squared_Error", "R2", "MAE_Train", "MPE"])

In [3]:
files = os.listdir("/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/")
files
print(len(files))

253


In [4]:
def random_forest(file_name, route, direction):
    """Takes a file and trains a linear regression on that file, returns a pickle and writes accuracy to .csv """
    #read in file
    
    filepath = "/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/" + file_name
    df = pd.read_csv(filepath, keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)
    
    #find the main subroute to run linear regression on
    subroutes = pd.read_csv("~/Desktop/Trimester_3/My_work/routes_subroutes-04072020.csv", keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)
    only_route = subroutes[subroutes.LINEID==str(route)]
    route_direction = only_route[only_route.DIRECTION==int(direction)]
    subroute_list = list(route_direction.MAINROUTE.unique())
    main_subroute = subroute_list[0][2:-2]
    print("main as is", main_subroute)

    
    #limit the dataframe to only journeys on the main subroute
    main_route = df[df["ROUTEID"] ==main_subroute]
    
    #limit the dataframe to only the first and last stop
    progrnumbers = list(main_route.PROGRNUMBER.unique())
    last_progrnumber = max(progrnumbers)
    only_first = main_route[main_route["PROGRNUMBER"]==1]
    only_last = main_route[main_route["PROGRNUMBER"]==int(last_progrnumber)]
    
    #renaming fields pertaining to the last stop before merging those fields into the dataframe for the first stop
    for_merging = only_last[["PLANNEDTIME_ARR", "ACTUALTIME_ARR", "UNIQUE_TRIP", "DELAYARR", "DELAYDEP"]]
    for_merging.columns = ["PLANNEDTIME_ARR_LAST", "ACTUALTIME_ARR_LAST", "UNIQUE_TRIP", "DELAYARR_LAST", "DELAYDEP_LAST"]
    
    result = pd.merge(only_first,
                        for_merging[["PLANNEDTIME_ARR_LAST", "ACTUALTIME_ARR_LAST", "UNIQUE_TRIP", "DELAYARR_LAST", "DELAYDEP_LAST"]],
                        on='UNIQUE_TRIP', 
                        how='left')
    
    #drop any empty rows
    reduced_result = result.dropna()
    
    #changing dtype of 'ACTUAL_ARR_LAST' to int
    reduced_result['ACTUALTIME_ARR_LAST'] = reduced_result['ACTUALTIME_ARR_LAST'].astype('int64')
    
    #creating journeytime feature
    reduced_result['JOURNEYTIME'] = reduced_result["ACTUALTIME_ARR_LAST"] - reduced_result["ACTUALTIME_DEP"]
    reduced_result['PLANNED_JOURNEYTIME'] = reduced_result["PLANNEDTIME_ARR_LAST"] - reduced_result["PLANNEDTIME_DEP"]
    
    
    #drop columns I no longer need
    reduced_result = reduced_result.drop(columns=['date', 'timezone', 'DAYOFSERVICE','PROGRNUMBER','DELAYDEP', 'ROUTEID', 'DELAYARR', 'DIRECTION', 'UNIQUE_TRIP', 'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR_LAST', 'ACTUALTIME_ARR_LAST', 'DELAYARR_LAST', 'DELAYDEP_LAST', 'STOPPOINTID'])

    #prepare dataframe for modelling
    reduced_result['weather_main'] = reduced_result['weather_main'].astype('category')
    reduced_result['weather_description'] = reduced_result['weather_description'].astype('category')
    reduced_result['MONTH'] = reduced_result['MONTH'].astype('category')
    reduced_result['DAYOFWEEK'] = reduced_result['DAYOFWEEK'].astype('category')
    reduced_result['TIME_GROUP'] = reduced_result['TIME_GROUP'].astype('category')
    df_dummies = pd.get_dummies(reduced_result)
    
    #remove negative journey times
    reduced_result = reduced_result[(reduced_result['JOURNEYTIME'] > 0)]
    
    
    #split the dataset into target and the rest
    # y is the target
    y = df_dummies["JOURNEYTIME"]
    # X is everything else
    X = df_dummies.drop(["JOURNEYTIME", "PLANNED_JOURNEYTIME"],1)
    
    #split into test/train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.1)

    #reset indices
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
    #train a random forest model
    rf = RandomForestRegressor(n_estimators=32).fit(X_train, y_train)
    
    #get predictions on training set
    train_predictions = rf.predict(X_train)

    
    #get predictions on test set
    rf_predictions_test = rf.predict(X_test)
    
    #get results
    MAE_train = metrics.mean_absolute_error(y_train, train_predictions)
    MAE =  metrics.mean_absolute_error(y_test, rf_predictions_test)
    MSE =  metrics.mean_squared_error(y_test, rf_predictions_test)
    RMSE = np.sqrt(metrics.mean_squared_error(y_test, rf_predictions_test))
    R2 = metrics.r2_score(y_test, rf_predictions_test)
    MPE = np.mean((y_test - rf_predictions_test)/y_test)
    
    with open('random_forest_results_optimised_final.csv', 'a', newline='') as file:
                    writer = csv.writer(file)
                    writer.writerow([route, direction, MAE, MSE, RMSE, R2, MAE_train, MPE])
                

    
    #dump to pickle file
    #file_name = str(file_name)
    #foo = file_name[0:-4]
    #pickle_file = foo + ".pickle"
    #pickle_file_path = "/Users/laura/Desktop/pickles_new/" + pickle_file
    
    #with open(pickle_file_path, 'wb') as f:
        #pickle.dump(multiple_linreg, f)
        #print(pickle_file_path, " dumped")
    

In [5]:
for file in files:
    if str(file)[-4:] == ".csv":
        route_split = file.split("_", 1)
        direction = int(file[-5:-4])
        route = route_split[0]
        print(route, direction, " in progress.")
        random_forest(file, route, direction)

39 2  in progress.
main as is 39_21
9 2  in progress.
main as is 9_7
77X 2  in progress.
main as is 77X_61
40D 1  in progress.
main as is 40D_102
47 1  in progress.
main as is 47_139
40 2  in progress.
main as is 40_31
140 2  in progress.
main as is 140_21
84A 1  in progress.
main as is 84A_1
123 1  in progress.
main as is 123_34
1 2  in progress.
main as is 1_40
31 2  in progress.
main as is 31_18
39 1  in progress.
main as is 39_20
9 1  in progress.
main as is 9_5
40D 2  in progress.
main as is 40D_104
41A 2  in progress.
main as is 41A_22
47 2  in progress.
main as is 47_136
140 1  in progress.
main as is 140_19
40 1  in progress.
main as is 40_27
84A 2  in progress.
main as is 84A_3
123 2  in progress.
main as is 123_36
1 1  in progress.
main as is 1_37
31 1  in progress.
main as is 31_15
41 1  in progress.
main as is 41_3
25A 2  in progress.
main as is 25A_270
53 2  in progress.
main as is 53_21
83A 2  in progress.
main as is 83A_23
25 1  in progress.
main as is 25_272
7 2  in pro

main as is 18_3
69 1  in progress.
main as is 69_45
161 1  in progress.
main as is 161_50
61 1  in progress.
main as is 61_106
66B 1  in progress.
main as is 66B_58
238 2  in progress.
main as is 238_15
76A 1  in progress.
main as is 76A_28
27A 2  in progress.
main as is 27A_5
66 2  in progress.
main as is 66_18
31A 1  in progress.
main as is 31A_25
102 2  in progress.
main as is 102_9
17 2  in progress.
main as is 17_15
83 2  in progress.
main as is 83_22
79A 1  in progress.
main as is 79A_27
15A 2  in progress.
main as is 15A_84
14C 1  in progress.
main as is 14C_17
38B 2  in progress.
main as is 38B_44
84 1  in progress.
main as is 84_27
184 1  in progress.
main as is 184_29
18 2  in progress.
main as is 18_4
118 2  in progress.
main as is 118_4
69 2  in progress.
main as is 69_47
61 2  in progress.
main as is 61_108
161 2  in progress.
main as is 161_51
66B 2  in progress.
main as is 66B_59
238 1  in progress.
main as is 238_11
76A 2  in progress.
main as is 76A_29
27A 1  in progre