In [1]:
import pandas as pd
import pickle
import numpy as np
import os
import csv
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Create the results file with its header row only when it does not exist yet
# (or is empty). Writing the header unconditionally in append mode would insert
# another header row between the results every time this cell is re-run.
results_path = 'neural_network_random_forest_combo_results_final.csv'
if not os.path.exists(results_path) or os.path.getsize(results_path) == 0:
    with open(results_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Route", "Direction", "Mean_Absolute_Error", "Mean_Squared_Error", "Root_Mean_Squared_Error", "R2", "MAE_Train", "MPE"])

In [3]:
# Names of all entries in the cleaned bus/weather data folder.
# os.listdir returns them in arbitrary order, so sort to make the processing
# order (and the order of rows in the results file) reproducible between runs.
files = sorted(os.listdir("/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/"))
print(len(files))

253


In [4]:
def neural_net(file_name, route, direction, hs, al, l, mx, pickle_dir=None):
    """Train an MLP regressor on one route/direction file and log its accuracy.

    Only trips on the route's main subroute are used. For each trip the target
    JOURNEYTIME is the actual arrival time at the last stop minus the actual
    departure time at the first stop; the remaining (one-hot encoded) columns
    of the file are the features. The data is split 90/10 into train and test
    sets and one row of metrics is appended to
    'neural_network_random_forest_combo_results_final.csv'.

    Parameters
    ----------
    file_name : str
        Name of a CSV file inside the Master_Weather_Direction data folder.
    route : str
        Line id; compared (as a string) with the LINEID column of the
        subroutes file.
    direction : int
        Direction; compared (as an int) with the DIRECTION column of the
        subroutes file.
    hs : int or tuple of int
        Passed to MLPRegressor as ``hidden_layer_sizes``.
    al : float
        Passed to MLPRegressor as ``alpha``.
    l : float
        Passed to MLPRegressor as ``learning_rate_init``.
    mx : int
        Passed to MLPRegressor as ``max_iter``.
    pickle_dir : str, optional
        When given, the fitted model is pickled into this directory as
        ``<file_name without extension>.pickle``. By default nothing is
        pickled.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If the subroutes file lists no main subroute for this route/direction.
    ValueError
        If the file contains no rows for the main subroute.
    """
    # read in file
    # NOTE: only `sep` is given. Passing both `sep` and `delimiter` is rejected by
    # recent pandas versions, and older versions silently ignored `sep` anyway.
    data_dir = "/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/"
    df = pd.read_csv(data_dir + file_name, keep_default_na=True, sep=',', skipinitialspace=True)

    # find the main subroute to train on
    subroutes = pd.read_csv("~/Desktop/Trimester_3/My_work/routes_subroutes-04072020.csv", keep_default_na=True, sep=',', skipinitialspace=True)
    wanted = (subroutes.LINEID == str(route)) & (subroutes.DIRECTION == int(direction))
    subroute_list = list(subroutes.loc[wanted, "MAINROUTE"].unique())
    if not subroute_list:
        raise IndexError("no main subroute found for route %s direction %s" % (route, direction))
    # the stored value carries two wrapping characters at each end
    # (presumably a stringified list such as "['...']" - TODO confirm)
    main_subroute = subroute_list[0][2:-2]

    # limit the dataframe to only journeys on the main subroute
    main_route = df[df["ROUTEID"] == main_subroute]
    if main_route.empty:
        raise ValueError("%s has no rows for main subroute %r" % (file_name, main_subroute))

    # limit the dataframe to only the first and last stop
    last_progrnumber = main_route["PROGRNUMBER"].max()
    only_first = main_route[main_route["PROGRNUMBER"] == 1]
    only_last = main_route[main_route["PROGRNUMBER"] == int(last_progrnumber)]

    # rename the last-stop fields, then merge them onto the first-stop rows of the same trip
    for_merging = only_last[["PLANNEDTIME_ARR", "ACTUALTIME_ARR", "UNIQUE_TRIP", "DELAYARR", "DELAYDEP"]].rename(
        columns={
            "PLANNEDTIME_ARR": "PLANNEDTIME_ARR_LAST",
            "ACTUALTIME_ARR": "ACTUALTIME_ARR_LAST",
            "DELAYARR": "DELAYARR_LAST",
            "DELAYDEP": "DELAYDEP_LAST",
        }
    )
    result = pd.merge(only_first, for_merging, on='UNIQUE_TRIP', how='left')

    # drop any rows with missing values (explicit copy so the assignments below
    # operate on an independent frame)
    reduced_result = result.dropna().copy()

    # changing dtype of 'ACTUALTIME_ARR_LAST' to int
    reduced_result['ACTUALTIME_ARR_LAST'] = reduced_result['ACTUALTIME_ARR_LAST'].astype('int64')

    # creating journeytime features
    reduced_result['JOURNEYTIME'] = reduced_result["ACTUALTIME_ARR_LAST"] - reduced_result["ACTUALTIME_DEP"]
    reduced_result['PLANNED_JOURNEYTIME'] = reduced_result["PLANNEDTIME_ARR_LAST"] - reduced_result["PLANNEDTIME_DEP"]

    # drop columns that are no longer needed
    reduced_result = reduced_result.drop(columns=['date', 'timezone', 'DAYOFSERVICE','PROGRNUMBER','DELAYDEP', 'ROUTEID', 'DELAYARR', 'DIRECTION', 'UNIQUE_TRIP', 'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR_LAST', 'ACTUALTIME_ARR_LAST', 'DELAYARR_LAST', 'DELAYDEP_LAST', 'STOPPOINTID'])

    # keep only rows with a positive journey time (also keeps the MPE division below finite)
    reduced_result = reduced_result[reduced_result['JOURNEYTIME'] > 0]

    # prepare dataframe for modelling: categorical columns are one-hot encoded
    categorical_columns = ['weather_main', 'weather_description', 'MONTH', 'DAYOFWEEK', 'TIME_GROUP']
    reduced_result = reduced_result.astype({column: 'category' for column in categorical_columns})
    df_dummies = pd.get_dummies(reduced_result)

    # y is the target, X is everything else (the planned journey time is not used as a feature)
    y = df_dummies["JOURNEYTIME"]
    X = df_dummies.drop(columns=["JOURNEYTIME", "PLANNED_JOURNEYTIME"])

    # split into test/train split and reset the indices
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # train the neural network
    model = MLPRegressor(random_state=1, hidden_layer_sizes=hs, alpha=al, learning_rate_init=l, max_iter=mx).fit(X_train, y_train)

    # get predictions on training and test set
    neural_net_predictions_train = model.predict(X_train)
    neural_net_predictions_test = model.predict(X_test)

    # get results
    MAE_train = metrics.mean_absolute_error(y_train, neural_net_predictions_train)
    MAE_test = metrics.mean_absolute_error(y_test, neural_net_predictions_test)
    MSE_test = metrics.mean_squared_error(y_test, neural_net_predictions_test)
    RMSE_test = np.sqrt(MSE_test)
    R2_test = metrics.r2_score(y_test, neural_net_predictions_test)
    # mean signed relative error, as a fraction of the actual journey time
    MPE = np.mean((y_test - neural_net_predictions_test)/y_test)

    # append one row of metrics; column order matches the header of the results file
    with open('neural_network_random_forest_combo_results_final.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([route, direction, MAE_test, MSE_test, RMSE_test, R2_test, MAE_train, MPE])

    # optionally dump the fitted model to a pickle file
    if pickle_dir is not None:
        pickle_file_path = os.path.join(pickle_dir, os.path.splitext(str(file_name))[0] + ".pickle")
        with open(pickle_file_path, 'wb') as f:
            pickle.dump(model, f)
        print(pickle_file_path, " dumped")
    

In [5]:
def random_forest(file_name, route, direction, pickle_dir=None):
    """Train a random forest regressor on one route/direction file and log its accuracy.

    Only trips on the route's main subroute are used. For each trip the target
    JOURNEYTIME is the actual arrival time at the last stop minus the actual
    departure time at the first stop; the remaining (one-hot encoded) columns
    of the file are the features. The data is split 90/10 into train and test
    sets and one row of metrics is appended to
    'neural_network_random_forest_combo_results_final.csv'.

    Parameters
    ----------
    file_name : str
        Name of a CSV file inside the Master_Weather_Direction data folder.
    route : str
        Line id; compared (as a string) with the LINEID column of the
        subroutes file.
    direction : int
        Direction; compared (as an int) with the DIRECTION column of the
        subroutes file.
    pickle_dir : str, optional
        When given, the fitted model is pickled into this directory as
        ``<file_name without extension>.pickle``. By default nothing is
        pickled.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If the subroutes file lists no main subroute for this route/direction.
    ValueError
        If the file contains no rows for the main subroute.
    """
    # read in file
    # NOTE: only `sep` is given. Passing both `sep` and `delimiter` is rejected by
    # recent pandas versions, and older versions silently ignored `sep` anyway.
    data_dir = "/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/"
    df = pd.read_csv(data_dir + file_name, keep_default_na=True, sep=',', skipinitialspace=True)

    # find the main subroute to train on
    subroutes = pd.read_csv("~/Desktop/Trimester_3/My_work/routes_subroutes-04072020.csv", keep_default_na=True, sep=',', skipinitialspace=True)
    wanted = (subroutes.LINEID == str(route)) & (subroutes.DIRECTION == int(direction))
    subroute_list = list(subroutes.loc[wanted, "MAINROUTE"].unique())
    if not subroute_list:
        raise IndexError("no main subroute found for route %s direction %s" % (route, direction))
    # the stored value carries two wrapping characters at each end
    # (presumably a stringified list such as "['...']" - TODO confirm)
    main_subroute = subroute_list[0][2:-2]

    # limit the dataframe to only journeys on the main subroute
    main_route = df[df["ROUTEID"] == main_subroute]
    if main_route.empty:
        raise ValueError("%s has no rows for main subroute %r" % (file_name, main_subroute))

    # limit the dataframe to only the first and last stop
    last_progrnumber = main_route["PROGRNUMBER"].max()
    only_first = main_route[main_route["PROGRNUMBER"] == 1]
    only_last = main_route[main_route["PROGRNUMBER"] == int(last_progrnumber)]

    # rename the last-stop fields, then merge them onto the first-stop rows of the same trip
    for_merging = only_last[["PLANNEDTIME_ARR", "ACTUALTIME_ARR", "UNIQUE_TRIP", "DELAYARR", "DELAYDEP"]].rename(
        columns={
            "PLANNEDTIME_ARR": "PLANNEDTIME_ARR_LAST",
            "ACTUALTIME_ARR": "ACTUALTIME_ARR_LAST",
            "DELAYARR": "DELAYARR_LAST",
            "DELAYDEP": "DELAYDEP_LAST",
        }
    )
    result = pd.merge(only_first, for_merging, on='UNIQUE_TRIP', how='left')

    # drop any rows with missing values (explicit copy so the assignments below
    # operate on an independent frame)
    reduced_result = result.dropna().copy()

    # changing dtype of 'ACTUALTIME_ARR_LAST' to int
    reduced_result['ACTUALTIME_ARR_LAST'] = reduced_result['ACTUALTIME_ARR_LAST'].astype('int64')

    # creating journeytime features
    reduced_result['JOURNEYTIME'] = reduced_result["ACTUALTIME_ARR_LAST"] - reduced_result["ACTUALTIME_DEP"]
    reduced_result['PLANNED_JOURNEYTIME'] = reduced_result["PLANNEDTIME_ARR_LAST"] - reduced_result["PLANNEDTIME_DEP"]

    # drop columns that are no longer needed
    reduced_result = reduced_result.drop(columns=['date', 'timezone', 'DAYOFSERVICE','PROGRNUMBER','DELAYDEP', 'ROUTEID', 'DELAYARR', 'DIRECTION', 'UNIQUE_TRIP', 'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR_LAST', 'ACTUALTIME_ARR_LAST', 'DELAYARR_LAST', 'DELAYDEP_LAST', 'STOPPOINTID'])

    # keep only rows with a positive journey time.
    # This MUST happen before get_dummies: filtering afterwards leaves the
    # encoded frame untouched, so invalid rows would still be trained on and
    # would reach the MPE division below.
    reduced_result = reduced_result[reduced_result['JOURNEYTIME'] > 0]

    # prepare dataframe for modelling: categorical columns are one-hot encoded
    categorical_columns = ['weather_main', 'weather_description', 'MONTH', 'DAYOFWEEK', 'TIME_GROUP']
    reduced_result = reduced_result.astype({column: 'category' for column in categorical_columns})
    df_dummies = pd.get_dummies(reduced_result)

    # y is the target, X is everything else (the planned journey time is not used as a feature)
    y = df_dummies["JOURNEYTIME"]
    X = df_dummies.drop(columns=["JOURNEYTIME", "PLANNED_JOURNEYTIME"])

    # split into test/train split and reset the indices
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # train the random forest; seeded (same seed as the split) so reruns give the same metrics
    rf = RandomForestRegressor(random_state=1).fit(X_train, y_train)

    # get predictions on training and test set
    rf_predictions_train = rf.predict(X_train)
    rf_predictions_test = rf.predict(X_test)

    # get results
    MAE_train = metrics.mean_absolute_error(y_train, rf_predictions_train)
    MAE = metrics.mean_absolute_error(y_test, rf_predictions_test)
    MSE = metrics.mean_squared_error(y_test, rf_predictions_test)
    RMSE = np.sqrt(MSE)
    R2 = metrics.r2_score(y_test, rf_predictions_test)
    # mean signed relative error, as a fraction of the actual journey time
    MPE = np.mean((y_test - rf_predictions_test)/y_test)

    # append one row of metrics; column order matches the header of the results file
    with open('neural_network_random_forest_combo_results_final.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([route, direction, MAE, MSE, RMSE, R2, MAE_train, MPE])

    # optionally dump the fitted model to a pickle file
    if pickle_dir is not None:
        pickle_file_path = os.path.join(pickle_dir, os.path.splitext(str(file_name))[0] + ".pickle")
        with open(pickle_file_path, 'wb') as f:
            pickle.dump(rf, f)
        print(pickle_file_path, " dumped")
    

In [6]:
# Names of the data files that have been run through one of the two models.
we_have_a_pickle_for = list()

In [7]:
# Earlier results, loaded once up front (they do not change while looping).
# rf_df: random forest results per route ("Route" is "<route>_<direction>").
# nn_df: neural network parameter trials; a route may appear several times,
#        the row with the lowest MAE supplies the hyper-parameters.
# NOTE: only `sep` is given - recent pandas rejects `sep` together with `delimiter`.
rf_df = pd.read_csv("~/Desktop/random_forest_results_optimised_altered.csv", keep_default_na=True, sep=',', skipinitialspace=True)
nn_df = pd.read_csv("~/Desktop/Trimester_3/My_work/neural_networks_final_parameters.csv", keep_default_na=True, sep=',', skipinitialspace=True)
rf_routes = set(rf_df['Route'].unique())
nn_routes = set(nn_df['Route'].unique())

nn_count = 0
rf_count = 0
for file in files:
    if not str(file).endswith(".csv"):
        continue

    # file names start with "<route>_" and end with "<direction>.csv"
    route = file.split("_", 1)[0]
    direction = int(file[-5:-4])
    identity_string = str(route) + "_" + str(direction)

    # reset per file so that values from the previous route can never leak into this one
    rf_result = None
    nn_result = None

    if identity_string in rf_routes:
        rf_route_frame = rf_df[rf_df['Route'] == identity_string]
        rf_result = rf_route_frame['Mean_Absolute_Error'].iloc[0]
        print("RF result", rf_result)

    if identity_string in nn_routes:
        nn_route_frame = nn_df[nn_df['Route'] == identity_string]
        best_MAE = nn_route_frame['MAE'].min()
        best_combo_frame = nn_route_frame[nn_route_frame['MAE'] == best_MAE]
        nn_result = best_combo_frame['MAE'].iloc[0]
        print("NN result", nn_result)
        hs = best_combo_frame['HS'].iloc[0]
        al = best_combo_frame['A'].iloc[0]
        l = best_combo_frame['L'].iloc[0]
        mx = best_combo_frame['MX'].iloc[0]

    if rf_result is None or nn_result is None:
        # without both earlier results there is nothing to compare; leave the file
        # out of we_have_a_pickle_for so it is reported as unprocessed
        print(identity_string, "skipped: missing earlier RF and/or NN result")
        continue

    if nn_result <= rf_result:
        print ("NN result better")
        neural_net(file, route, direction, int(hs), float(al), float(l), int(mx))
        we_have_a_pickle_for.append(file)
        nn_count += 1
    else:
        print('RF Result better')
        random_forest(file, route, direction)
        we_have_a_pickle_for.append(file)
        rf_count += 1
            

RF result 397.09742149445486
NN result 399.5896498597048
RF Result better
RF result 391.6154122941658
NN result 371.2531721538581
NN result better
RF result 225.13400000000016
NN result 145.65028382523502
NN result better
RF result 261.83228366685944
NN result 249.42352281710444
NN result better
RF result 340.96340462184867
NN result 334.2794957247498
NN result better
RF result 465.7230128259317
NN result 430.75124776128325
NN result better
RF result 484.3757966288255
NN result 449.0435212589798
NN result better
RF result 331.03507246376813
NN result 363.38630404783027
RF Result better
RF result 344.2552687245848
NN result 321.44465857116836
NN result better
RF result 311.91354298375137
NN result 300.71642877458737
NN result better
RF result 211.24633701139967
NN result 208.5553968308156
NN result better
RF result 354.06284615611895
NN result 372.6598857054726
RF Result better
RF result 368.55149896157167
NN result 360.7089111889872
NN result better
RF result 245.25199978807663
NN resu

RF result 192.19939114391144
NN result 203.95676227593844
RF Result better
RF result 543.5300029799263
NN result 577.1040311321661
RF Result better
RF result 329.3174193548388
NN result 340.3276054634858
RF Result better
RF result 261.87219512195117
NN result 254.55320991521236
NN result better
RF result 66.32999999999993
NN result 65.94808433695471
NN result better
RF result 375.18627783289236
NN result 457.3578142287849
RF Result better
RF result 240.5597366030881
NN result 216.7412941477861
NN result better
RF result 453.2365
NN result 407.78275238240445
NN result better
RF result 296.48851548118336
NN result 317.2108801919502
RF Result better
RF result 308.08466365733034
NN result 343.35621014772096
RF Result better
RF result 301.98647700902706
NN result 295.324415202378
NN result better
RF result 382.22372338318286
NN result 372.9189852190429
NN result better
RF result 235.25087304217863
NN result 213.60762270952702
NN result better
RF result 261.81544983663133
NN result 254.01862

RF result 440.2067474048442
NN result 436.8544892276429
NN result better
RF result 154.39935643564357
NN result 157.7562069682649
RF Result better
RF result 588.9198113207548
NN result 552.784500893149
NN result better
RF result 176.937865493931
NN result 182.57313757056656
RF Result better
RF result 326.88044435149436
NN result 311.8786142924536
NN result better
RF result 306.4172962962963
NN result 322.520844339161
RF Result better
RF result 254.70251364300046
NN result 246.37612901171275
NN result better
RF result 360.5380594413285
NN result 335.7572397905461
NN result better
RF result 426.7535689739859
NN result 413.8335650958488
NN result better
RF result 180.764027339056
NN result 181.81253813454498
RF Result better
RF result 279.790808150818
NN result 265.56739543633864
NN result better
RF result 92.76374999999996
NN result 111.98106718435885
RF Result better
RF result 514.5427720913514
NN result 502.50382681671977
NN result better
RF result 375.4998616504855
NN result 384.57035

#### Check that every file in the data folder was processed (and would therefore have a pickle)

In [8]:
# Entries of `files` that were never added to `we_have_a_pickle_for`.
check_list = np.setdiff1d(ar1=files, ar2=we_have_a_pickle_for)

In [9]:
# .tolist() yields plain Python strings, so the displayed list stays readable
# (list() over the array yields NumPy scalars, shown as np.str_('...') by NumPy 2).
check_list.tolist()

['.DS_Store']

#### How many files were modelled with a random forest, and how many with a neural network

In [10]:
# number of files for which random_forest() was run in the loop above
rf_count

93

In [11]:
# number of files for which neural_net() was run in the loop above
nn_count

159