In [1]:
import pandas as pd
import pickle
import numpy as np
import os
import csv
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.tree import export_graphviz

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open('neural_network_results_random_state_experiment.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Route", "Direction", "Random State", "Mean_Absolute_Error", "Mean_Squared_Error", "Root_Mean_Squared_Error", "R2", "MAE_Train", "MPE"])

In [3]:
files = os.listdir("/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/")
files
print(len(files))

253


In [4]:
def neural_net(file_name, route, direction, hs, al, l, mx, rs):
    """Takes a file and trains a linear regression on that file, returns a pickle and writes accuracy to .csv """
    
    #read in file
    filepath = "/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/" + file_name
    df = pd.read_csv(filepath, keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)
    
    #find the main subroute to run linear regression on
    subroutes = pd.read_csv("~/Desktop/Trimester_3/My_work/routes_subroutes-04072020.csv", keep_default_na=True, sep=',\s+', delimiter=',', skipinitialspace=True)
    only_route = subroutes[subroutes.LINEID==str(route)]
    route_direction = only_route[only_route.DIRECTION==int(direction)]
    subroute_list = list(route_direction.MAINROUTE.unique())
    main_subroute = subroute_list[0][2:-2]
    
    #limit the dataframe to only journeys on the main subroute
    main_route = df[df["ROUTEID"] ==main_subroute]
    
    #limit the dataframe to only the first and last stop
    progrnumbers = list(main_route.PROGRNUMBER.unique())
    last_progrnumber = max(progrnumbers)
    only_first = main_route[main_route["PROGRNUMBER"]==1]
    only_last = main_route[main_route["PROGRNUMBER"]==int(last_progrnumber)]
    
    #renaming fields pertaining to the last stop before merging those fields into the dataframe for the first stop
    for_merging = only_last[["PLANNEDTIME_ARR", "ACTUALTIME_ARR", "UNIQUE_TRIP", "DELAYARR", "DELAYDEP"]]
    for_merging.columns = ["PLANNEDTIME_ARR_LAST", "ACTUALTIME_ARR_LAST", "UNIQUE_TRIP", "DELAYARR_LAST", "DELAYDEP_LAST"]
    
    result = pd.merge(only_first,
                        for_merging[["PLANNEDTIME_ARR_LAST", "ACTUALTIME_ARR_LAST", "UNIQUE_TRIP", "DELAYARR_LAST", "DELAYDEP_LAST"]],
                        on='UNIQUE_TRIP', 
                        how='left')
    
    #drop any empty rows
    reduced_result = result.dropna()
    
    #changing dtype of 'ACTUAL_ARR_LAST' to int
    reduced_result['ACTUALTIME_ARR_LAST'] = reduced_result['ACTUALTIME_ARR_LAST'].astype('int64')
    
    #creating journeytime feature
    reduced_result['JOURNEYTIME'] = reduced_result["ACTUALTIME_ARR_LAST"] - reduced_result["ACTUALTIME_DEP"]
    reduced_result['PLANNED_JOURNEYTIME'] = reduced_result["PLANNEDTIME_ARR_LAST"] - reduced_result["PLANNEDTIME_DEP"]
    
    
    #drop columns I no longer need
    reduced_result = reduced_result.drop(columns=['date', 'timezone', 'DAYOFSERVICE','PROGRNUMBER','DELAYDEP', 'ROUTEID', 'DELAYARR', 'DIRECTION', 'UNIQUE_TRIP', 'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR_LAST', 'ACTUALTIME_ARR_LAST', 'DELAYARR_LAST', 'DELAYDEP_LAST', 'STOPPOINTID'])
    
    #remove any rows where journey time is negative
    reduced_result = reduced_result[(reduced_result['JOURNEYTIME'] > 0)]
    
    #prepare dataframe for modelling
    reduced_result['weather_main'] = reduced_result['weather_main'].astype('category')
    reduced_result['weather_description'] = reduced_result['weather_description'].astype('category')
    reduced_result['MONTH'] = reduced_result['MONTH'].astype('category')
    reduced_result['DAYOFWEEK'] = reduced_result['DAYOFWEEK'].astype('category')
    reduced_result['TIME_GROUP'] = reduced_result['TIME_GROUP'].astype('category')
    df_dummies = pd.get_dummies(reduced_result)
    
    #split the dataset into target and the rest
    # y is the target
    y = df_dummies["JOURNEYTIME"]
    # X is everything else
    X = df_dummies.drop(["JOURNEYTIME", "PLANNED_JOURNEYTIME"],1)
    
    #split into test/train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,  test_size=0.1)
    #print(X_train.head())
    #print(X_train.columns)
    #reset indices
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    y_test.reset_index(drop=True, inplace=True)
    
    #train a linear regression model
    neural_net = MLPRegressor(random_state=rs, hidden_layer_sizes=hs, alpha=al, learning_rate_init=l, max_iter=mx).fit(X_train, y_train)
    
    #get predictions on training set
    neural_net_predictions_train = neural_net.predict(X_train)

    
    #get predictions on test set
    neural_net_predictions_test = neural_net.predict(X_test)
    
    #get results
    MAE_train = metrics.mean_absolute_error(y_train, neural_net_predictions_train)
    MAE_test =  metrics.mean_absolute_error(y_test, neural_net_predictions_test)
    MSE_test =  metrics.mean_squared_error(y_test, neural_net_predictions_test)
    RMSE_test = np.sqrt(metrics.mean_squared_error(y_test, neural_net_predictions_test))
    R2_test = metrics.r2_score(y_test, neural_net_predictions_test)
    MPE = np.mean((y_test - neural_net_predictions_test)/y_test)
    #negative value for MPE means the model is overestimating rather than underestimating
    print(MPE)
    with open('neural_network_results_random_state_experiment.csv', 'a', newline='') as file:
                    writer = csv.writer(file)
                    writer.writerow([route, direction, rs, MAE_test, MSE_test, RMSE_test, R2_test, MAE_train, MPE])
                

    
    #dump to pickle file
    #file_name = str(file_name)
    #foo = file_name[0:-4]
    #pickle_file = foo + ".pickle"
    #pickle_file_path = "/Users/laura/Desktop/pickles_neural_net/" + pickle_file
    
    #with open(pickle_file_path, 'wb') as f:
        #pickle.dump(neural_net, f)
        #print(pickle_file_path, " dumped")
    

In [5]:
#we_have_a_pickle_for = []

In [6]:
for file in files:
    if str(file)[-4:] == ".csv":
        route_split = file.split("_", 1)
        direction = int(file[-5:-4])
        route = route_split[0]
        identity_string = str(route) + "_" + str(direction)
        
        # the below file is a list of the optimum parameters for the neural net.
        # it contains duplicates for each route as various optimisations were trialled.
        df = pd.read_csv("~/Desktop/Trimester_3/My_work/neural_networks_final_parameters.csv", keep_default_na=True, sep=',\s+',delimiter=',', skipinitialspace=True)
        routes = list(df['Route'].unique())
        #print(df.head())
        #print(identity_string)
        if identity_string in routes:
            #we_have_a_pickle_for.append(file)
            print(route, direction, " in progress.")
            route_frame = df[df['Route']==identity_string]
            best_MAE = route_frame['MAE'].min()
            best_combo_frame = route_frame[route_frame['MAE']==best_MAE]
            hs = list(best_combo_frame['HS'].unique())[0]
            al = list(best_combo_frame['A'].unique())[0]
            l = list(best_combo_frame['L'].unique())[0]
            mx = list(best_combo_frame['MX'].unique())[0]
            print(hs, al, l, mx)
            for rs in range(1,9):
                neural_net(file, route, direction, int(hs), float(al), float(l), int(mx), rs)

39 2  in progress.
100 0.5 0.01 500
-0.003858926216173699
-0.012197241704502485
-0.0031896418554636474
-0.008481847504099175
-0.020802548413841385
-0.007261006698542187
0.004441139629188707
-0.0075520943498256395
9 2  in progress.
100 0.6 0.02 500
0.005068157304077101
-0.010682543869300173
-0.01729109263195215
-0.0010329028960607515
0.006772789747527706
-0.0233732502409045
0.01677349880768383
-0.02541112568568748
77X 2  in progress.
5 0.85 0.025 9000
-0.016041468913238606
-0.020149633317017034
-0.027734813440349955
-0.01529981186445536
-0.028779033466063104
-0.021407669936848863
-0.016044774153579834
-0.012045727139352683
40D 1  in progress.
10 0.6 0.01 500
-0.0023495244800765744
-0.018049120435544547
-0.004670069646929143
-0.008726179435904787
-0.01557082090349143
0.006549283396052684
-0.009480325194335486
-0.006391995022852711
47 1  in progress.
100 0.6 0.03 500
-0.004735664375173283
0.023552515334494546
0.020122893437032753
-0.013445785079230842
-0.009398451464190037
-0.036803371202

0.026006805636966186
0.02617012794560193
0.02524612331635022
0.024098733355034777
0.027337632524938437
0.026726607675460427
0.027055680825296996
40E 2  in progress.
100 0.6 0.03 500
-0.040290436442226574
-0.05867248794575157
-0.046757974464378474
-0.029314448618696375
-0.06720533789248108
-0.012508847513920389
-0.06686396249124606
-0.05695183769584895
40B 1  in progress.
20 0.9 0.005 900
-0.01084415131738261
-0.009807767922864722
-0.002131701727802832
-0.012435725754938167
-0.0010827579177370742
-0.015008160616524654
-0.010869035592980317
-0.006053827586205455
49 1  in progress.
250 0.8 0.005 700
-0.004849545569134072
-0.024667330530484706
-0.01941001985674105
-0.003099613896082523
-0.022548514674383933
-0.04321437749540033
-0.017783710282131156
-0.014277783352345917
66X 2  in progress.
5 0.82 0.00655 9114
-0.0010720839913138916
-0.0019345528022361588
-0.0025377143308106262
-0.002895999760291951
-0.002312369612829653
-0.003460073387630114
-0.0027378941098978814
-0.005998313550853567
41

0.06791996884848911
-0.02883501618224633
-0.04221500086213678
0.017143162477794686
0.0437924311445383
-0.024105795548697143
-0.05271035886371611
39X 1  in progress.
35 0.9452024999999998 0.009850500000000002 3234
-0.007384934620154297
0.0010882195514034009
-0.010344758469825657
-0.0040455778313447386
-0.023698139613488385
-0.007351015686673971
-0.006907040073579711
-0.0013906106331908286
68X 2  in progress.
40 0.85 0.005 9000
-0.018071764011264296
-0.03997343432483149
-0.061631098782586775
-0.058746421969751626
-0.06331068939967692
-0.029862048505335886
-0.016064387545756093
-0.04101836874212255
120 2  in progress.
100 0.8 0.01 200
-0.010009483868980118
-0.0298128269435459
-0.017836392836016766
-0.02632322101918316
-0.017121644455447227
0.024162222285003292
-0.027894470551918467
-0.014003064987705514
32 1  in progress.
75 0.4 0.035 300
-0.002859875457443242
0.017895622471605567
-0.013627912344231838
-0.023872216365941137
-0.023180898534931587
-0.00957974424953395
-0.04797422314677465
-

0.0016136504188926138
-0.021186953259884768
-0.051960346741276434
-0.026643040383235832
-0.05431700427851297
-0.023220522768495275
-0.024658162648028434
-0.04251042171095642
26 2  in progress.
75 0.6 0.025 300
-0.009105837182128155
-0.04597650333039547
-0.06001170275075358
-0.06702910910917234
-0.09872269830089958
-0.02971382576468975
-0.04740266194478845
0.005831644222209137
33 2  in progress.
80 0.4 0.005 3000
-0.01297145184487331
-0.01476741638464817
-0.011056817565568226
-0.01180156998472208
-0.014241001934831268
-0.016213052832609234
-0.011139015060263135
-0.015358265001947197
33B 2  in progress.
125 0.4 0.005 700
-0.05244652965505915
-0.02568013366472595
-0.030134504643346608
-0.059730106172628854
-0.06458394442255594
-0.004825163477961881
-0.026123238099189212
-0.08473279407325625
142 2  in progress.
55 0.8 0.025 300
-0.010360537622107828
-0.001111541410809532
0.0035261678321975693
-0.028892575465103383
-0.010188455679925178
-0.019513758232913327
0.011439995964986712
0.002483556

-0.005793508620029455
0.025692011177468062
0.03623031977941908
-0.036117157228055474
-0.05779798861744846
-0.016787917255218436
0.059087685505282414
0.0012914776028136606
65 2  in progress.
200 0.8 0.03 300
-0.024729782159181233
-0.06979298475581786
-0.0014084080482635827
-0.0289985419842612
-0.039253705823446894
-0.007324526297459754
-0.06823361247164993
0.026330261449133995
25X 1  in progress.
77 0.3755684999999999 0.042420000000000006 3234
-0.0025730509785902057
0.04142937298508318
0.0004617367811860397
0.021317508149236837
0.01623312276105475
-0.004490631312655707
0.020194475518439874
0.006262041609404911
44B 2  in progress.
150 0.9 0.03 300
0.012451561816893325
0.06124372570912455
0.030810016700726795
-0.02598744646066939
0.05514183524044976
0.02783174266544927
0.03775504033403325
0.008909768563987957
15B 1  in progress.
30 0.9 0.005 300
-0.001492443667891128
-0.02509876203583213
-0.025953491395011023
-0.013245361444041558
-0.027452052749543224
-0.019920232518016924
-0.00955861594

0.023851944257346356
0.019996216458851985
0.03481042748135601
0.022512128901699867
0.022955913286023706
0.02359966754482723
0.01801621541516915
0.024173603224959178
67 2  in progress.
100 0.4 0.035 300
0.00028931903077590495
0.007793627930699132
0.009630177201714196
-0.017184522488911846
-0.0557854655456409
0.0006327327052098113
-0.02931271408416921
-0.03975325065274609
239 2  in progress.
150 0.4 0.005 300
-0.042550020611237103
-0.029852932728090857
-0.05864064369991807
-0.060374788190915324
-0.054607110587347116
-0.06304519397867973
-0.058288362005878475
-0.008855349885537428
75 1  in progress.
75 0.5 0.035 300
0.0005575914641314706
-0.005918732649636287
-0.002147110697433557
-0.020681082115205463
0.013018243072459516
-0.024955246402447968
0.014285306078544469
0.0353353226991361
16 2  in progress.
30 0.4 0.025 300
-0.007846114727179328
-0.010848460476161469
-0.022241242249936125
-0.03774331907134239
-0.018463836270686606
-0.015788305411619387
-0.013594354847799221
-0.0281150412473306

-0.012091021344853951
-0.0005710226480335487
-0.019767462951740405
-0.027674991305049287
-0.01180283434130751
14C 1  in progress.
20 0.9 0.025 700
0.005710346928776743
-0.013662394269127983
0.017191965513520212
-0.0023842775869934996
-0.04633173111113837
-0.03477850912758383
0.011906574884454806
0.005075044690309457
38B 2  in progress.
55 0.8 0.005 700
-0.016883743098150338
-0.03576104677012
-0.0302417192102726
-0.025347807877615817
-0.02354075703166215
-0.03461702626612068
-0.03126433819837951
-0.02766835163844069
84 1  in progress.
75 0.9 0.025 700
-0.010885885959051265
-0.0009326333305210794
-0.003517880276277315
0.012497441823693027
0.038309750822325066
-0.005951863836538513
-0.011599469163099013
0.007464200711186012
184 1  in progress.
55 0.5 0.02 300
0.0009081784046656795
-0.008724624307981632
-0.027635794940903312
0.0051954896522439665
-0.0014417586636150663
-0.02303082684302551
-0.02947324963640782
-0.0033348387043402924
18 2  in progress.
200 0.9 0.015 300
-0.00554458444928417

#### check that all of the files have a pickle

In [7]:
check_list = np.setdiff1d(files,we_have_a_pickle_for)

NameError: name 'we_have_a_pickle_for' is not defined

In [None]:
list(check_list)