In [1]:
import csv
import functools
import itertools
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Start the results file with a header row, but only when the file does not
# exist yet or is empty. The file is opened in append mode so earlier results
# survive a resumed run; writing the header unconditionally would insert a
# duplicate header row in the middle of the data every time this cell is re-run.
results_path = 'Amazon_4th_Batch_32-64.csv'
if not os.path.exists(results_path) or os.path.getsize(results_path) == 0:
    with open(results_path, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Route", "combo", "Mean_Absolute_Error", "Mean_Squared_Error", "Root_Mean_Squared_Error", "R2"])

In [3]:
# Collect the name of every entry in the master weather/direction data folder
# and report how many were found.
files = os.listdir(
    "/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/"
)
print(len(files))

253


In [4]:
# Routes whose hyper-parameter search still has to be refined.
# Each entry is [key, score, params]:
#   key    - "<route>_<direction>", matched against the data file names
#   score  - a float that is not read anywhere in the visible code
#            (presumably the best error found so far - TODO confirm)
#   params - starting point as strings, parsed by the search loop as
#            [hidden size, alpha, initial learning rate, max iterations]
unfinished_files = [
    ['142_1', 419.2629784316811, ['250', '0.9', '0.025', '700']],
    ['65_2', 758.2412786755805, ['200', '0.8', '0.03', '300']],
    ['84A_1', 363.3863040478304, ['75', '0.5', '0.03', '300']],
    ['33B_2', 317.2108801919502, ['125', '0.4', '0.005', '700']],
]

In [5]:
@functools.lru_cache(maxsize=1)
def _load_route_split(file_name, route, direction):
    """Read one route/direction file and build its train/test split.

    The result depends only on the three arguments, while ``neural_net`` is
    called once per hyper-parameter combination for the same file. Caching the
    most recent split avoids re-reading and re-processing both CSV files on
    every call. The cache assumes the files on disk do not change while the
    search is running.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the master weather/direction data folder.
    route : str
        Line identifier, compared against the ``LINEID`` column as a string.
    direction : int
        Direction, compared against the ``DIRECTION`` column as an int.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with indices reset to 0..n-1.

    Raises
    ------
    ValueError
        If no main subroute is listed for the route/direction, or the file
        contains no journeys on that subroute.
    """
    # read in file
    # Only one of sep/delimiter may be given to read_csv; older pandas let
    # delimiter=',' silently override sep, so a plain comma is the separator
    # that was actually in effect.
    filepath = "/Users/laura/Desktop/Trimester_3/My_work/Cleaned_Bus_Weather_Data/Master_Weather_Direction/" + file_name
    df = pd.read_csv(filepath, keep_default_na=True, sep=',', skipinitialspace=True)

    # find the main subroute to train on
    subroutes = pd.read_csv("~/Desktop/Trimester_3/My_work/routes_subroutes-04072020.csv", keep_default_na=True, sep=',', skipinitialspace=True)
    route_direction = subroutes[(subroutes.LINEID == str(route)) & (subroutes.DIRECTION == int(direction))]
    subroute_list = list(route_direction.MAINROUTE.unique())
    if not subroute_list:
        raise ValueError("No main subroute listed for route %s direction %s" % (route, direction))
    # The slice drops two characters from each end of the stored value
    # (presumably the "['" / "']" of a stringified list - TODO confirm).
    main_subroute = subroute_list[0][2:-2]

    # limit the dataframe to only journeys on the main subroute
    main_route = df[df["ROUTEID"] == main_subroute]
    if main_route.empty:
        raise ValueError("No journeys on subroute %s in %s" % (main_subroute, file_name))

    # limit the dataframe to only the first and last stop
    last_progrnumber = main_route["PROGRNUMBER"].max()
    only_first = main_route[main_route["PROGRNUMBER"] == 1]
    only_last = main_route[main_route["PROGRNUMBER"] == int(last_progrnumber)]

    # rename the fields of the last stop, then merge them onto the first-stop rows
    for_merging = only_last[["PLANNEDTIME_ARR", "ACTUALTIME_ARR", "UNIQUE_TRIP", "DELAYARR", "DELAYDEP"]].rename(
        columns={
            "PLANNEDTIME_ARR": "PLANNEDTIME_ARR_LAST",
            "ACTUALTIME_ARR": "ACTUALTIME_ARR_LAST",
            "DELAYARR": "DELAYARR_LAST",
            "DELAYDEP": "DELAYDEP_LAST",
        }
    )
    result = pd.merge(only_first, for_merging, on='UNIQUE_TRIP', how='left')

    # drop any rows with missing values; copy so the column assignments below
    # act on an independent frame rather than on a view of `result`
    reduced_result = result.dropna().copy()

    # changing dtype of 'ACTUALTIME_ARR_LAST' to int
    reduced_result['ACTUALTIME_ARR_LAST'] = reduced_result['ACTUALTIME_ARR_LAST'].astype('int64')

    # creating journeytime features
    reduced_result['JOURNEYTIME'] = reduced_result["ACTUALTIME_ARR_LAST"] - reduced_result["ACTUALTIME_DEP"]
    reduced_result['PLANNED_JOURNEYTIME'] = reduced_result["PLANNEDTIME_ARR_LAST"] - reduced_result["PLANNEDTIME_DEP"]

    # drop columns that are no longer needed and keep only positive journey times
    reduced_result = reduced_result.drop(columns=['date', 'timezone', 'DAYOFSERVICE','PROGRNUMBER','DELAYDEP', 'ROUTEID', 'DELAYARR', 'DIRECTION', 'UNIQUE_TRIP', 'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR_LAST', 'ACTUALTIME_ARR_LAST', 'DELAYARR_LAST', 'DELAYDEP_LAST', 'STOPPOINTID'])
    reduced_result = reduced_result[reduced_result['JOURNEYTIME'] > 0]

    # prepare dataframe for modelling: categorical columns become dummy columns
    categorical_columns = ['weather_main', 'weather_description', 'MONTH', 'DAYOFWEEK', 'TIME_GROUP']
    reduced_result = reduced_result.astype({column: 'category' for column in categorical_columns})
    df_dummies = pd.get_dummies(reduced_result)

    # y is the target, X is everything else except the planned journey time
    y = df_dummies["JOURNEYTIME"]
    X = df_dummies.drop(columns=["JOURNEYTIME", 'PLANNED_JOURNEYTIME'])

    # split into test/train split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.1)

    # reset indices
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    return X_train, X_test, y_train, y_test


def neural_net(file_name, route, direction, hidden_size, al, l, max_iterations):
    """Train an MLPRegressor on one route/direction file and log its test metrics.

    The journey time between the first and last stop of the route's main
    subroute is the target. The model is evaluated on a 10% hold-out split and
    one row ``[route_direction, combo, MAE, MSE, RMSE, R2]`` is appended to
    'Amazon_4th_Batch_32-64.csv'.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the master weather/direction data folder.
    route : str
        Line identifier of the route.
    direction : int
        Direction of the route.
    hidden_size : int or tuple
        Passed to MLPRegressor as ``hidden_layer_sizes``.
    al : float
        Passed to MLPRegressor as ``alpha``.
    l : float
        Passed to MLPRegressor as ``learning_rate_init``.
    max_iterations : int
        Passed to MLPRegressor as ``max_iter``.

    Returns
    -------
    None
        The only outputs are the appended CSV row (model pickling is disabled).

    Raises
    ------
    ValueError
        If the route has no main subroute or the file has no matching journeys.
    """
    X_train, X_test, y_train, y_test = _load_route_split(file_name, route, direction)

    # train the neural network
    model = MLPRegressor(random_state=1, hidden_layer_sizes=hidden_size, alpha=al, learning_rate_init=l, max_iter=max_iterations).fit(X_train, y_train)

    # get predictions on test set
    test_predictions = model.predict(X_test)

    # get results
    MAE = metrics.mean_absolute_error(y_test, test_predictions)
    MSE = metrics.mean_squared_error(y_test, test_predictions)
    RMSE = np.sqrt(MSE)
    R2 = metrics.r2_score(y_test, test_predictions)

    combo_string = str(hidden_size) + "_" + str(al) + "_" + str(l) + "_" + str(max_iterations)
    route_direction_string = str(route) + "_" + str(direction)
    with open('Amazon_4th_Batch_32-64.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([route_direction_string, combo_string, MAE, MSE, RMSE, R2])

    # Model persistence is currently disabled; to re-enable, dump to a pickle file:
    #pickle_file_path = "/Users/laura/Desktop/pickles_new/" + str(file_name)[0:-4] + ".pickle"
    #with open(pickle_file_path, 'wb') as f:
        #pickle.dump(model, f)
        #print(pickle_file_path, " dumped")
    

In [6]:
#hidden_size = [4]
#hidden_size = [100]
#alpha = [0.24, 0.25, 0.26]
#alpha = [0.6]
#lamb = [0.01887, 0.01888, 0.01889, 0.0189]
#lamb = [0.03]
#max_iterations = [4855, 4865, 4875]

In [7]:
# Refine the hyper-parameter search around the starting point recorded in
# `unfinished_files` for each matching route/direction data file.
# File names are expected to start with "<route>_" and to end with
# "<direction>.csv", where direction is a single digit.
for file in files:
    if not str(file).endswith(".csv"):
        continue
    route = file.split("_", 1)[0]
    direction = int(file[-5:-4])
    string = str(route) + "_" + str(direction)
    print(route, direction, " in progress.")
    for item in unfinished_files:
        if string != item[0]:
            continue
        print(item[2])
        hs_start = int(item[2][0])
        al_start = float(item[2][1])
        l_start = float(item[2][2])
        mx_start = int(item[2][3])

        # Hidden size: +/-1 around small starting values, +/-2 otherwise.
        # Sizes below 1 are dropped because a layer needs at least one unit.
        hs_offsets = (-1, 0, 1) if hs_start <= 10 else (-2, -1, 0, 1, 2)
        hidden_size = [hs_start + offset for offset in hs_offsets if hs_start + offset >= 1]

        # Remaining parameters: +/-0.5% and +/-1% around the starting value.
        # dict.fromkeys removes repeated values while keeping their order;
        # integer truncation can otherwise yield the same max_iter twice for
        # small starting values and trigger identical, redundant training runs.
        alpha = list(dict.fromkeys([al_start*0.99, al_start*0.995, al_start, al_start*1.005, al_start*1.01]))
        lamb = list(dict.fromkeys([l_start*0.99, l_start*0.995, l_start, l_start*1.005, l_start*1.01]))
        max_iterations = list(dict.fromkeys([int(mx_start*0.99), int(mx_start*0.995), mx_start, int(mx_start*1.005), int(mx_start*1.01)]))

        print(hs_start, al_start, l_start, mx_start)
        # product() varies the right-most list fastest, i.e. the same order as
        # four nested loops over hidden_size, alpha, lamb, max_iterations.
        for hs, al, lr, mx in itertools.product(hidden_size, alpha, lamb, max_iterations):
            print(hs, al, lr, mx)
            neural_net(file, route, direction, hs, al, lr, mx)
                            

39 2  in progress.
9 2  in progress.
77X 2  in progress.
40D 1  in progress.
47 1  in progress.
40 2  in progress.
140 2  in progress.
84A 1  in progress.
['75', '0.5', '0.03', '300']
75 0.5 0.03 300
73 0.495 0.029699999999999997 297
73 0.495 0.029699999999999997 298
73 0.495 0.029699999999999997 300
73 0.495 0.029699999999999997 301
73 0.495 0.029699999999999997 303
73 0.495 0.029849999999999998 297
73 0.495 0.029849999999999998 298
73 0.495 0.029849999999999998 300
73 0.495 0.029849999999999998 301
73 0.495 0.029849999999999998 303
73 0.495 0.03 297
73 0.495 0.03 298
73 0.495 0.03 300
73 0.495 0.03 301
73 0.495 0.03 303
73 0.495 0.030149999999999996 297
73 0.495 0.030149999999999996 298
73 0.495 0.030149999999999996 300
73 0.495 0.030149999999999996 301
73 0.495 0.030149999999999996 303
73 0.495 0.0303 297
73 0.495 0.0303 298
73 0.495 0.0303 300
73 0.495 0.0303 301
73 0.495 0.0303 303
73 0.4975 0.029699999999999997 297
73 0.4975 0.029699999999999997 298
73 0.4975 0.029699999999999997

75 0.4975 0.029849999999999998 303
75 0.4975 0.03 297
75 0.4975 0.03 298
75 0.4975 0.03 300
75 0.4975 0.03 301
75 0.4975 0.03 303
75 0.4975 0.030149999999999996 297
75 0.4975 0.030149999999999996 298
75 0.4975 0.030149999999999996 300
75 0.4975 0.030149999999999996 301
75 0.4975 0.030149999999999996 303
75 0.4975 0.0303 297
75 0.4975 0.0303 298
75 0.4975 0.0303 300
75 0.4975 0.0303 301
75 0.4975 0.0303 303
75 0.5 0.029699999999999997 297
75 0.5 0.029699999999999997 298
75 0.5 0.029699999999999997 300
75 0.5 0.029699999999999997 301
75 0.5 0.029699999999999997 303
75 0.5 0.029849999999999998 297
75 0.5 0.029849999999999998 298
75 0.5 0.029849999999999998 300
75 0.5 0.029849999999999998 301
75 0.5 0.029849999999999998 303
75 0.5 0.03 297
75 0.5 0.03 298
75 0.5 0.03 300
75 0.5 0.03 301
75 0.5 0.03 303
75 0.5 0.030149999999999996 297
75 0.5 0.030149999999999996 298
75 0.5 0.030149999999999996 300
75 0.5 0.030149999999999996 301
75 0.5 0.030149999999999996 303
75 0.5 0.0303 297
75 0.5 0.030

77 0.5025 0.029699999999999997 303
77 0.5025 0.029849999999999998 297
77 0.5025 0.029849999999999998 298
77 0.5025 0.029849999999999998 300
77 0.5025 0.029849999999999998 301
77 0.5025 0.029849999999999998 303
77 0.5025 0.03 297
77 0.5025 0.03 298
77 0.5025 0.03 300
77 0.5025 0.03 301
77 0.5025 0.03 303
77 0.5025 0.030149999999999996 297
77 0.5025 0.030149999999999996 298
77 0.5025 0.030149999999999996 300
77 0.5025 0.030149999999999996 301
77 0.5025 0.030149999999999996 303
77 0.5025 0.0303 297
77 0.5025 0.0303 298
77 0.5025 0.0303 300
77 0.5025 0.0303 301
77 0.5025 0.0303 303
77 0.505 0.029699999999999997 297
77 0.505 0.029699999999999997 298
77 0.505 0.029699999999999997 300
77 0.505 0.029699999999999997 301
77 0.505 0.029699999999999997 303
77 0.505 0.029849999999999998 297
77 0.505 0.029849999999999998 298
77 0.505 0.029849999999999998 300
77 0.505 0.029849999999999998 301
77 0.505 0.029849999999999998 303
77 0.505 0.03 297
77 0.505 0.03 298
77 0.505 0.03 300
77 0.505 0.03 301
77 

249 0.9045 0.02475 707
249 0.9045 0.024875 693
249 0.9045 0.024875 696
249 0.9045 0.024875 700
249 0.9045 0.024875 703
249 0.9045 0.024875 707
249 0.9045 0.025 693
249 0.9045 0.025 696
249 0.9045 0.025 700
249 0.9045 0.025 703
249 0.9045 0.025 707
249 0.9045 0.025124999999999998 693
249 0.9045 0.025124999999999998 696
249 0.9045 0.025124999999999998 700
249 0.9045 0.025124999999999998 703
249 0.9045 0.025124999999999998 707
249 0.9045 0.02525 693
249 0.9045 0.02525 696
249 0.9045 0.02525 700
249 0.9045 0.02525 703
249 0.9045 0.02525 707
249 0.909 0.02475 693
249 0.909 0.02475 696
249 0.909 0.02475 700
249 0.909 0.02475 703
249 0.909 0.02475 707
249 0.909 0.024875 693
249 0.909 0.024875 696
249 0.909 0.024875 700
249 0.909 0.024875 703
249 0.909 0.024875 707
249 0.909 0.025 693
249 0.909 0.025 696
249 0.909 0.025 700
249 0.909 0.025 703
249 0.909 0.025 707
249 0.909 0.025124999999999998 693
249 0.909 0.025124999999999998 696
249 0.909 0.025124999999999998 700
249 0.909 0.025124999999999

252 0.8955 0.025124999999999998 693
252 0.8955 0.025124999999999998 696
252 0.8955 0.025124999999999998 700
252 0.8955 0.025124999999999998 703
252 0.8955 0.025124999999999998 707
252 0.8955 0.02525 693
252 0.8955 0.02525 696
252 0.8955 0.02525 700
252 0.8955 0.02525 703
252 0.8955 0.02525 707
252 0.9 0.02475 693
252 0.9 0.02475 696
252 0.9 0.02475 700
252 0.9 0.02475 703
252 0.9 0.02475 707
252 0.9 0.024875 693
252 0.9 0.024875 696
252 0.9 0.024875 700
252 0.9 0.024875 703
252 0.9 0.024875 707
252 0.9 0.025 693
252 0.9 0.025 696
252 0.9 0.025 700
252 0.9 0.025 703
252 0.9 0.025 707
252 0.9 0.025124999999999998 693
252 0.9 0.025124999999999998 696
252 0.9 0.025124999999999998 700
252 0.9 0.025124999999999998 703
252 0.9 0.025124999999999998 707
252 0.9 0.02525 693
252 0.9 0.02525 696
252 0.9 0.02525 700
252 0.9 0.02525 703
252 0.9 0.02525 707
252 0.9045 0.02475 693
252 0.9045 0.02475 696
252 0.9045 0.02475 700
252 0.9045 0.02475 703
252 0.9045 0.02475 707
252 0.9045 0.024875 693
252 0.

124 0.404 0.004975 707
124 0.404 0.005 693
124 0.404 0.005 696
124 0.404 0.005 700
124 0.404 0.005 703
124 0.404 0.005 707
124 0.404 0.005025 693
124 0.404 0.005025 696
124 0.404 0.005025 700
124 0.404 0.005025 703
124 0.404 0.005025 707
124 0.404 0.00505 693
124 0.404 0.00505 696
124 0.404 0.00505 700
124 0.404 0.00505 703
124 0.404 0.00505 707
125 0.396 0.00495 693
125 0.396 0.00495 696
125 0.396 0.00495 700
125 0.396 0.00495 703
125 0.396 0.00495 707
125 0.396 0.004975 693
125 0.396 0.004975 696
125 0.396 0.004975 700
125 0.396 0.004975 703
125 0.396 0.004975 707
125 0.396 0.005 693
125 0.396 0.005 696
125 0.396 0.005 700
125 0.396 0.005 703
125 0.396 0.005 707
125 0.396 0.005025 693
125 0.396 0.005025 696
125 0.396 0.005025 700
125 0.396 0.005025 703
125 0.396 0.005025 707
125 0.396 0.00505 693
125 0.396 0.00505 696
125 0.396 0.00505 700
125 0.396 0.00505 703
125 0.396 0.00505 707
125 0.398 0.00495 693
125 0.398 0.00495 696
125 0.398 0.00495 700
125 0.398 0.00495 703
125 0.398 0.00

127 0.40199999999999997 0.004975 693
127 0.40199999999999997 0.004975 696
127 0.40199999999999997 0.004975 700
127 0.40199999999999997 0.004975 703
127 0.40199999999999997 0.004975 707
127 0.40199999999999997 0.005 693
127 0.40199999999999997 0.005 696
127 0.40199999999999997 0.005 700
127 0.40199999999999997 0.005 703
127 0.40199999999999997 0.005 707
127 0.40199999999999997 0.005025 693
127 0.40199999999999997 0.005025 696
127 0.40199999999999997 0.005025 700
127 0.40199999999999997 0.005025 703
127 0.40199999999999997 0.005025 707
127 0.40199999999999997 0.00505 693
127 0.40199999999999997 0.00505 696
127 0.40199999999999997 0.00505 700
127 0.40199999999999997 0.00505 703
127 0.40199999999999997 0.00505 707
127 0.404 0.00495 693
127 0.404 0.00495 696
127 0.404 0.00495 700
127 0.404 0.00495 703
127 0.404 0.00495 707
127 0.404 0.004975 693
127 0.404 0.004975 696
127 0.404 0.004975 700
127 0.404 0.004975 703
127 0.404 0.004975 707
127 0.404 0.005 693
127 0.404 0.005 696
127 0.404 0.005

199 0.8039999999999999 0.029699999999999997 301
199 0.8039999999999999 0.029699999999999997 303
199 0.8039999999999999 0.029849999999999998 297
199 0.8039999999999999 0.029849999999999998 298
199 0.8039999999999999 0.029849999999999998 300
199 0.8039999999999999 0.029849999999999998 301
199 0.8039999999999999 0.029849999999999998 303
199 0.8039999999999999 0.03 297
199 0.8039999999999999 0.03 298
199 0.8039999999999999 0.03 300
199 0.8039999999999999 0.03 301
199 0.8039999999999999 0.03 303
199 0.8039999999999999 0.030149999999999996 297
199 0.8039999999999999 0.030149999999999996 298
199 0.8039999999999999 0.030149999999999996 300
199 0.8039999999999999 0.030149999999999996 301
199 0.8039999999999999 0.030149999999999996 303
199 0.8039999999999999 0.0303 297
199 0.8039999999999999 0.0303 298
199 0.8039999999999999 0.0303 300
199 0.8039999999999999 0.0303 301
199 0.8039999999999999 0.0303 303
199 0.808 0.029699999999999997 297
199 0.808 0.029699999999999997 298
199 0.808 0.029699999999

201 0.8039999999999999 0.03 300
201 0.8039999999999999 0.03 301
201 0.8039999999999999 0.03 303
201 0.8039999999999999 0.030149999999999996 297
201 0.8039999999999999 0.030149999999999996 298
201 0.8039999999999999 0.030149999999999996 300
201 0.8039999999999999 0.030149999999999996 301
201 0.8039999999999999 0.030149999999999996 303
201 0.8039999999999999 0.0303 297
201 0.8039999999999999 0.0303 298
201 0.8039999999999999 0.0303 300
201 0.8039999999999999 0.0303 301
201 0.8039999999999999 0.0303 303
201 0.808 0.029699999999999997 297
201 0.808 0.029699999999999997 298
201 0.808 0.029699999999999997 300
201 0.808 0.029699999999999997 301
201 0.808 0.029699999999999997 303
201 0.808 0.029849999999999998 297
201 0.808 0.029849999999999998 298
201 0.808 0.029849999999999998 300
201 0.808 0.029849999999999998 301
201 0.808 0.029849999999999998 303
201 0.808 0.03 297
201 0.808 0.03 298
201 0.808 0.03 300
201 0.808 0.03 301
201 0.808 0.03 303
201 0.808 0.030149999999999996 297
201 0.808 0.03