In [1]:
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os.path
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from NJCleaner import NJCleaner
from DecisionTreeClassifier import DecisionTreeClassifier

In [2]:
work_data = pd.read_csv("data/2018_03.csv", header=0)
work_data.shape

(256508, 13)

In [3]:
work_data.head(20)

Unnamed: 0,date,train_id,stop_sequence,from,from_id,to,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
0,2018-03-01,3805,1.0,New York Penn Station,105,New York Penn Station,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.0,departed,Northeast Corrdr,NJ Transit
1,2018-03-01,3805,2.0,New York Penn Station,105,Secaucus Upper Lvl,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit
2,2018-03-01,3805,3.0,Secaucus Upper Lvl,38187,Newark Penn Station,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit
3,2018-03-01,3805,4.0,Newark Penn Station,107,Newark Airport,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit
4,2018-03-01,3805,5.0,Newark Airport,37953,North Elizabeth,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit
5,2018-03-01,3805,6.0,North Elizabeth,109,Elizabeth,41,2018-03-02 01:52:00,2018-03-02 01:52:01,0.016667,departed,Northeast Corrdr,NJ Transit
6,2018-03-01,3805,7.0,Elizabeth,41,Linden,70,2018-03-02 01:58:00,2018-03-02 01:58:05,0.083333,departed,Northeast Corrdr,NJ Transit
7,2018-03-01,3805,8.0,Linden,70,Rahway,127,2018-03-02 02:02:00,2018-03-02 02:01:03,0.0,departed,Northeast Corrdr,NJ Transit
8,2018-03-01,3805,9.0,Rahway,127,Metropark,83,2018-03-02 02:08:00,2018-03-02 02:08:00,0.0,departed,Northeast Corrdr,NJ Transit
9,2018-03-01,3805,10.0,Metropark,83,Metuchen,84,2018-03-02 02:13:00,2018-03-02 02:13:10,0.166667,departed,Northeast Corrdr,NJ Transit


In [4]:
work_data.sort_values(by = ['scheduled_time'])
work_data.head()

Unnamed: 0,date,train_id,stop_sequence,from,from_id,to,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
0,2018-03-01,3805,1.0,New York Penn Station,105,New York Penn Station,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.0,departed,Northeast Corrdr,NJ Transit
1,2018-03-01,3805,2.0,New York Penn Station,105,Secaucus Upper Lvl,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit
2,2018-03-01,3805,3.0,Secaucus Upper Lvl,38187,Newark Penn Station,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit
3,2018-03-01,3805,4.0,Newark Penn Station,107,Newark Airport,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit
4,2018-03-01,3805,5.0,Newark Airport,37953,North Elizabeth,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit


In [5]:
to_drop = ['from', 'to']
work_data = work_data.drop(columns=to_drop)
work_data = work_data.dropna()
work_data.head()

Unnamed: 0,date,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type
0,2018-03-01,3805,1.0,105,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.0,departed,Northeast Corrdr,NJ Transit
1,2018-03-01,3805,2.0,105,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit
2,2018-03-01,3805,3.0,38187,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit
3,2018-03-01,3805,4.0,107,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit
4,2018-03-01,3805,5.0,37953,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit


In [6]:
work_data.shape

(243028, 11)

In [7]:
def convert_date_to_day(inputdata: pd.DataFrame) -> pd.DataFrame:
    inputdata.date = pd.to_datetime(inputdata.date)
    inputdata['day'] = inputdata.date.dt.day_name()
    inputdata = inputdata.drop(columns='date')
    return inputdata

In [8]:
work_data = convert_date_to_day(work_data)

In [9]:
work_data.head()

Unnamed: 0,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type,day
0,3805,1.0,105,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.0,departed,Northeast Corrdr,NJ Transit,Thursday
1,3805,2.0,105,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit,Thursday
2,3805,3.0,38187,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit,Thursday
3,3805,4.0,107,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday
4,3805,5.0,37953,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday


In [10]:
work_data['scheduled_time'] = pd.to_datetime(work_data['scheduled_time'])

In [11]:
work_data['time'] = work_data['scheduled_time'].dt.time

In [12]:
work_data.head()

Unnamed: 0,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type,day,time
0,3805,1.0,105,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.0,departed,Northeast Corrdr,NJ Transit,Thursday,01:22:00
1,3805,2.0,105,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit,Thursday,01:31:00
2,3805,3.0,38187,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit,Thursday,01:40:00
3,3805,4.0,107,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday,01:45:00
4,3805,5.0,37953,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday,01:49:00


In [13]:
work_data['part_of_the_day'] = '0'

In [14]:
def part(hour) -> str:
    if (hour >= 4) and (hour < 8):
        return 'early_morning'
    elif (hour >= 8) and (hour < 12 ):
        return 'morning'
    elif (hour >= 12) and (hour < 16):
        return'afternoon'
    elif (hour >= 16) and (hour < 20) :
        return 'evening'
    elif (hour >= 20) and (hour < 24):
        return 'night'
    elif (hour < 4):
        return'late_night'

In [15]:
def determine_time(time) -> str:
    hour = time.hour
    name = part(hour)
    return name

In [16]:
work_data['part_of_the_day'] = work_data['time'].apply(determine_time)

In [17]:
work_data= work_data.drop(columns='time')

In [18]:
work_data.head(100)

Unnamed: 0,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type,day,part_of_the_day
0,3805,1.0,105,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.000000,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
1,3805,2.0,105,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
2,3805,3.0,38187,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
3,3805,4.0,107,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
4,3805,5.0,37953,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night
...,...,...,...,...,...,...,...,...,...,...,...,...
113,3834,9.0,127,37953,2018-03-01 10:50:00,2018-03-01 10:55:09,5.150000,departed,Northeast Corrdr,NJ Transit,Thursday,morning
114,3834,10.0,37953,107,2018-03-01 10:56:00,2018-03-01 11:01:04,5.066667,departed,Northeast Corrdr,NJ Transit,Thursday,morning
115,3834,11.0,107,38187,2018-03-01 11:03:00,2018-03-01 11:15:09,12.150000,departed,Northeast Corrdr,NJ Transit,Thursday,morning
116,3834,12.0,38187,105,2018-03-01 11:16:00,2018-03-01 11:32:00,16.000000,estimated,Northeast Corrdr,NJ Transit,Thursday,morning


In [19]:
def convert_delay(data: pd.DataFrame) -> pd.DataFrame:
    def create_delay(row):
        if row >= 5:
            val = 1
        else:
            val = 0
        return val

    data['delay'] = data.delay_minutes.apply(create_delay)
    return data

In [20]:
work_data = convert_delay(work_data)

In [21]:
work_data.head(100)

Unnamed: 0,train_id,stop_sequence,from_id,to_id,scheduled_time,actual_time,delay_minutes,status,line,type,day,part_of_the_day,delay
0,3805,1.0,105,105,2018-03-02 01:22:00,2018-03-02 01:21:05,0.000000,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
1,3805,2.0,105,38187,2018-03-02 01:31:00,2018-03-02 01:31:08,0.133333,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
2,3805,3.0,38187,107,2018-03-02 01:40:00,2018-03-02 01:40:07,0.116667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
3,3805,4.0,107,37953,2018-03-02 01:45:00,2018-03-02 01:45:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
4,3805,5.0,37953,109,2018-03-02 01:49:00,2018-03-02 01:49:10,0.166667,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,3834,9.0,127,37953,2018-03-01 10:50:00,2018-03-01 10:55:09,5.150000,departed,Northeast Corrdr,NJ Transit,Thursday,morning,1
114,3834,10.0,37953,107,2018-03-01 10:56:00,2018-03-01 11:01:04,5.066667,departed,Northeast Corrdr,NJ Transit,Thursday,morning,1
115,3834,11.0,107,38187,2018-03-01 11:03:00,2018-03-01 11:15:09,12.150000,departed,Northeast Corrdr,NJ Transit,Thursday,morning,1
116,3834,12.0,38187,105,2018-03-01 11:16:00,2018-03-01 11:32:00,16.000000,estimated,Northeast Corrdr,NJ Transit,Thursday,morning,1


In [22]:
def drop_unnecessary_columns(inputdata: pd.DataFrame) -> pd.DataFrame:
    columns_to_drop: List = ['train_id', 'scheduled_time', 'actual_time', 'delay_minutes']
    inputdata = inputdata.drop(columns=columns_to_drop)
    return inputdata

In [23]:
work_data = drop_unnecessary_columns(work_data)

In [24]:
work_data.head(100)

Unnamed: 0,stop_sequence,from_id,to_id,status,line,type,day,part_of_the_day,delay
0,1.0,105,105,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
1,2.0,105,38187,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
2,3.0,38187,107,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
3,4.0,107,37953,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
4,5.0,37953,109,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,0
...,...,...,...,...,...,...,...,...,...
113,9.0,127,37953,departed,Northeast Corrdr,NJ Transit,Thursday,morning,1
114,10.0,37953,107,departed,Northeast Corrdr,NJ Transit,Thursday,morning,1
115,11.0,107,38187,departed,Northeast Corrdr,NJ Transit,Thursday,morning,1
116,12.0,38187,105,estimated,Northeast Corrdr,NJ Transit,Thursday,morning,1


In [25]:
def save_first_60k(inputdata: pd.DataFrame, path_to_save: str):
    inputdata.iloc[:60000].to_csv(path_to_save, index=False, sep=',')

In [26]:
save_first_60k(work_data, 'data/NJ1.csv')

In [27]:
print(os.getcwd())

D:\Egyetem\BevAdat\FQQQOC_BEVADAT2022232\HAZI\HAZI06


In [28]:
cleaner = NJCleaner('data/2018_03.csv')

In [29]:
cleaner.prep_df('data/NJ.csv')

In [30]:
data_to_fit = pd.read_csv('data/NJ_60k.csv', sep=',', header=0)

In [31]:
data_to_fit.head()

Unnamed: 0,stop_sequence,from_id,to_id,status,line,type,day,part_of_the_day,delay
0,1.0,148,148,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,1
1,1.0,123,123,departed,Bergen Co. Line,NJ Transit,Thursday,late_night,0
2,2.0,148,32905,departed,Northeast Corrdr,NJ Transit,Thursday,late_night,1
3,1.0,74,74,departed,No Jersey Coast,NJ Transit,Thursday,late_night,0
4,3.0,32905,125,departed,Northeast Corrdr,NJ Transit,Thursday,early_morning,0


In [32]:
X: pd.DataFrame = data_to_fit.iloc[:, :-1].values
Y: pd.DataFrame  = data_to_fit.iloc[:, -1].values.reshape(-1,1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)

In [41]:
classifier = DecisionTreeClassifier(min_samples_split=20, max_depth=8)
classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)
print(accuracy_score(Y_test, Y_pred))

0.7955


In [42]:
print(accuracy_score(Y_test, Y_pred))

In [45]:
results = []
for i in range(20,100,5):
    for j in range(7, 17, 2):
        try:
            classifier = DecisionTreeClassifier(min_samples_split=i, max_depth=j)
            classifier.fit(X_train, Y_train)

            Y_pred = classifier.predict(X_test)
            acc = accuracy_score(Y_test, Y_pred)
            results.append([acc, i, j])
            print(f'{acc:.2%} accuracy {i} min sample és {j} max depth esetén')
        except Exception as e:
            print(f'{e} hiba {i} min sample és {j} max depth mellett')
            break

79.37% accuracy 20 min sample és 7 max depth esetén
'info_gain' hiba 20 min sample és 9 max depth mellett
79.37% accuracy 25 min sample és 7 max depth esetén
79.80% accuracy 25 min sample és 9 max depth esetén
80.14% accuracy 25 min sample és 11 max depth esetén
79.67% accuracy 25 min sample és 13 max depth esetén
79.12% accuracy 25 min sample és 15 max depth esetén
79.37% accuracy 30 min sample és 7 max depth esetén
79.80% accuracy 30 min sample és 9 max depth esetén
80.18% accuracy 30 min sample és 11 max depth esetén
79.80% accuracy 30 min sample és 13 max depth esetén
79.29% accuracy 30 min sample és 15 max depth esetén
79.36% accuracy 35 min sample és 7 max depth esetén
79.80% accuracy 35 min sample és 9 max depth esetén
80.22% accuracy 35 min sample és 11 max depth esetén
79.82% accuracy 35 min sample és 13 max depth esetén
79.30% accuracy 35 min sample és 15 max depth esetén
79.35% accuracy 40 min sample és 7 max depth esetén
79.80% accuracy 40 min sample és 9 max depth esetén
8

In [47]:
sorted(results, reverse=True)

[[0.8035833333333333, 95, 11],
 [0.8034166666666667, 60, 11],
 [0.8033333333333333, 75, 11],
 [0.8033333333333333, 70, 11],
 [0.8033333333333333, 65, 11],
 [0.80325, 55, 11],
 [0.8030833333333334, 80, 11],
 [0.803, 90, 11],
 [0.8028333333333333, 45, 11],
 [0.8026666666666666, 85, 11],
 [0.8026666666666666, 50, 11],
 [0.8021666666666667, 40, 11],
 [0.8021666666666667, 35, 11],
 [0.8018333333333333, 30, 11],
 [0.8014166666666667, 25, 11],
 [0.80075, 95, 13],
 [0.7999166666666667, 80, 13],
 [0.79975, 90, 13],
 [0.7995833333333333, 75, 13],
 [0.7994166666666667, 85, 13],
 [0.7994166666666667, 65, 13],
 [0.7993333333333333, 70, 13],
 [0.799, 60, 13],
 [0.7988333333333333, 55, 13],
 [0.7986666666666666, 95, 9],
 [0.7985, 50, 13],
 [0.7983333333333333, 80, 9],
 [0.79825, 90, 9],
 [0.79825, 60, 9],
 [0.79825, 55, 9],
 [0.7981666666666667, 95, 15],
 [0.7981666666666667, 85, 9],
 [0.7981666666666667, 50, 9],
 [0.7981666666666667, 35, 13],
 [0.7980833333333334, 45, 13],
 [0.7980833333333334, 45, 

# Minimum sample split nem indulhat 1-től?
# Info gain hiba 8-as depth mellett, ha a min_split 20 alatti
# 23-es min split mellett a 9-es depth is lefut