In [None]:
import pandas as pd
import numpy as np


In [119]:
data = pd.read_csv('../Data/All_Journeys.csv')


In [120]:
#Copenhagen filtering
condition_1_cph = (
    (data['internalValidZones'].str.match(r'^(1001|1002|1003|1004)(,(1001|1002|1003|1004))*$')
    | # or
    pd.isna(data['internalValidZones']))
    )

condition_2_cph = (
    (data['internalStartZones'].str.match(r'^(1001|1002|1003|1004)$'))
    | # or
    pd.isna(data['internalStartZones'])
    )

data = data[(condition_1_cph)]
data = data[(condition_2_cph)]

data = data[ ~ (data['SearchStart'].str.contains("okation", na=False)
                                             | #Or
                                             data['SearchStart'].str.contains("zoner", na=False))]
data = data[( ~ (data['SearchEnd'].str.contains("zoner", na=False) 
                                            | #Or
                                            data['SearchEnd'].str.contains("okation", na=False)))]

# next two filters are English filters of the first
data = data[( ~ (data['SearchEnd'].str.contains("zones", na=False) 
                                            | #Or
                                            data['SearchEnd'].str.contains("ocation", na=False)))]

data = data[( ~ (data['SearchStart'].str.contains("zones", na=False) 
                                            | #Or
                                            data['SearchStart'].str.contains("ocation", na=False)))]

# Next filter is to remove entries where one of the matching search-x or x-stop are Null
data = data[(
                                        ( ~ (pd.isna(data['SearchStart'])) & ~ (pd.isna(data['SearchEnd'])))
                                        | # Or
                                        ( ~ (pd.isna(data['StartStop'])) & ~ (pd.isna(data['EndStop'])))
                                        )]

# Next filter removes all entries where SearchStart and SearchEnd contain the same value
data = data[(
                        ~(data['SearchStart'] == data['SearchEnd'])
                        )]

  data = data[(condition_2_cph)]


### Getting information of data

We wish to create a table where we see the count of different potential relevant faults in our data such as the amount of duplicates, missing values and unique entries.

It is especially the unique_value for SearchStart and SearchEnd that we are interested in, due to the field being composed of user-inputs. We wish to learn whether or not a problem will occur if two stations appear with different names in our data. And if this occurs, then how often and is it a manageable amount. 

From the table we see that SearchStart contain 25,404 unique values and SearchStart interestingly have 40,017 unique values. This could be due to when a user searches for a journey, usually they search for where they are going and not as much where they should start their journey. This leads to the question of whether or not the inconsistency in station-names is only relevent for SearchEnd and perphaps not as much for SearchStart. 

We also see that the last 3 columns of the data all appear to be empty. Therefore we will remove these columns in an attempt to save space. (Went from 476.7+ MB -> 370.8+ MB)

In [121]:
missing_values = data.isnull().sum()
nan_values = data.isna().sum()
duplicates = data.duplicated().sum()
unique_values = data.nunique()

# Create a summary DataFrame
columns = pd.DataFrame({
    "missing_values": missing_values,
    "nan_values": nan_values,
    "duplicates": duplicates,
    "unique_values": unique_values,
})

columns

Unnamed: 0,missing_values,nan_values,duplicates,unique_values
Id,0,0,0,3471327
Type,20,20,0,2
internalStartZones,0,0,0,4
StartZone,0,0,0,5
internalValidZones,0,0,0,14
StartStop,3471051,3471051,0,30
AmountOfZones,0,0,0,4
EndZone,0,0,0,53
EndStop,3471051,3471051,0,38
SearchStart,64,64,0,25404


In [122]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3471327 entries, 1 to 43345940
Data columns (total 17 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Type                object 
 2   internalStartZones  object 
 3   StartZone           int64  
 4   internalValidZones  object 
 5   StartStop           float64
 6   AmountOfZones       int64  
 7   EndZone             int64  
 8   EndStop             float64
 9   SearchStart         object 
 10  SearchEnd           object 
 11  ModifiedOn          object 
 12  CreatedOn           object 
 13  JourneyClasses_Id   float64
 14  TravelType          float64
 15  ExtraFrom           float64
 16  ExtraTo             float64
dtypes: float64(6), int64(3), object(8)
memory usage: 476.7+ MB


In [124]:
data = data.drop(columns=['JourneyClasses_Id', 'TravelType', 'ExtraFrom', 'ExtraTo', 'Type', 'internalStartZones', 'StartZone', 'internalValidZones']).reset_index()

In [125]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3471327 entries, 0 to 3471326
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   index          int64  
 1   Id             object 
 2   StartStop      float64
 3   AmountOfZones  int64  
 4   EndZone        int64  
 5   EndStop        float64
 6   SearchStart    object 
 7   SearchEnd      object 
 8   ModifiedOn     object 
 9   CreatedOn      object 
dtypes: float64(2), int64(3), object(5)
memory usage: 264.8+ MB


In [126]:
data[['SearchStart', 'SearchEnd']]

Unnamed: 0,SearchStart,SearchEnd
0,"Hovedbanegården, Tivoli (Bernstorffsgade) (01)","Borrebyvej 29, 2700 Brønshøj, Københavns Kommune"
1,København H (togbus) (01),Hulgårds Plads (Frederikssundsvej) (02)
2,København H (togbus) (01),Islands Brygge St. (Metro) (01)
3,København H (Metro) (01),Frederiksberg Allé St. (Metro) (01)
4,Nørreport St. (01),Sluseholmen (Sjællandsbroen) (02)
...,...,...
3471322,Amagerbro St. (Metro) (01),Kongens Nytorv St. (Metro) (01)
3471323,Islands Brygge St. (Ørestads Boulevard),Forum St. (Metro)
3471324,Rådhuspladsen St. (Vesterbrogade) (01),Skt. Annæ Gade (Prinsessegade) (01)
3471325,"Hovedbanegården, Frihedsstøtten (Vesterbrogade...","Roskildevej 96, 2000 Frederiksberg, Frederiksb..."


In [130]:
station_counts = data[['SearchStart', 'SearchEnd']]

Calling value_counts() on the new smaller subset of our data, we can conclude that our data consists of 323,835 unqiue combinations of a Start and End stop. From these ~300,000 the journey between CPH Airport and Copenhagen H is the most common journey. 

In [131]:
station_counts.value_counts()
station_counts = station_counts[~(station_counts['SearchStart'].isna() & station_counts['SearchEnd'].isna())]

the following code creates a list of pairs with the first element being start and the second element being end.

In [132]:
seq = []

def to_list(row):
    seq.append([row['SearchStart'], row['SearchEnd']])

station_counts.apply(to_list, axis=1)
seq


[['Hovedbanegården, Tivoli (Bernstorffsgade) (01)',
  'Borrebyvej 29, 2700 Brønshøj, Københavns Kommune'],
 ['København H (togbus) (01)', 'Hulgårds Plads (Frederikssundsvej) (02)'],
 ['København H (togbus) (01)', 'Islands Brygge St. (Metro) (01)'],
 ['København H (Metro) (01)', 'Frederiksberg Allé St. (Metro) (01)'],
 ['Nørreport St. (01)', 'Sluseholmen (Sjællandsbroen) (02)'],
 ['Lufthavnen St. (Metro) (04)', 'Aksel Møllers Have St. (Metro) (02)'],
 ['CPH Lufthavn (04)', 'Istedgade 6, 1650 København V, Københavns Kommune'],
 ['Ryumgårdsvej (Kongelundsvej) (04)', 'Dybbølsbro St. (01)'],
 ['Teglgårdstræde (Nørre Voldgade) (01)', 'Kapelvej (Nørrebrogade) (01)'],
 ['Nørreport St. (Frederiksborggade) (01)', 'Forum St. (Metro) (01)'],
 ['Drechselsgade (Artillerivej) (01)',
  'Hovedbanegården (Reventlowsgade) (01)'],
 ['Nyhavn (Københavns Havn) (01)', 'Refshaleøen (Refshalevej) (01)'],
 ['Nørre Campus (Tagensvej) (01)',
  'Dronningens Tværgade 37, 1302 København K, Københavns'],
 ['Elmegade 

The following code creates a dictionary out of all the entries in the list. Note that this means that we don't differentiate between start and end stations in the dict, we only care about the total number of entries for each station.

In [133]:
st_count = {}
for (start, end) in seq:
    #First check is not relevant anymore since the nan entries are removed.
    if start == "nan" or start == 'nan':
        print(start)
        print(end)
    if start in st_count:
        st_count.update({start : st_count[start]+1})
    else:
        st_count[start] = 1
    
    if end in st_count:
        st_count.update({end : st_count[end]+1})
    else:
        st_count[end] = 1


In [134]:
list(st_count.keys())

['Hovedbanegården, Tivoli (Bernstorffsgade) (01)',
 'Borrebyvej 29, 2700 Brønshøj, Københavns Kommune',
 'København H (togbus) (01)',
 'Hulgårds Plads (Frederikssundsvej) (02)',
 'Islands Brygge St. (Metro) (01)',
 'København H (Metro) (01)',
 'Frederiksberg Allé St. (Metro) (01)',
 'Nørreport St. (01)',
 'Sluseholmen (Sjællandsbroen) (02)',
 'Lufthavnen St. (Metro) (04)',
 'Aksel Møllers Have St. (Metro) (02)',
 'CPH Lufthavn (04)',
 'Istedgade 6, 1650 København V, Københavns Kommune',
 'Ryumgårdsvej (Kongelundsvej) (04)',
 'Dybbølsbro St. (01)',
 'Teglgårdstræde (Nørre Voldgade) (01)',
 'Kapelvej (Nørrebrogade) (01)',
 'Nørreport St. (Frederiksborggade) (01)',
 'Forum St. (Metro) (01)',
 'Drechselsgade (Artillerivej) (01)',
 'Hovedbanegården (Reventlowsgade) (01)',
 'Nyhavn (Københavns Havn) (01)',
 'Refshaleøen (Refshalevej) (01)',
 'Nørre Campus (Tagensvej) (01)',
 'Dronningens Tværgade 37, 1302 København K, Københavns',
 'Elmegade (Nørrebrogade) (01)',
 'København H (01)',
 'Værne

The following code highlights different variations of stations with 'København H' in it. These are both SearchStart and SearchEnd

In [154]:
for s in list(st_count.keys()):
    if type(s) is not str:
        print("------")
        print(st_count[s])
        print( s)
        print("------")
    else :
        if s.__contains__(r" ("):
            print(s)

Hovedbanegården, Tivoli (Bernstorffsgade) (01)
København H (togbus) (01)
Hulgårds Plads (Frederikssundsvej) (02)
Islands Brygge St. (Metro) (01)
København H (Metro) (01)
Frederiksberg Allé St. (Metro) (01)
Nørreport St. (01)
Sluseholmen (Sjællandsbroen) (02)
Lufthavnen St. (Metro) (04)
Aksel Møllers Have St. (Metro) (02)
CPH Lufthavn (04)
Ryumgårdsvej (Kongelundsvej) (04)
Dybbølsbro St. (01)
Teglgårdstræde (Nørre Voldgade) (01)
Kapelvej (Nørrebrogade) (01)
Nørreport St. (Frederiksborggade) (01)
Forum St. (Metro) (01)
Drechselsgade (Artillerivej) (01)
Hovedbanegården (Reventlowsgade) (01)
Nyhavn (Københavns Havn) (01)
Refshaleøen (Refshalevej) (01)
Nørre Campus (Tagensvej) (01)
Elmegade (Nørrebrogade) (01)
København H (01)
Værnedamsvej (Frederiksberg Allé) (01)
Skellet (Roskildevej) (02)
Vestamager St. (Metro) (03)
Fisketorvet, Dybbølsbro (Kalvebod Brygge) (01)
Sjælør St. (02)
Dybbølsbro St. (togbus) (01)
Østerport St. (01)
Orientkaj St. (Sundkrogsgade) (01)
Sønderport (Amager Boulevard

### WIP for collecting same stations

Work in progress. Need to add key-value pairs to the 'original' station. A lot of stations have multiple points (bus stops) but need to be collected into one unit. Such that København H have two (maybe 3) entries in our dictionary (one for metro and one for trains (and maybe one for busses)). Likewise, Trianglen station also need a metro location but also need all of its different bus-stop addresses to be collected into "Trianglen". 

Currently we remove all '(int int)' entries and add them to the station entry in the dictionary. This is not a final version but might be a starting point.

In [160]:
import re
pat = '\(\d\d\)'

for s in list(st_count.keys()):
    if s.__contains__(" ("):
        output = (re.sub(pat, "", s)).strip()
        
        if output in st_count:
            st_count[output] = st_count[output] + st_count[s]
            del st_count[s]
        else: 
            st_count.update({output : st_count[s]})
        
        print("", output, s)
        print(st_count[output])


 Hovedbanegården, Tivoli (Bernstorffsgade)  Hovedbanegården, Tivoli (Bernstorffsgade) (01)
85819
 København H (togbus)  København H (togbus) (01)
40740
 Hulgårds Plads (Frederikssundsvej)  Hulgårds Plads (Frederikssundsvej) (02)
4432
 Islands Brygge St. (Metro)  Islands Brygge St. (Metro) (01)
57229
 København H (Metro)  København H (Metro) (01)
102779
 Frederiksberg Allé St. (Metro)  Frederiksberg Allé St. (Metro) (01)
34732
 Nørreport St.  Nørreport St. (01)
195343
 Sluseholmen (Sjællandsbroen)  Sluseholmen (Sjællandsbroen) (02)
644
 Lufthavnen St. (Metro)  Lufthavnen St. (Metro) (04)
52570
 Aksel Møllers Have St. (Metro)  Aksel Møllers Have St. (Metro) (02)
15270
 CPH Lufthavn  CPH Lufthavn (04)
193227
 Ryumgårdsvej (Kongelundsvej)  Ryumgårdsvej (Kongelundsvej) (04)
363
 Dybbølsbro St.  Dybbølsbro St. (01)
39816
 Teglgårdstræde (Nørre Voldgade)  Teglgårdstræde (Nørre Voldgade) (01)
6015
 Kapelvej (Nørrebrogade)  Kapelvej (Nørrebrogade) (01)
16615
 Nørreport St. (Frederiksborggade)  

KeyError: 'Københavns Lufthavn St. (Metro)'