In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the All_Journeys CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('../Data/All_Journeys.csv')


In [None]:
# Copenhagen filtering
#
# Keep a row only if its valid-zone list consists solely of zones 1001-1004
# (comma separated) or is missing, AND its start zone is one of 1001-1004 or
# is missing. `na=False` makes `str.match` return a plain boolean mask, so
# missing values are handled only by the explicit `isna()` alternative.
condition_1_cph = (
    data['internalValidZones'].str.match(
        r'^(1001|1002|1003|1004)(,(1001|1002|1003|1004))*$', na=False
    )
    | data['internalValidZones'].isna()
)

condition_2_cph = (
    data['internalStartZones'].str.match(r'^(1001|1002|1003|1004)$', na=False)
    | data['internalStartZones'].isna()
)

# Both masks are built on the unfiltered frame, so they are combined and
# applied in one step. Applying them one after another would index the
# already-shrunk frame with a mask of the original length, which makes pandas
# reindex the mask and emit a warning.
data = data[condition_1_cph & condition_2_cph]

# Drop rows whose search text contains one of the Danish fragments
# ("okation", "zoner") or their English counterparts ("ocation", "zones").
# Missing search text never matches (`na=False`), so those rows are kept here.
data = data[~data['SearchStart'].str.contains('okation|zoner|zones|ocation', na=False)]
data = data[~data['SearchEnd'].str.contains('okation|zoner|zones|ocation', na=False)]

# Keep a row only if it has a complete pair: either both SearchStart and
# SearchEnd are present, or both StartStop and EndStop are present.
data = data[
    (data['SearchStart'].notna() & data['SearchEnd'].notna())
    | (data['StartStop'].notna() & data['EndStop'].notna())
]

# Drop rows where SearchStart and SearchEnd hold the same value. Rows with a
# missing value on either side compare as "different" and are therefore kept.
data = data[data['SearchStart'] != data['SearchEnd']]

### Getting information of data

We wish to create a table that shows the counts of different potentially relevant faults in our data, such as the number of duplicates, missing values and unique entries.

We are especially interested in the unique_values for SearchStart and SearchEnd, because these fields are composed of user input. We wish to learn whether a problem will occur if the same station appears under different names in our data and, if it does, how often it happens and whether the number of cases is manageable.

From the table we see that SearchStart contains 25,404 unique values and SearchEnd, interestingly, has 40,017 unique values. This could be because a user searching for a journey usually specifies where they are going rather than where they should start their journey. This leads to the question of whether the inconsistency in station names is only relevant for SearchEnd and perhaps not as much for SearchStart.

We also see that the last 3 columns of the data all appear to be empty. Therefore we will remove these columns in an attempt to save space. (Went from 476.7+ MB -> 370.8+ MB)

In [None]:
# Per-column data-quality counts for `data`.
# NOTE: `isnull` is an alias of `isna`, so "missing_values" and "nan_values"
# always hold the same numbers; both are kept so the table layout is unchanged.
missing_values = data.isnull().sum(axis=0)
nan_values = data.isna().sum(axis=0)
# Number of fully duplicated rows: a single scalar for the whole frame, which
# is repeated on every line of the summary table below.
duplicates = data.duplicated().sum()
unique_values = data.nunique()

# Assemble the summary column by column, one row per column of `data`.
columns = missing_values.to_frame("missing_values")
columns["nan_values"] = nan_values
columns["duplicates"] = duplicates
columns["unique_values"] = unique_values

columns

In [None]:
# Print the column dtypes, non-null counts and memory usage of `data`
# before the unused columns are dropped.
data.info()

In [None]:
# Remove four columns from `data`, then renumber the rows from 0.
# `reset_index()` without `drop=True` keeps the previous row labels as a new
# "index" column.
# NOTE(review): if the old labels are not needed later, `drop=True` would
# avoid the extra column - confirm before changing.
data = data.drop(['JourneyClasses_Id', 'TravelType', 'ExtraFrom', 'ExtraTo'], axis=1)
data = data.reset_index()

In [None]:
# Print the summary of `data` again to compare memory usage after the drop.
data.info()

In [None]:
# Exploratory display: aggregate the two search columns with `list`.
# The result is only shown by the notebook, not stored.
data[['SearchStart', 'SearchEnd']].agg(list)

In [None]:
# Work on a two-column subset holding only the user-entered start and end.
station_counts = data.loc[:, ['SearchStart', 'SearchEnd']]

Calling value_counts() on the new, smaller subset of our data, we can conclude that our data consists of 323,835 unique combinations of a Start and End stop. Of these ~300,000 combinations, the journey between CPH Airport and Copenhagen H is the most common.

In [None]:
# Drop rows where both search fields are missing. Rows with only one missing
# field are kept, so later cells can still encounter NaN values.
station_counts = station_counts[
    station_counts['SearchStart'].notna() | station_counts['SearchEnd'].notna()
]

# Frequency of every (SearchStart, SearchEnd) combination. This must be the
# cell's last expression: a notebook only displays the value of the final
# expression, so calling it before the assignment discarded the table.
# `value_counts` skips rows containing NaN, so the counts are the same whether
# it runs before or after the filter above.
station_counts.value_counts()

The following code creates a list of pairs, where the first element of each pair is the start and the second element is the end.

In [None]:
def to_list(row):
    """Append the ``[SearchStart, SearchEnd]`` pair of one row to ``seq``.

    Kept for backward compatibility with cells that may still call it; the
    list below is no longer built through this function.
    """
    seq.append([row['SearchStart'], row['SearchEnd']])


# Build every [start, end] pair in one vectorised step. A row-wise
# `apply(..., axis=1)` used only for its side effect creates one Series per
# row and throws the apply result away, which is slow on a large frame.
seq = station_counts[['SearchStart', 'SearchEnd']].values.tolist()
seq


The following code creates a dictionary out of all the entries in the list. Note that this means that we don't differentiate between start and end stations in the dict, we only care about the total number of entries for each station.

In [None]:
# Count how often each station name occurs in `seq`, regardless of whether it
# appears as a start or as an end.
st_count = {}
for start, end in seq:
    # Leftover sanity check for the literal string "nan"; it prints the pair
    # if one is ever found. It is not expected to fire any more since rows
    # with both fields missing were removed earlier.
    if start == "nan":
        print(start)
        print(end)

    # `dict.get` with a default replaces the membership test plus
    # `update({...})`, which built a throwaway dict for every increment.
    st_count[start] = st_count.get(start, 0) + 1
    st_count[end] = st_count.get(end, 0) + 1


In [None]:
# Show every distinct station name (iterating a dict yields its keys).
list(st_count)

The following code highlights the different variations of station names that contain 'København H'. The names come from both SearchStart and SearchEnd.

In [None]:
# Print every station name containing "København H". Keys that are not
# strings (for example NaN from a missing search field) are reported together
# with their count instead of being searched.
for s in st_count:
    if not isinstance(s, str):
        print("------")
        print(st_count[s])
        print( s)
        print("------")
    elif "København H" in s:
        print(s)