## Routes (file: routes.txt) - Data Exploration

In [17]:
# NOTE(review): this cell is disabled. The Hive connection setup below is
# wrapped in a triple-quoted string, so none of it executes; the cell only
# evaluates a string literal.
# The "your username is ..." / "you are connected to ..." lines recorded under
# this cell presumably come from an earlier run made while the code was live.
"""
import os
from pyhive import hive

# Set python variables from environment variables
username = os.environ['USERNAME']
hive_host = os.environ['HIVE_SERVER2'].split(':')[0]
hive_port = os.environ['HIVE_SERVER2'].split(':')[1]

# create connection
conn = hive.connect(
    host=hive_host,
    port=hive_port
)

# create cursor
cur = conn.cursor()

print(f"your username is {username}")
print(f"you are connected to {hive_host}:{hive_port}")
"""

your username is boukil
you are connected to iccluster044.iccluster.epfl.ch:10000


### ATTRIBUTES KEPT + DESCRIPTION

- route_id: unique identifier of the route
- route_desc: type of the route (e.g. bus, TGV, S-Bahn), stored in lower case
- route_name: short name of the route (renamed from route_short_name)

In [113]:
import pandas as pd
# Notebook-wide pandas settings.
# Truncate displayed frames/series to at most 10 rows.
pd.set_option("display.max_rows", 10)
# Make the NA checks used throughout this notebook treat +/-inf as missing.
# NOTE(review): this option is deprecated in recent pandas releases — confirm
# it is still available in the installed version.
pd.set_option("mode.use_inf_as_na", True)

In [114]:
# Load the routes file into a DataFrame; the path is relative to the
# notebook's working directory.
filepath = "routes.txt"
df = pd.read_csv(filepath_or_buffer=filepath)

In [115]:
# ACTION TAKEN: let pandas pick the best nullable dtype for every column
# (object inference is the default, spelled out here for the reader).
df = df.convert_dtypes(infer_objects=True)

In [116]:
# Preview the first five rows of the routes table.
df.head(n=5)

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type
0,10-25-j20-1,11,25,,S-Bahn,400
1,10-27-j20-1,82,27,,S-Bahn,400
2,10-501-j20-1,801,501,,Bus,700
3,10-502-j20-1,801,502,,Bus,700
4,10-503-j20-1,801,503,,Bus,700


In [117]:
# Show the dtype chosen for each column by convert_dtypes().
# route_long_name is reported as Int64 even though it is a name column; the
# NA checks further down show that it contains only missing values.
df.dtypes

route_id            string
agency_id           string
route_short_name    string
route_long_name      Int64
route_desc          string
route_type           Int64
dtype: object

In [118]:
# Report how many routes the file contains (first axis of the frame).
print(f"Number of rows: {df.shape[0]}")

Number of rows: 5877


In [119]:
# Report, for every column, whether at least one value is missing.
# The check is computed for the whole frame in one pass instead of looking
# columns up with getattr(df, col): attribute-style access resolves to the
# DataFrame's own methods/properties whenever a column name collides with one
# (e.g. "size" or "count"), which would break or silently mis-report.
print("Contains NA (or equivalent) values?")
for col, has_na in df.isna().any().items():
    print(f"\t{col}: {has_na}")

Contains NA (or equivalent) values?
	route_id: False
	agency_id: False
	route_short_name: False
	route_long_name: True
	route_desc: False
	route_type: False


In [120]:
# Report, for every column, whether it contains nothing but missing values
# (such columns carry no information and are candidates for removal).
# Computed on the whole frame rather than via getattr(df, col), because
# attribute-style access returns a DataFrame method/property instead of the
# column when the names collide (e.g. "size" or "count").
print("Is only NA (or equivalent) values?")
for col, only_na in df.isna().all().items():
    print(f"\t{col}: {only_na}")

Is only NA (or equivalent) values?
	route_id: False
	agency_id: False
	route_short_name: False
	route_long_name: True
	route_desc: False
	route_type: False


In [121]:
# ACTION TAKEN: route_long_name holds no data, so it is removed; with a single
# name column left, route_short_name is renamed to plain route_name.
df = (
    df.rename(columns={"route_short_name": "route_name"})
      .drop(columns=["route_long_name"])
)

In [122]:
# Report, for every column, whether all of its values are distinct
# (i.e. whether the column could serve as a key).
# Uses bracket indexing instead of getattr(df, col), which would return a
# DataFrame method/property for colliding column names, and the built-in
# Series.is_unique instead of comparing lengths by hand.
print("Unique values ?")
for col in df.columns:
    print(f"\t{col}: {df[col].is_unique}")

Unique values ?
	route_id: True
	agency_id: False
	route_name: False
	route_desc: False
	route_type: False


In [123]:
# List the ten most frequent values of every non-ID column.
# - The ID column is excluded by name rather than by position, so the cell
#   does not depend on route_id being the first column.
# - Columns are read with df[col]; getattr(df, col) would return a DataFrame
#   method/property for a colliding column name.
# - value_counts() already returns counts in descending order, so no extra
#   sort is needed before taking the first ten entries.
print("Top 10 most recurrent values for each column? (except the ID column since the values are unique)")
for col in df.columns.drop("route_id"):
    top_values = df[col].value_counts().head(10).index.tolist()
    print(f"\t{col}: {top_values}")

Top 10 most recurrent values for each column? (except the ID column since the values are unique)
	agency_id: ['801', '11', '87_LEX', '7000', '72', '06____', '693', 'sbg034', '834', '881']
	route_name: ['1', '3', '2', '5', '6', '4', '8', '7', '9', '12']
	route_desc: ['Bus', 'S-Bahn', 'Luftseilbahn', 'RegioExpress', 'Regionalzug', 'TGV', 'Schiff', 'InterRegio', 'Drahtseilbahn', 'Sesselbahn']
	route_type: [700, 1300, 106, 400, 102, 1000, 103, 100, 1400, 1501]


In [124]:
# ACTION TAKEN: keep only the attributes listed as kept at the top of the
# notebook (route_id, route_name, route_desc) and normalise the route
# description to lower case.
df = df.drop(columns=["agency_id", "route_type"])
# The vectorised .str accessor passes missing values through unchanged,
# whereas calling x.lower() on each element would raise on them.
df["route_desc"] = df["route_desc"].str.lower()

In [125]:
# Show every distinct (lower-cased) route description, in order of first
# appearance in the table.
print(f"Unique route descriptions:\n\t{df['route_desc'].drop_duplicates().tolist()}")

Unique route descriptions:
	['s-bahn', 'bus', 'intercity', 'taxi', 'regioexpress', 'interregio', 'schnelles nachtnetz', 'standseilbahn', 'expressbus', 'extrazug', 'regionalzug', 'metro', 'schiff', 'tram', 'luftseilbahn', 'panoramabus', 'eurocity', 'en', 'ice', 'tgv', 'drahtseilbahn', 'sesselbahn', 'ter200', 'auoreisezug', 'panoramaexpress', 'aufzug', 'fähre', 'zahnradbahn']
