In [1]:
# Inject notebook-wide CSS (fonts, heading colours, submit-button styling).
# The HTML object is the cell's last expression, so Jupyter renders it as the
# cell output, which applies the <style> block to the page.
# NOTE(review): `font-style:regular` in the h4:before rule is not a valid CSS
# value (valid ones are normal/italic/oblique) - confirm whether `normal` was meant.
from IPython.display import HTML
HTML('''
    <style> body {font-family: "Roboto Condensed Light", "Roboto Condensed";} h2 {padding: 10px 12px; background-color: #E64626; position: static; color: #ffffff; font-size: 40px;} .text_cell_render p { font-size: 15px; } .text_cell_render h1 { font-size: 30px; } h1 {padding: 10px 12px; background-color: #E64626; color: #ffffff; font-size: 40px;} .text_cell_render h3 { padding: 10px 12px; background-color: #0148A4; position: static; color: #ffffff; font-size: 20px;} h4:before{ 
    content: "@"; font-family:"Wingdings"; font-style:regular; margin-right: 4px;} .text_cell_render h4 {padding: 8px; font-family: "Roboto Condensed Light"; position: static; font-style: italic; background-color: #FFB800; color: #ffffff; font-size: 18px; text-align: center; border-radius: 5px;}input[type=submit] {background-color: #E64626; border: solid; border-color: #734036; color: white; padding: 8px 16px; text-decoration: none; margin: 4px 2px; cursor: pointer; border-radius: 20px;}</style>
''')

# 👥 Group Information

<p><strong>Code Language:</strong> <span style="font-size:18px;">Python</span></p>

<table style="font-size:18px;">
  <tr>
    <th>Name</th>
    <th>SID</th>
  </tr>
  <tr>
    <td>Ngoc Minh Dao</td>
    <td>520577590</td>
  </tr>
  <tr>
    <td>Manh Duc Nguyen</td>
    <td>520561337</td>
  </tr>
</table>

# Sydney Public Transport Analysis

### Import necessary libraries

In [2]:
# Imports for spatial data
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import time
import numpy as np
from shapely.geometry import Point, Polygon, MultiPolygon
from geoalchemy2 import Geometry, WKTElement

# Imports for the PostgreSQL connection (SQLAlchemy + psycopg2)
from sqlalchemy import create_engine, text
import psycopg2
import psycopg2.extras
import json
from sqlalchemy import text

### Connect to PostgreSQL

In [3]:
credentials = "Credentials.json"

def pgconnect(credential_filepath, db_schema="public"):
    """Open a SQLAlchemy engine and connection to a PostgreSQL database.

    The connection settings are read from a JSON file that must contain the
    keys ``host``, ``user``, ``password`` and ``port``. The database name is
    taken from the ``user`` entry (database named after the user).

    Parameters
    ----------
    credential_filepath : str
        Path to the JSON credentials file.
    db_schema : str, default "public"
        Kept for backward compatibility; it is not used by this function.

    Returns
    -------
    tuple
        ``(engine, connection)`` on success, ``(None, None)`` when the engine
        could not be created or the connection attempt failed (the error is
        printed rather than raised).

    Raises
    ------
    OSError, json.JSONDecodeError, KeyError
        If the credentials file is missing, malformed, or lacks a required key.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # Read the settings first so the file is closed before we talk to the server.
    with open(credential_filepath) as f:
        db_conn_dict = json.load(f)

    host       = db_conn_dict['host']
    db_user    = db_conn_dict['user']
    db_pw      = db_conn_dict['password']
    default_db = db_conn_dict['user']
    port       = db_conn_dict['port']

    # Percent-encode the credentials: characters such as '@', '/' or ':' in a
    # password would otherwise be parsed as URL delimiters and break the URL.
    url_user = quote(str(db_user), safe='')
    url_pw   = quote(str(db_pw), safe='')

    try:
        db = create_engine(f'postgresql+psycopg2://{url_user}:{url_pw}@{host}:{port}/{default_db}', echo=False)
        conn = db.connect()
        print('Connected successfully.')
    except Exception as e:
        print("Unable to connect to the database.")
        print(e)
        db, conn = None, None
    return db, conn

def query(conn, sqlcmd, args=None, df=True):
    """Run a SQL statement and return its result.

    Parameters
    ----------
    conn : connection object passed to pandas / used for ``execute``.
    sqlcmd : str
        SQL text to run.
    args : optional
        Bind parameters forwarded to the query.
    df : bool, default True
        When True the result is read into a pandas DataFrame; otherwise the
        rows are fetched directly.

    Returns
    -------
    With ``df=True``: a DataFrame (empty if the query failed).
    With ``df=False``: the single row when exactly one row came back, the full
    list of rows otherwise, or ``None`` if the query failed.

    Errors are printed, not raised.
    """
    # Fallback value returned unchanged when the statement fails.
    outcome = pd.DataFrame() if df else None
    try:
        if not df:
            outcome = conn.execute(text(sqlcmd), args).fetchall()
            # Unwrap a one-row result so callers get the row itself.
            if len(outcome) == 1:
                outcome = outcome[0]
        else:
            outcome = pd.read_sql_query(sqlcmd, conn, params=args)
    except Exception as err:
        print("Error encountered: ", err, sep='\n')
    return outcome

In [4]:
# Open the engine/connection pair; `conn` is reused by later cells.
# pgconnect returns (None, None) on failure, so a failed connection only
# surfaces later as an AttributeError on `conn`.
db, conn = pgconnect(credentials)

Connected successfully.


### SRID Setup

In [5]:
# SRID passed to create_wkt_element when tagging geometries
# (EPSG:4326 is WGS 84 longitude/latitude).
srid = 4326

In [6]:
def create_wkt_element(geom, srid):
    """Wrap a geometry's WKT text in a WKTElement carrying the given SRID.

    A geometry whose ``geom_type`` is ``'Polygon'`` is first wrapped in a
    single-member MultiPolygon; every other geometry type is passed through
    unchanged. No coordinate transformation is performed here.
    """
    shape = MultiPolygon([geom]) if geom.geom_type == 'Polygon' else geom
    return WKTElement(shape.wkt, srid)

## Task 1: Import and Clean datasets

### 1.1 Load + clean Sydney Trains data

#### 1.1.1 Load data

In [7]:
# Load the Sydney Trains route shapefile into a GeoDataFrame.
trains = gpd.read_file("SydneyTrainRoutes/sydneytrains/SydneyTrains.shp")
# Print the column names so later cells can refer to them exactly as stored
# (note the truncated names such as 'route_shor' and 'st_length(').
print(trains.columns)
# trains.head(5)

Index(['objectid', 'shape_id', 'route_id', 'agency_id', 'route_shor',
       'route_long', 'route_desc', 'route_type', 'route_colo', 'route_text',
       'exact_time', 'route_ty00', 'st_length(', 'geometry'],
      dtype='object')


In [8]:
# Keep only the routes whose agency_id starts with "Sydney".
# na=False makes a row with a missing agency_id count as a non-match; without
# it the mask would contain NaN and the boolean indexing would raise.
sydney_trains = trains[trains["agency_id"].str.startswith("Sydney", na=False)]
# sydney_trains.head(5)

In [9]:
# Show each column's dtype (rendered as the cell output because it is the
# cell's last expression).
sydney_trains.dtypes

objectid         int64
shape_id        object
route_id        object
agency_id       object
route_shor      object
route_long      object
route_desc      object
route_type      object
route_colo      object
route_text      object
exact_time      object
route_ty00      object
st_length(     float64
geometry      geometry
dtype: object

#### 1.1.2 SRID Transformation

In [10]:
# Convert the shapely geometries into SRID-tagged WKTElement values.
# NOTE(review): create_wkt_element only labels the WKT with `srid`; it does not
# reproject coordinates. Confirm the shapefile's CRS already matches `srid`.
# This cell is not re-runnable on its own: the 'geometry' column is dropped at
# the end, so a second run fails on the lookup below.
sydney_trains = sydney_trains.copy()  # detach from `trains` so the column assignment below does not touch a slice
sydney_trains['geom'] = sydney_trains['geometry'].apply(lambda x: create_wkt_element(geom=x,srid=srid))  # one WKTElement per row
sydney_trains = sydney_trains.drop(columns="geometry")  # the shapely column is replaced by 'geom'
# sydney_trains.head(5)

In [11]:
# Count missing values in every column of the filtered trains table.
null_val = sydney_trains.isnull().sum()
print(f"Missing values per column:\n{null_val}")

Missing values per column:
objectid       0
shape_id       0
route_id       0
agency_id      0
route_shor     0
route_long     0
route_desc     0
route_type     0
route_colo     0
route_text     0
exact_time    54
route_ty00     0
st_length(     0
geom           0
dtype: int64


In [12]:
# "exact_time" is the only column with gaps; replace the missing entries with
# the placeholder string "Unknown" so the column has no nulls.
sydney_trains['exact_time'] = sydney_trains['exact_time'].fillna('Unknown')
# sydney_trains.head(5)

#### 1.1.3 Create table for query

In [13]:
# (Re)create the target table; DROP ... IF EXISTS makes the cell re-runnable.
# NOTE(review): the DataFrame column is named 'st_length(' while the table
# column is 'st_length' - the load step must rename it before inserting.
# NOTE(review): the geometry column only accepts LINESTRING; confirm the
# shapefile contains no MultiLineString rows.
# NOTE(review): depending on the SQLAlchemy version this DDL may stay in an
# open transaction until it is committed - TODO confirm.
conn.execute(text("""
DROP TABLE IF EXISTS sydney_trains;
CREATE TABLE sydney_trains (
    objectid     INTEGER PRIMARY KEY,
    shape_id     VARCHAR(255),
    route_id     VARCHAR(255),
    agency_id    VARCHAR(255),
    route_shor   VARCHAR(255),
    route_long   VARCHAR(255),
    route_desc   VARCHAR(255),
    route_type   VARCHAR(255),
    route_colo   VARCHAR(255),
    route_text   VARCHAR(255),
    exact_time   VARCHAR(255),
    route_ty00   VARCHAR(255),
    st_length    DOUBLE PRECISION,
    geom         GEOMETRY(LINESTRING, 4326)
);
"""))

<sqlalchemy.engine.cursor.CursorResult at 0x2953b373c40>

### 1.2 Load + clean Train Station Entrance Locations data

In [14]:
# Load the station entrance locations CSV and preview the first rows.
entrance_loc = pd.read_csv("TrainStationEntranceLocations/stationentrances2020_v4.csv")
entrance_loc.head(5)

Unnamed: 0,Train_Station,Street_Name,Street_Type,Entrance_Type,LAT,LONG,Exit_Number
0,Aberdeen,Macqueen,St,Ramp,-32.166886,150.891957,
1,Aberdeen,Macqueen,St,Stairs,-32.1669,150.891975,
2,Adamstown,Park,Ave,Path,-32.933706,151.720452,
3,Adamstown,Park,Ave,Path,-32.933827,151.720236,
4,Adamstown,St James,Rd,Stairs,-32.933414,151.720363,


In [15]:
# Show each column's dtype for the entrance locations table.
entrance_loc.dtypes

Train_Station     object
Street_Name       object
Street_Type       object
Entrance_Type     object
LAT              float64
LONG             float64
Exit_Number      float64
dtype: object

In [16]:
# Count missing values in every column of the entrance locations table.
null_val = entrance_loc.isnull().sum()
print(f"Missing values per column:\n{null_val}")

Missing values per column:
Train_Station       0
Street_Name         0
Street_Type        28
Entrance_Type       0
LAT                 0
LONG                0
Exit_Number      1024
dtype: int64


In [17]:
# Fill the two columns that contain gaps in a single step:
#   - Street_Type: missing values become the placeholder "Unknown"
#   - Exit_Number: missing is assumed to mean 0 exits, then cast to int
entrance_loc = entrance_loc.assign(
    Street_Type=entrance_loc['Street_Type'].fillna('Unknown'),
    Exit_Number=entrance_loc['Exit_Number'].fillna(0).astype(int),
)
entrance_loc.head(5)

Unnamed: 0,Train_Station,Street_Name,Street_Type,Entrance_Type,LAT,LONG,Exit_Number
0,Aberdeen,Macqueen,St,Ramp,-32.166886,150.891957,0
1,Aberdeen,Macqueen,St,Stairs,-32.1669,150.891975,0
2,Adamstown,Park,Ave,Path,-32.933706,151.720452,0
3,Adamstown,Park,Ave,Path,-32.933827,151.720236,0
4,Adamstown,St James,Rd,Stairs,-32.933414,151.720363,0


In [18]:
# Build point geometries from longitude (x) / latitude (y), store them in a
# new 'geom' column, and drop the raw coordinate columns they replace.
entrance_loc = (
    entrance_loc
    .assign(geom=gpd.points_from_xy(entrance_loc.LONG, entrance_loc.LAT))
    .drop(columns=['LONG', 'LAT'])
)
entrance_loc.head(5)

Unnamed: 0,Train_Station,Street_Name,Street_Type,Entrance_Type,Exit_Number,geom
0,Aberdeen,Macqueen,St,Ramp,0,POINT (150.89196 -32.16689)
1,Aberdeen,Macqueen,St,Stairs,0,POINT (150.89198 -32.1669)
2,Adamstown,Park,Ave,Path,0,POINT (151.72045 -32.93371)
3,Adamstown,Park,Ave,Path,0,POINT (151.72024 -32.93383)
4,Adamstown,St James,Rd,Stairs,0,POINT (151.72036 -32.93341)


### 1.3 Load + clean Train Station Entries Exits data

In [19]:
# Load the monthly station entries/exits CSV and preview the first rows.
# Note that 'Trip' holds text such as "Less than 50" alongside numbers.
entries_exit = pd.read_csv("TrainStationEntriesExits/train-station-entries-exits-data-may-2025.csv")
entries_exit.head(5)

Unnamed: 0,MonthYear,Station,Station_Type,Entry_Exit,Trip
0,Aug-24,Aberdeen Station,train,Entry,Less than 50
1,Aug-24,Aberdeen Station,train,Exit,Less than 50
2,Aug-24,Adamstown Station,train,Entry,2585
3,Aug-24,Adamstown Station,train,Exit,2391
4,Aug-24,Albion Park Station,train,Entry,6919


In [20]:
# Show each column's dtype for the entries/exits table.
entries_exit.dtypes

MonthYear        object
Station          object
Station_Type     object
Entry_Exit       object
Trip             object
dtype: object

In [21]:
# # Convert MonthYear to datetime (disabled)
# # NOTE: the values look like 'Aug-24', so the matching format is '%b-%y';
# # '%Y-%m' combined with errors='coerce' would turn every value into NaT.
# entries_exit['MonthYear'] = pd.to_datetime(entries_exit['MonthYear'], 
#                                        format='%b-%y',     # e.g. 'Aug-24'
#                                        errors='coerce')
# entries_exit.dtypes

In [22]:
# Count missing values in every column of the entries/exits table.
null_val = entries_exit.isnull().sum()
print(f"Missing values per column:\n{null_val}")

Missing values per column:
MonthYear        0
Station          0
Station_Type     0
Entry_Exit       0
Trip             0
dtype: int64


### 1.4 Load + clean Opal Patronage data

In [8]:
import glob

# 1. Find all Opal_Patronage files.
file_pattern = "OpalPatronage/Opal_Patronage_202*.txt"
# glob.glob returns matches in arbitrary order; sorting keeps the row order of
# the combined frame (and therefore the preview below) reproducible.
file_list = sorted(glob.glob(file_pattern))
print(f"Found {len(file_list)} files:")
# for f in file_list:
#     print(" ", f)

# Fail fast, before any reading, when the pattern matched nothing.
if not file_list:
    raise FileNotFoundError(f"No files matched pattern: {file_pattern}")

# 2. Read each pipe-delimited file and trim its empty columns.
dfs = []
for fname in file_list:
    df = pd.read_csv(
        fname,
        sep="|",
        # na_values=["<50"],        # would convert "<50" to NaN; kept as text for now
        parse_dates=["trip_origin_date"]
    )
    # Drop any column that is entirely NaN in this file
    df = df.dropna(axis=1, how="all")
    dfs.append(df)

# 3. Concatenate into one DataFrame with a fresh 0..n-1 index.
full_df = pd.concat(dfs, ignore_index=True)

# 4. Inspect the result
print("\nCombined DataFrame shape:", full_df.shape)
display(full_df.head())


Found 2023 files:

Combined DataFrame shape: (1372294, 6)


Unnamed: 0,trip_origin_date,mode_name,ti_region,tap_hour,Tap_Ons,Tap_Offs
0,2020-01-01,Bus,Chatswood,0,<50,<50
1,2020-01-01,Bus,Macquarie Park,0,<50,<50
2,2020-01-01,Bus,Newcastle and surrounds,0,<50,<50
3,2020-01-01,Bus,North Sydney,0,700,100
4,2020-01-01,Bus,Other,0,4500,3200
