# Loading data to Supabase Posgres Database

We have decided to try and host the database using Supabase as the backend. Hosting this on a Supabase's cloud, we could more easily scale and build applications that can access that database. We will eventually try this with the isochrone project.

In [1]:
import pandas as pd
import os
from supabase import create_client, Client



## Supabase API

We have created a database on Supabase off-code. This was created using my(nabilersyad) account under that organization. The database mirrors the SQLite database.
Details on the created database below.

SUPABASE_URL = 'saved locally in the shell. Get the details from supabase dashboard'  
SUPABASE_KEY = 'saved in the shell. Get the details from supabase dashboard'

database: transit-stations-database  
tables: 1. stations  
        2. entrances  
        3. station_entrances  

In [2]:
#declaring the supabase client we will working with
url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

## Stations table

Combining the various CSVs of different cities and storing them in one table in the database

In [3]:

# Define the directory of cleansed data to transfer to sql
cleansed_data_directory = 'data_cleansed'
cleansed_kl_file = 'klang_valley_stations_cleansed.csv'
cleansed_montreal_file = 'montreal_stations_cleansed.csv'
cleansed_singapore_file = 'singapore_stations_cleansed.csv'
cleansed_combined_file = 'combined_stations_cleansed.csv'

# read cleaned dataframes
kl_data = pd.read_csv(os.path.join(cleansed_data_directory, cleansed_kl_file))
montreal_data = pd.read_csv(os.path.join(cleansed_data_directory, cleansed_montreal_file))
singapore_data = pd.read_csv(os.path.join(cleansed_data_directory, cleansed_singapore_file))


# Combine all the dataframes
stations_data_local = pd.concat([kl_data, montreal_data, singapore_data], axis=0, ignore_index=True)

stations_data_local.index.name = 'station_id'
stations_data_local.to_csv(os.path.join(cleansed_data_directory, cleansed_combined_file), index=True)


# Print out the combined dataframe
stations_data_local


Unnamed: 0_level_0,name,station_code,service_provider_name,latitude,longitude,route_id,route_name,line_number,line_colour,colour_hex_code,region,odonym,namesake,opened
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,KL Sentral,KA01,Keretapi Tanah Melayu,3.134603,101.686567,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
1,Kuala Lumpur,KA02,Keretapi Tanah Melayu,3.139513,101.693789,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
2,Bank Negara,KA03,Keretapi Tanah Melayu,3.154542,101.693010,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
3,Putra,KA04,Keretapi Tanah Melayu,3.165005,101.691234,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
4,Mid Valley,KB01,Keretapi Tanah Melayu,3.118528,101.678985,KB,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,Outram Park MRT Station,TE17,,1.280400,103.840100,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,
532,Maxwell MRT Station,TE18,,1.280600,103.844000,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,
533,Shenton Way MRT Station,TE19,,1.277540,103.850770,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,
534,Marina Bay MRT Station,TE20,,1.275290,103.854810,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,


In [4]:
response = supabase.table('stations').select("*").execute()
data,_ = response
stations_data_supa = pd.DataFrame(data[1])
stations_data_supa.index.name = 'station_id'
stations_data_supa.drop({'station_id'},inplace=True,axis=1)
stations_data_supa

Unnamed: 0_level_0,name,station_code,service_provider_name,latitude,longitude,route_id,route_name,line_number,line_colour,colour_hex_code,region,odonym,namesake,opened
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,KL Sentral,KA01,Keretapi Tanah Melayu,3.134603,101.686567,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
1,Kuala Lumpur,KA02,Keretapi Tanah Melayu,3.139513,101.693789,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
2,Bank Negara,KA03,Keretapi Tanah Melayu,3.154542,101.693010,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
3,Putra,KA04,Keretapi Tanah Melayu,3.165005,101.691234,KA,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
4,Mid Valley,KB01,Keretapi Tanah Melayu,3.118528,101.678985,KB,Seremban Line,1,Blue,#0000FF,Klang Valley,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,Outram Park MRT Station,TE17,,1.280400,103.840100,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,
532,Maxwell MRT Station,TE18,,1.280600,103.844000,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,
533,Shenton Way MRT Station,TE19,,1.277540,103.850770,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,
534,Marina Bay MRT Station,TE20,,1.275290,103.854810,TEL,Thomson-East Coast Line,,Brown,#734538,Singapore,,,


In [18]:
# Delete all rows from the table
data = supabase.table("stations").delete().eq("station_id", 1).execute()



In [17]:
def insert_data_to_supabase(df, chunk_size=1000):
    # Replace NaN with None
    df = df.where(pd.notna(df), None)

    # Splitting the dataframe into smaller chunks for efficient insertion
    chunks = [df[i:i+chunk_size] for i in range(0, df.shape[0], chunk_size)]

    for chunk in chunks:
        rows = chunk.to_dict(orient='records')
        data = supabase.table('stations').insert(rows).execute()
   

# Calling the function to insert the entire DataFrame
insert_data_to_supabase(stations_data_local)


APIError: {'code': '42501', 'details': None, 'hint': None, 'message': 'new row violates row-level security policy for table "stations"'}

In [5]:
# Perform an outer join on the dataframes
merged_data = pd.merge(stations_data_local, stations_data_supa, how='outer', indicator=True)


In [6]:
# Case 1 and 2: rows exist in both dataframes
both_data = merged_data[merged_data['_merge'] == 'both']
both_data = both_data.drop(columns=['_merge'])


In [7]:
both_data[stations_data_local.columns].equals(both_data[stations_data_supa.columns])

True

In [8]:
identical_rows_mask = (both_data[stations_data_local.columns] == both_data[stations_data_supa.columns]).all(axis=1)
identical_rows = both_data[identical_rows_mask]


In [9]:
different_rows = both_data[different_rows_mask]
different_rows

NameError: name 'different_rows_mask' is not defined

In [10]:

# Within these, find rows that are identical and rows that are different
identical_rows = both_data[both_data[stations_data_local.columns].equals(both_data[stations_data_supa.columns])]
different_rows = both_data[~both_data[stations_data_local.columns].equals(both_data[stations_data_supa.columns])]

KeyError: True

In [22]:

# For the different rows, you want to update data_supabase with data from data_local
for index, row in different_rows.iterrows():
    # Use Supabase update method
    # Note: replace 'id' and 'your_table' with your actual id column name and table name
    data, error = supabase.table('your_table').update(row.to_dict()).eq('id', row['id']).execute()

# Case 3: rows in data_local but not in data_supabase
only_local = merged_df[merged_df['_merge'] == 'left_only']

# For these rows, you want to insert into data_supabase
data, error = supabase.table('your_table').insert(only_local.to_dict('records')).execute()

# Case 4: rows not in data_local but in data_supabase
only_supabase = merged_df[merged_df['_merge'] == 'right_only']

# For these rows, you want to delete from data_supabase
for index, row in only_supabase.iterrows():
    # Use Supabase delete method
    data, error = supabase.table('your_table').delete().eq('id', row['id']).execute()


NameError: name 'different_rows' is not defined

In [3]:

# Create a connection to the SQLite database
# Doesn't matter if the database does not yet exist
conn = sqlite3.connect('transit_database.db')  

# Add the data from the combined dataframe to the SQLite table
combined_df.to_sql('stations', conn, if_exists='replace')



536

## Entrances tables

There are two entrances tables that we need to create. We'll call them entrances and station_entrances

In [4]:
cleansed_entrances = 'klang_valley_entrances_cleansed.csv'
cleansed_station_entrances= 'klang_valley_stations_entrances_relation_cleansed.csv'

# read cleaned entrances dataframes
entrances_data = pd.read_csv(os.path.join(cleansed_data_directory, cleansed_entrances))
station_entrances_data = pd.read_csv(os.path.join(cleansed_data_directory, cleansed_station_entrances))


In [5]:

# Add the data from the combined dataframe to the SQLite table
entrances_data.to_sql('entrances', conn, if_exists='replace', index=False)
station_entrances_data.to_sql('station_entrances', conn, if_exists='replace', index=False)


# Commit the changes and close the connection
conn.commit()