Skip to content

Commit

Permalink
Merge pull request #3448 from mathesar-foundation/eff_data_load
Browse files Browse the repository at this point in the history
Efficient data loader
  • Loading branch information
mathemancer committed Feb 26, 2024
2 parents a788a38 + 6239b7c commit e011123
Show file tree
Hide file tree
Showing 21 changed files with 542,294 additions and 9 deletions.
4 changes: 3 additions & 1 deletion demo/install/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
LIBRARY_ONE = os.path.join(RESOURCES, "library_without_checkouts.sql")
LIBRARY_TWO = os.path.join(RESOURCES, "library_add_checkouts.sql")
DEVCON_DATASET = os.path.join(RESOURCES, "devcon_dataset.sql")
MOVIES_SQL_BZ2 = os.path.join(RESOURCES, "movie_collection.sql.bz2")
MOVIES_SQL_TABLES = os.path.join(RESOURCES, "movie_collection_tables.sql")
MOVIES_SQL_FKS = os.path.join(RESOURCES, "movie_collection_fks.sql")
MOVIES_CSV = os.path.join(RESOURCES, 'movies_csv')
ARXIV_SETUP_SQL = os.path.join(RESOURCES, 'arxiv_dataset_setup.sql')
ARXIV_PAPERS_PICKLE = os.path.join(RESOURCES, 'arxiv_papers.pickle')
LIBRARY_MANAGEMENT = 'Library Management'
Expand Down
40 changes: 40 additions & 0 deletions demo/install/dumpcsvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Dump data for all the tables of a provided schema to separate {table_name}.csv files
with header as column names.
Usage: python dumpcsvs.py
"""
import csv

import psycopg
from psycopg import sql

# Connection settings for the database that holds the schema to dump.
DB_NAME = "mathesar"
DB_USER = "mathesar"
DB_PASSWORD = "mathesar"
DB_HOST = "mathesar_dev_db"
# Every table found in this schema is written to its own CSV file.
SCHEMA_NAME = "Movie Collection"

# The connection context manager closes the connection when the dump is done
# (or when any step raises), instead of leaving it open until interpreter exit.
with psycopg.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=5432,
) as conn:
    # Get the names of the tables. Values are passed as bound parameters
    # rather than interpolated into the SQL text. Only base tables are
    # listed: information_schema.tables also reports views, and
    # `COPY <name> TO` cannot read from a view.
    tables = conn.execute(
        "SELECT table_name FROM information_schema.tables"
        " WHERE table_schema = %s AND table_type = 'BASE TABLE'",
        (SCHEMA_NAME,),
    ).fetchall()

    for (table_name,) in tables:
        # ORDER BY ordinal_position makes the header row follow the table's
        # column order, which is the order COPY emits the values in. Without
        # it the header order is unspecified and may not match the data.
        columns = conn.execute(
            "SELECT column_name FROM information_schema.columns"
            " WHERE table_schema = %s AND table_name = %s"
            " ORDER BY ordinal_position",
            (SCHEMA_NAME, table_name),
        ).fetchall()

        # Identifiers cannot be bound parameters; sql.Identifier quotes the
        # schema-qualified name (both parts may contain spaces or quotes).
        copy_query = sql.SQL("COPY {} TO STDOUT").format(
            sql.Identifier(SCHEMA_NAME, table_name)
        )

        # newline="" is what the csv module expects for files it writes to;
        # an explicit encoding keeps the output independent of the locale.
        with open(f"{table_name}.csv", "w", newline="", encoding="utf-8") as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow([column[0] for column in columns])
            # NOTE(review): csv.writer writes None as an empty field, so SQL
            # NULLs and empty strings look the same in the output -- confirm
            # that this is acceptable for the consumer of these files.
            with conn.cursor() as cur, cur.copy(copy_query) as copy:
                csv_writer.writerows(copy.rows())
19 changes: 11 additions & 8 deletions demo/install/movies_dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""This module contains functions to load the Movie Collection dataset."""
import bz2

import os
from sqlalchemy import text

from demo.install.base import MOVIE_COLLECTION, MOVIES_SQL_BZ2
from demo.install.base import MOVIE_COLLECTION, MOVIES_SQL_TABLES, MOVIES_CSV, MOVIES_SQL_FKS


def load_movies_dataset(engine, safe_mode=False):
Expand All @@ -16,11 +15,15 @@ def load_movies_dataset(engine, safe_mode=False):
schema already exists instead of dropping it.
"""
drop_schema_query = text(f"""DROP SCHEMA IF EXISTS "{MOVIE_COLLECTION}" CASCADE;""")
create_schema_query = text(f"""CREATE SCHEMA "{MOVIE_COLLECTION}";""")
set_search_path = text(f"""SET search_path="{MOVIE_COLLECTION}";""")
with engine.begin() as conn, bz2.open(MOVIES_SQL_BZ2, 'rt') as f:
with engine.begin() as conn, open(MOVIES_SQL_TABLES) as f, open(MOVIES_SQL_FKS) as f2:
if safe_mode is False:
conn.execute(drop_schema_query)
conn.execute(create_schema_query)
conn.execute(set_search_path)
conn.execute(text(f.read()))
for file in os.scandir(MOVIES_CSV):
table_name = file.name.split('.csv')[0]
with open(file, 'r') as csv_file:
conn.connection.cursor().copy_expert(
f"""COPY "{MOVIE_COLLECTION}"."{table_name}" FROM STDIN DELIMITER ',' CSV HEADER""",
csv_file
)
conn.execute(text(f2.read()))
Binary file removed demo/install/resources/movie_collection.sql.bz2
Binary file not shown.
105 changes: 105 additions & 0 deletions demo/install/resources/movie_collection_fks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
-- Sequence values, primary keys and foreign keys for the Movie Collection
-- tables.
--
-- Table and sequence names are not schema-qualified, so every statement
-- resolves through the session's search_path.
-- NOTE(review): presumably executed after the tables have been created and
-- their rows loaded (the Python loader reads this file last) -- confirm.
--
-- Statement order matters: every FOREIGN KEY in the last section REFERENCES a
-- table whose PRIMARY KEY is added in the section before it.

-- Sequence values -----------------------------------------------------------
-- One setval per "<table>_id_seq" sequence. The third argument (true) marks
-- the given value as already used, so the next nextval() call on the sequence
-- returns that value + 1.
SELECT pg_catalog.setval('"Departments_id_seq"', 12, true);
SELECT pg_catalog.setval('"Genres_id_seq"', 10770, true);
SELECT pg_catalog.setval('"Jobs_id_seq"', 419, true);
SELECT pg_catalog.setval('"Movie Cast Map_id_seq"', 159012, true);
SELECT pg_catalog.setval('"Movie Crew Map_id_seq"', 130711, true);
SELECT pg_catalog.setval('"Movie Genre Map_id_seq"', 25886, true);
SELECT pg_catalog.setval('"Movie Production Company Map_id_seq"', 19959, true);
SELECT pg_catalog.setval('"Movie Production Country Map_id_seq"', 14087, true);
SELECT pg_catalog.setval('"Movie Spoken Language Map_id_seq"', 14951, true);
SELECT pg_catalog.setval('"Movies_id_seq"', 469172, true);
SELECT pg_catalog.setval('"People_id_seq"', 1908262, true);
SELECT pg_catalog.setval('"Production Companies_id_seq"', 95940, true);
SELECT pg_catalog.setval('"Production Countries_id_seq"', 122, true);
SELECT pg_catalog.setval('"Spoken Languages_id_seq"', 107, true);
SELECT pg_catalog.setval('"Sub-Collections_id_seq"', 479971, true);

-- Primary keys --------------------------------------------------------------
-- Each table gets a "<table>_pkey" constraint on its id column. These must
-- exist before the foreign keys below can reference the tables.
ALTER TABLE ONLY "Departments"
ADD CONSTRAINT "Departments_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Genres"
ADD CONSTRAINT "Genres_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Jobs"
ADD CONSTRAINT "Jobs_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Cast Map"
ADD CONSTRAINT "Movie Cast Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Genre Map"
ADD CONSTRAINT "Movie Genre Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Production Company Map"
ADD CONSTRAINT "Movie Production Company Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Production Country Map"
ADD CONSTRAINT "Movie Production Country Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Spoken Language Map"
ADD CONSTRAINT "Movie Spoken Language Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movies"
ADD CONSTRAINT "Movies_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "People"
ADD CONSTRAINT "People_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Production Companies"
ADD CONSTRAINT "Production Companies_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Production Countries"
ADD CONSTRAINT "Production Countries_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Spoken Languages"
ADD CONSTRAINT "Spoken Languages_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Sub-Collections"
ADD CONSTRAINT "Sub-Collections_pkey" PRIMARY KEY (id);

-- Foreign keys --------------------------------------------------------------
-- Constraints are named "<table>_<column>_fkey". The "... Map" tables link
-- "Movies" to another table; each of their link columns references the id of
-- the table it points at.

-- "Movie Cast Map": "Cast Member" -> "People", "Movie" -> "Movies".
ALTER TABLE ONLY "Movie Cast Map"
ADD CONSTRAINT "Movie Cast Map_Cast Member_fkey" FOREIGN KEY ("Cast Member") REFERENCES "People"(id);

ALTER TABLE ONLY "Movie Cast Map"
ADD CONSTRAINT "Movie Cast Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

-- "Movie Crew Map": "Crew Member" -> "People", "Department" -> "Departments",
-- "Job" -> "Jobs", "Movie" -> "Movies".
ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Crew Member_fkey" FOREIGN KEY ("Crew Member") REFERENCES "People"(id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Department_fkey" FOREIGN KEY ("Department") REFERENCES "Departments"(id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Job_fkey" FOREIGN KEY ("Job") REFERENCES "Jobs"(id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

-- "Movie Genre Map": "Genre" -> "Genres", "Movie" -> "Movies".
ALTER TABLE ONLY "Movie Genre Map"
ADD CONSTRAINT "Movie Genre Map_Genre_fkey" FOREIGN KEY ("Genre") REFERENCES "Genres"(id);

ALTER TABLE ONLY "Movie Genre Map"
ADD CONSTRAINT "Movie Genre Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

-- "Movie Production Company Map": "Movie" -> "Movies",
-- "Production Company" -> "Production Companies".
ALTER TABLE ONLY "Movie Production Company Map"
ADD CONSTRAINT "Movie Production Company Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

ALTER TABLE ONLY "Movie Production Company Map"
ADD CONSTRAINT "Movie Production Company Map_Production Company_fkey" FOREIGN KEY ("Production Company") REFERENCES "Production Companies"(id);

-- "Movie Production Country Map": "Movie" -> "Movies",
-- "Production Country" -> "Production Countries".
ALTER TABLE ONLY "Movie Production Country Map"
ADD CONSTRAINT "Movie Production Country Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

ALTER TABLE ONLY "Movie Production Country Map"
ADD CONSTRAINT "Movie Production Country Map_Production Country_fkey" FOREIGN KEY ("Production Country") REFERENCES "Production Countries"(id);

-- "Movie Spoken Language Map": "Movie" -> "Movies",
-- "Spoken Language" -> "Spoken Languages".
ALTER TABLE ONLY "Movie Spoken Language Map"
ADD CONSTRAINT "Movie Spoken Language Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

ALTER TABLE ONLY "Movie Spoken Language Map"
ADD CONSTRAINT "Movie Spoken Language Map_Spoken Language_fkey" FOREIGN KEY ("Spoken Language") REFERENCES "Spoken Languages"(id);

-- "Movies": "Sub-Collection" -> "Sub-Collections".
ALTER TABLE ONLY "Movies"
ADD CONSTRAINT "Movies_Sub-Collection_fkey" FOREIGN KEY ("Sub-Collection") REFERENCES "Sub-Collections"(id);

0 comments on commit e011123

Please sign in to comment.