Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Efficient data loader #3448

Merged
merged 7 commits into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion demo/install/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
LIBRARY_ONE = os.path.join(RESOURCES, "library_without_checkouts.sql")
LIBRARY_TWO = os.path.join(RESOURCES, "library_add_checkouts.sql")
DEVCON_DATASET = os.path.join(RESOURCES, "devcon_dataset.sql")
MOVIES_SQL_BZ2 = os.path.join(RESOURCES, "movie_collection.sql.bz2")
MOVIES_SQL_TABLES = os.path.join(RESOURCES, "movie_collection_tables.sql")
MOVIES_SQL_FKS = os.path.join(RESOURCES, "movie_collection_fks.sql")
MOVIES_CSV = os.path.join(RESOURCES, 'movies_csv')
ARXIV_SETUP_SQL = os.path.join(RESOURCES, 'arxiv_dataset_setup.sql')
ARXIV_PAPERS_PICKLE = os.path.join(RESOURCES, 'arxiv_papers.pickle')
LIBRARY_MANAGEMENT = 'Library Management'
Expand Down
40 changes: 40 additions & 0 deletions demo/install/dumpcsvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Dump every base table of one schema to its own ``{table_name}.csv`` file.

Each file is written to the current working directory and starts with a
header row holding the table's column names, followed by one row per record.
The connection settings and the schema to dump are the module-level
constants below.

Usage: python dumpcsvs.py
"""
import csv

import psycopg
from psycopg import sql

DB_NAME = "mathesar"
DB_USER = "mathesar"
DB_PASSWORD = "mathesar"
DB_HOST = "mathesar_dev_db"
SCHEMA_NAME = "Movie Collection"

# Only base tables are listed: information_schema.tables also reports views,
# and COPY <view> TO STDOUT is rejected by PostgreSQL.
TABLES_QUERY = """
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = %s AND table_type = 'BASE TABLE'
"""

# ORDER BY ordinal_position keeps the header in the same order as the columns
# that COPY emits; without it the row order of this query is unspecified.
COLUMNS_QUERY = """
    SELECT column_name
    FROM information_schema.columns
    WHERE table_schema = %s AND table_name = %s
    ORDER BY ordinal_position
"""

# The connection is used as a context manager so it is closed even when a
# query or a file write fails part-way through.
with psycopg.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD,
    host=DB_HOST,
    port=5432
) as conn:
    # get names of tables.
    tables = conn.execute(TABLES_QUERY, (SCHEMA_NAME,)).fetchall()

    for (table_name,) in tables:
        columns = [
            row[0]
            for row in conn.execute(
                COLUMNS_QUERY, (SCHEMA_NAME, table_name)
            ).fetchall()
        ]
        # Identifiers cannot be bound as parameters, so let psycopg quote the
        # schema-qualified table name instead of interpolating it by hand.
        copy_query = sql.SQL("COPY {} TO STDOUT").format(
            sql.Identifier(SCHEMA_NAME, table_name)
        )
        # newline="" is required by the csv module; the encoding is pinned so
        # the output does not depend on the platform's default.
        with open(
            f'{table_name}.csv', 'w', newline="", encoding="utf-8"
        ) as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(columns)
            with conn.cursor() as cursor:
                with cursor.copy(copy_query) as copy:
                    csv_writer.writerows(copy.rows())
19 changes: 11 additions & 8 deletions demo/install/movies_dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
"""This module contains functions to load the Movie Collection dataset."""
import bz2

import os
from sqlalchemy import text

from demo.install.base import MOVIE_COLLECTION, MOVIES_SQL_BZ2
from demo.install.base import MOVIE_COLLECTION, MOVIES_SQL_TABLES, MOVIES_CSV, MOVIES_SQL_FKS


def load_movies_dataset(engine, safe_mode=False):
Expand All @@ -16,11 +15,15 @@ def load_movies_dataset(engine, safe_mode=False):
schema already exists instead of dropping it.
"""
drop_schema_query = text(f"""DROP SCHEMA IF EXISTS "{MOVIE_COLLECTION}" CASCADE;""")
create_schema_query = text(f"""CREATE SCHEMA "{MOVIE_COLLECTION}";""")
set_search_path = text(f"""SET search_path="{MOVIE_COLLECTION}";""")
with engine.begin() as conn, bz2.open(MOVIES_SQL_BZ2, 'rt') as f:
with engine.begin() as conn, open(MOVIES_SQL_TABLES) as f, open(MOVIES_SQL_FKS) as f2:
if safe_mode is False:
conn.execute(drop_schema_query)
conn.execute(create_schema_query)
conn.execute(set_search_path)
conn.execute(text(f.read()))
for file in os.scandir(MOVIES_CSV):
table_name = file.name.split('.csv')[0]
with open(file, 'r') as csv_file:
conn.connection.cursor().copy_expert(
f"""COPY "{MOVIE_COLLECTION}"."{table_name}" FROM STDIN DELIMITER ',' CSV HEADER""",
csv_file
)
conn.execute(text(f2.read()))
Binary file removed demo/install/resources/movie_collection.sql.bz2
Binary file not shown.
105 changes: 105 additions & 0 deletions demo/install/resources/movie_collection_fks.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
-- Reset the id sequence of each table to a fixed value.
-- The third argument (is_called = true) marks the given value as already
-- used, so the next nextval() on the sequence returns that value + 1.
-- Sequence names are unqualified, so they are resolved through the
-- session's current search_path.
SELECT pg_catalog.setval('"Departments_id_seq"', 12, true);
SELECT pg_catalog.setval('"Genres_id_seq"', 10770, true);
SELECT pg_catalog.setval('"Jobs_id_seq"', 419, true);
SELECT pg_catalog.setval('"Movie Cast Map_id_seq"', 159012, true);
SELECT pg_catalog.setval('"Movie Crew Map_id_seq"', 130711, true);
SELECT pg_catalog.setval('"Movie Genre Map_id_seq"', 25886, true);
SELECT pg_catalog.setval('"Movie Production Company Map_id_seq"', 19959, true);
SELECT pg_catalog.setval('"Movie Production Country Map_id_seq"', 14087, true);
SELECT pg_catalog.setval('"Movie Spoken Language Map_id_seq"', 14951, true);
SELECT pg_catalog.setval('"Movies_id_seq"', 469172, true);
SELECT pg_catalog.setval('"People_id_seq"', 1908262, true);
SELECT pg_catalog.setval('"Production Companies_id_seq"', 95940, true);
SELECT pg_catalog.setval('"Production Countries_id_seq"', 122, true);
SELECT pg_catalog.setval('"Spoken Languages_id_seq"', 107, true);
SELECT pg_catalog.setval('"Sub-Collections_id_seq"', 479971, true);

-- Primary keys: one "<table>_pkey" constraint on the id column of each of
-- the 15 tables. ONLY applies the change to the named table itself and not
-- to any tables inheriting from it. Table names are unqualified, so they are
-- resolved through the session's current search_path.
-- These constraints must exist before a foreign key can reference the
-- corresponding id column.
ALTER TABLE ONLY "Departments"
ADD CONSTRAINT "Departments_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Genres"
ADD CONSTRAINT "Genres_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Jobs"
ADD CONSTRAINT "Jobs_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Cast Map"
ADD CONSTRAINT "Movie Cast Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Genre Map"
ADD CONSTRAINT "Movie Genre Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Production Company Map"
ADD CONSTRAINT "Movie Production Company Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Production Country Map"
ADD CONSTRAINT "Movie Production Country Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movie Spoken Language Map"
ADD CONSTRAINT "Movie Spoken Language Map_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Movies"
ADD CONSTRAINT "Movies_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "People"
ADD CONSTRAINT "People_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Production Companies"
ADD CONSTRAINT "Production Companies_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Production Countries"
ADD CONSTRAINT "Production Countries_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Spoken Languages"
ADD CONSTRAINT "Spoken Languages_pkey" PRIMARY KEY (id);

ALTER TABLE ONLY "Sub-Collections"
ADD CONSTRAINT "Sub-Collections_pkey" PRIMARY KEY (id);

-- Foreign keys: each constraint is named "<table>_<column>_fkey" and points
-- the named column at the id column of the referenced table. Adding a
-- foreign key validates the rows already present in the table.

-- "Movie Cast Map" links "People" (as cast members) to "Movies".
ALTER TABLE ONLY "Movie Cast Map"
ADD CONSTRAINT "Movie Cast Map_Cast Member_fkey" FOREIGN KEY ("Cast Member") REFERENCES "People"(id);

ALTER TABLE ONLY "Movie Cast Map"
ADD CONSTRAINT "Movie Cast Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

-- "Movie Crew Map" links "People" (as crew members), "Departments" and
-- "Jobs" to "Movies".
ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Crew Member_fkey" FOREIGN KEY ("Crew Member") REFERENCES "People"(id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Department_fkey" FOREIGN KEY ("Department") REFERENCES "Departments"(id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Job_fkey" FOREIGN KEY ("Job") REFERENCES "Jobs"(id);

ALTER TABLE ONLY "Movie Crew Map"
ADD CONSTRAINT "Movie Crew Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

-- "Movie Genre Map" links "Genres" to "Movies".
ALTER TABLE ONLY "Movie Genre Map"
ADD CONSTRAINT "Movie Genre Map_Genre_fkey" FOREIGN KEY ("Genre") REFERENCES "Genres"(id);

ALTER TABLE ONLY "Movie Genre Map"
ADD CONSTRAINT "Movie Genre Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

-- "Movie Production Company Map" links "Production Companies" to "Movies".
ALTER TABLE ONLY "Movie Production Company Map"
ADD CONSTRAINT "Movie Production Company Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

ALTER TABLE ONLY "Movie Production Company Map"
ADD CONSTRAINT "Movie Production Company Map_Production Company_fkey" FOREIGN KEY ("Production Company") REFERENCES "Production Companies"(id);

-- "Movie Production Country Map" links "Production Countries" to "Movies".
ALTER TABLE ONLY "Movie Production Country Map"
ADD CONSTRAINT "Movie Production Country Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

ALTER TABLE ONLY "Movie Production Country Map"
ADD CONSTRAINT "Movie Production Country Map_Production Country_fkey" FOREIGN KEY ("Production Country") REFERENCES "Production Countries"(id);

-- "Movie Spoken Language Map" links "Spoken Languages" to "Movies".
ALTER TABLE ONLY "Movie Spoken Language Map"
ADD CONSTRAINT "Movie Spoken Language Map_Movie_fkey" FOREIGN KEY ("Movie") REFERENCES "Movies"(id);

ALTER TABLE ONLY "Movie Spoken Language Map"
ADD CONSTRAINT "Movie Spoken Language Map_Spoken Language_fkey" FOREIGN KEY ("Spoken Language") REFERENCES "Spoken Languages"(id);

-- "Movies" itself references "Sub-Collections" through its "Sub-Collection" column.
ALTER TABLE ONLY "Movies"
ADD CONSTRAINT "Movies_Sub-Collection_fkey" FOREIGN KEY ("Sub-Collection") REFERENCES "Sub-Collections"(id);