Skip to content

Commit

Permalink
Merge pull request #1511 from mabel-dev/#1491
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Mar 7, 2024
2 parents 6cdcd29 + f2122d2 commit 40fbbcf
Show file tree
Hide file tree
Showing 14 changed files with 287 additions and 33 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 344
__build__ = 346

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 1 addition & 0 deletions opteryx/connectors/virtual_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
WELL_KNOWN_DATASETS = {
"$astronauts": (virtual_datasets.astronauts, True),
"$planets": (virtual_datasets.planets, True),
"$missions": (virtual_datasets.missions, True),
"$satellites": (virtual_datasets.satellites, True),
"$variables": (virtual_datasets.variables, True),
"$derived": (virtual_datasets.derived, False),
Expand Down
1 change: 1 addition & 0 deletions opteryx/virtual_datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import opteryx.virtual_datasets.astronaut_data as astronauts
import opteryx.virtual_datasets.derived_data as derived
import opteryx.virtual_datasets.missions as missions
import opteryx.virtual_datasets.no_table_data as no_table
import opteryx.virtual_datasets.planet_data as planets
import opteryx.virtual_datasets.satellite_data as satellites
Expand Down
2 changes: 2 additions & 0 deletions opteryx/virtual_datasets/astronaut_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

from orso.schema import FlatColumn
from orso.schema import RelationSchema
from orso.tools import single_item_cache
from orso.types import OrsoTypes

__all__ = ("read", "schema")


@single_item_cache
def read(*args):
import base64
import io
Expand Down
66 changes: 66 additions & 0 deletions opteryx/virtual_datasets/missions.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions opteryx/virtual_datasets/planet_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,13 @@

from orso.schema import FlatColumn
from orso.schema import RelationSchema
from orso.tools import single_item_cache
from orso.types import OrsoTypes

__all__ = ("read", "schema")


@single_item_cache
def read(end_date=None, *args):
import pyarrow

Expand Down
2 changes: 2 additions & 0 deletions opteryx/virtual_datasets/satellite_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,13 @@

from orso.schema import FlatColumn
from orso.schema import RelationSchema
from orso.tools import single_item_cache
from orso.types import OrsoTypes

__all__ = ("read", "schema")


@single_item_cache
def read(*args):
import base64
import io
Expand Down
Binary file added testdata/sqlite/100000-tweets.db
Binary file not shown.
Binary file modified testdata/sqlite/database.db
Binary file not shown.
62 changes: 62 additions & 0 deletions testdata/sqlite/loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import sqlite3

DB_NAME = "sqlite3_opt.db"


def create_table_statement(name: str, schema) -> str:
yield f"CREATE TABLE IF NOT EXISTS {name} ("
total_columns = len(schema.columns)
for i, column in enumerate(schema.columns, start=1):
# Check if it's the last column
if i == total_columns:
yield f"\t{column.name:<20}\t{column.type.name}"
else:
yield f"\t{column.name:<20}\t{column.type.name},"
yield ");"


def format_value(value):
try:
if value != value: # Check for NaN values which are not equal to themselves
return "NULL"
elif isinstance(value, str):
return "'" + value.replace("'", "''") + "'" # Properly quote strings
except:
pass
return str(value) # Use the string representation for other data types


def creator(name, con: sqlite3.Connection, dataset):

create_sql = "\n".join(create_table_statement(name, dataset.schema))
print(create_sql)
con.execute(create_sql)

con.execute("BEGIN")

for row in dataset:
con.execute(f'INSERT INTO {name} VALUES ({", ".join(format_value(r) for r in row)});')
con.commit()


def main():
conn = sqlite3.connect(DB_NAME, isolation_level=None)
conn.execute("PRAGMA journal_mode = OFF;")
conn.execute("PRAGMA synchronous = 0;")
conn.execute("PRAGMA cache_size = 1000000;") # give it a GB
conn.execute("PRAGMA locking_mode = EXCLUSIVE;")
conn.execute("PRAGMA temp_store = MEMORY;")

import opteryx

# dataset = opteryx.query("SELECT tweet_id, text, timestamp, user_id, user_verified, user_name, followers, following, tweets_by_user FROM testdata.flat.formats.parquet;")
# creator("tweets", conn, dataset)
# dataset = opteryx.query("SELECT * FROM $planets;")
# creator("planets", conn, dataset)

print(conn.execute("SELECT * FROM tweets").fetchall())
# print(conn.execute("SELECT * FROM planets").fetchall())


if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions tests/fuzzing/test_sql_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ def generate_random_sql_select(columns, table):
"name": virtual_datasets.astronauts.schema().name,
"fields": virtual_datasets.astronauts.schema().columns,
},
{
"name": virtual_datasets.missions.schema().name,
"fields": virtual_datasets.missions.schema().columns,
},
]

TEST_CYCLES: int = 250
Expand Down
3 changes: 3 additions & 0 deletions tests/sql_battery/test_shapes_and_errors_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
("SELECT * FROM $no_table", 1, 1, None),
("SELECT * FROM sqlite.planets", 9, 20, None),
("SELECT * FROM $variables", 41, 4, None),
("SELECT * FROM $missions", 4630, 8, None),
(b"SELECT * FROM $satellites", 177, 8, None),

# Does the error tester work
Expand Down Expand Up @@ -1180,6 +1181,8 @@
# 10-way join
("SELECT p1.name AS planet1_name, p2.name AS planet2_name, p3.name AS planet3_name, p4.name AS planet4_name, p5.name AS planet5_name, p6.name AS planet6_name, p7.name AS planet7_name, p8.name AS planet8_name, p9.name AS planet9_name, p10.name AS planet10_name, p1.diameter AS planet1_diameter, p2.gravity AS planet2_gravity, p3.orbitalPeriod AS planet3_orbitalPeriod, p4.numberOfMoons AS planet4_numberOfMoons, p5.meanTemperature AS planet5_meanTemperature FROM $planets p1 JOIN $planets p2 ON p1.id = p2.id JOIN $planets p3 ON p1.id = p3.id JOIN $planets p4 ON p1.id = p4.id JOIN $planets p5 ON p1.id = p5.id JOIN $planets p6 ON p1.id = p6.id JOIN $planets p7 ON p1.id = p7.id JOIN $planets p8 ON p1.id = p8.id JOIN $planets p9 ON p1.id = p9.id JOIN $planets p10 ON p1.id = p10.id WHERE p1.diameter > 10000 ORDER BY p1.name, p2.name, p3.name, p4.name, p5.name;", 6, 15, None),

("SELECT mission, LIST(name) FROM $missions INNER JOIN (SELECT * FROM $astronauts CROSS JOIN UNNEST(missions) AS mission) AS astronauts ON Mission = mission GROUP BY mission", 16, 2, None),

# virtual dataset doesn't exist
("SELECT * FROM $RomanGods", None, None, DatasetNotFoundError),
# disk dataset doesn't exist
Expand Down
14 changes: 14 additions & 0 deletions tests/storage/test_sql_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import os
import sys
import time

sys.path.insert(1, os.path.join(sys.path[0], "../.."))

Expand All @@ -25,37 +26,50 @@ def test_postgres_storage():
connection=f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@trumpet.db.elephantsql.com/{POSTGRES_USER}",
)

t = time.monotonic_ns()
results = opteryx.query("SELECT * FROM pg.planets")
assert results.rowcount == 9, results.rowcount
assert results.columncount == 20
print((time.monotonic_ns() - t) / 1e9)
t = time.monotonic_ns()

# PROCESS THE DATA IN SOME WAY
results = opteryx.query("SELECT COUNT(*) FROM pg.planets;")
assert results.rowcount == 1, results.rowcount
assert results.columncount == 1
print((time.monotonic_ns() - t) / 1e9)
t = time.monotonic_ns()

# PUSH A PROJECTION
results = opteryx.query("SELECT name FROM pg.planets;")
assert results.rowcount == 9, results.rowcount
assert results.columncount == 1
print((time.monotonic_ns() - t) / 1e9)
t = time.monotonic_ns()

# JOIN ON A NON SQL TABLE
results = opteryx.query(
"SELECT * FROM pg.planets INNER JOIN $satellites ON pg.planets.id = $satellites.planetId;"
)
assert results.rowcount == 177, results.rowcount
assert results.columncount == 28, results.columncount
print((time.monotonic_ns() - t) / 1e9)
t = time.monotonic_ns()

# PUSH - CHECK STATS THE PUSHES WORKED
results = opteryx.query("SELECT name FROM pg.planets WHERE name LIKE 'Earth';")
assert results.rowcount == 1, results.rowcount
assert results.columncount == 1
assert results.stats["rows_read"] == 1
assert results.stats["columns_read"] == 1
print((time.monotonic_ns() - t) / 1e9)
t = time.monotonic_ns()

results = opteryx.query("SELECT * FROM pg.planets WHERE id > gravity")
assert results.rowcount == 2, results.rowcount
assert results.stats.get("rows_read", 0) == 9, results.stats
print((time.monotonic_ns() - t) / 1e9)
t = time.monotonic_ns()


if __name__ == "__main__": # pragma: no cover
Expand Down
Loading

0 comments on commit 40fbbcf

Please sign in to comment.