
Added pandas and JSON normalization code #2968

Merged
merged 13 commits on Jul 6, 2023
46 changes: 45 additions & 1 deletion db/records/operations/insert.py
@@ -1,4 +1,5 @@
import json
import pandas
import tempfile

from psycopg2 import sql
@@ -32,9 +33,52 @@ def insert_record_or_records(table, engine, record_data):
    return None


-def insert_records_from_json(table, engine, json_filepath):
+def insert_records_from_json(table, engine, json_filepath, column_names):
Contributor

Add a docstring. Include explanation for why column_names is necessary. Include explanation for why and how json_normalize is being invoked (what's max_level and why is it 0). Include explanation of the general algorithm and intent.

Contributor Author

Done. Thanks!

Contributor

Include a summary and explanation of the algorithm. You normalize into a dataframe, then convert to a JSON string, then to Python, ending up with a sequence of rows where each row is dict-like and every key-value pair is a column; then you take those values and, if they are a dict or a list, serialize them back into JSON. That takes some time to figure out, and the reason for having to do some of these things is not necessarily immediately obvious. Help the reader out with a summary and an explanation.

Contributor

Presume that the reader is not me or anyone that you've discussed JSON importing with.

Contributor Author

Hi Dom, I have added the explanation now. PTAL. Thanks!

Contributor

Awesome!

"""
Normalizes JSON data and inserts it into a table.

Args:
table: Table. The table to insert JSON data into.
engine: MockConnection. The SQLAlchemy engine.
json_filepath: str. The path to the stored JSON data file.
column_names: List[str]. List of column names.

Algorithm:
1. We convert JSON data into Python object using json.load().
2. We normalize data into a pandas dataframe using pandas.json_normalize() method.
The method takes column names as meta. We provide all possible keys as column
names, hence it adds missing keys to JSON objects and marks their values as NaN.
3. We convert the dataframe to JSON using to_json() method and then to a Python object.
This method replaces 'NaN' values in the dataframe with 'None' values in Python
object. The reason behind not using df.to_dict() method is beacuse it stringifies
'NaN' values rather than converting them to a 'None' value.
4. The processed data is now a list of dict objects. Each dict has same keys, that are
the column names of the table. We loop through each dict object, and if any value is
a dict or a list, we stringify them before inserting them into the table. This way,
our type inference logic kicks in later on converting them into
'MathesarCustomType.MATHESAR_JSON_OBJECT' and 'MathesarCustomType.MATHESAR_JSON_ARRAY'
respectively.
5. We pass data (a list of dicts) to 'insert_record_or_records()' method which inserts
them into the table.
"""

    with open(json_filepath, 'r') as json_file:
        data = json.load(json_file)

"""
data: JSON object. The data we want to normalize.
max_level: int. Max number of levels(depth of dict) to normalize.
If None, normalizes all levels. Normalizing a dict involes flattening it,
a behaviour we want to avoid since we have JSON dict as one of the data types.
Hence, max_level is kept 0.
meta: Fields to use as metadata for each record in resulting table. Without meta,
the method chooses keys from the first JSON object it encounters as column names.
We provide column names as meta, because we want all possible keys as columns in
our table and not just the keys from the first JSON object.
"""
    df = pandas.json_normalize(data, max_level=0, meta=column_names)
    data = json.loads(df.to_json(orient='records'))

    for i, row in enumerate(data):
        data[i] = {
            k: json.dumps(v)
            if isinstance(v, (dict, list))
            else v
            for k, v in row.items()
        }
    # (The diff is truncated on this page; the comprehension's tail and the call
    # below are reconstructed from the docstring above: dict and list values are
    # stringified, then the rows are handed to insert_record_or_records().)
    insert_record_or_records(table, engine, data)
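
To make the algorithm concrete, here is a minimal, self-contained sketch of the same pipeline on made-up data (the sample records and the printed output are illustrative only, not part of this PR):

import json
import pandas

records = [
    {"name": "Matt", "address": {"city": "NY"}},
    {"name": "John", "email": "jd@example.org"},
]

# max_level=0 keeps nested dicts intact as single values instead of
# flattening them into dotted columns; keys missing from an object are
# filled with NaN.
df = pandas.json_normalize(records, max_level=0, meta=["name", "address", "email"])

# Round-tripping through to_json() turns NaN into JSON null, which
# json.loads() reads back as None (df.to_dict() would keep NaN instead).
rows = json.loads(df.to_json(orient='records'))

# Stringify dict/list values so type inference can later map them to the
# Mathesar JSON column types.
rows = [
    {k: json.dumps(v) if isinstance(v, (dict, list)) else v for k, v in row.items()}
    for row in rows
]
print(rows)
# [{'name': 'Matt', 'address': '{"city": "NY"}', 'email': None},
#  {'name': 'John', 'address': None, 'email': 'jd@example.org'}]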
32 changes: 32 additions & 0 deletions mathesar/api/exceptions/database_exceptions/exceptions.py
@@ -463,6 +463,38 @@ def __init__(
        super().__init__(exception, self.error_code, message, field, details, status_code)


class InvalidJSONFormat(MathesarAPIException):
    error_code = ErrorCodes.InvalidJSONFormat.value

    def __init__(
            self,
            exception=None,
            message='Invalid JSON file.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        if exception is None:
            exception = Exception(message)
        super().__init__(exception, self.error_code, message, field, details, status_code)


class UnsupportedJSONFormat(MathesarAPIException):
    error_code = ErrorCodes.UnsupportedJSONFormat.value

    def __init__(
            self,
            exception=None,
            message='This JSON format is not supported.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        if exception is None:
            exception = Exception(message)
        super().__init__(exception, self.error_code, message, field, details, status_code)


class DynamicDefaultModificationError(Exception):
    def __init__(self, column=None):
        self.column = column
2 changes: 2 additions & 0 deletions mathesar/api/exceptions/error_codes.py
@@ -63,3 +63,5 @@ class ErrorCodes(Enum):
    DuplicateUIQueryInSchema = 4422
    IdentifierTooLong = 4423
    DynamicDefaultAlterationToStaticDefault = 4424
    InvalidJSONFormat = 4425
    UnsupportedJSONFormat = 4426
31 changes: 28 additions & 3 deletions mathesar/imports/json.py
@@ -1,24 +1,48 @@
import json
from json.decoder import JSONDecodeError

from db.tables.operations.alter import update_pk_sequence_to_latest
from mathesar.database.base import create_mathesar_engine
from db.records.operations.insert import insert_records_from_json
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
from mathesar.imports.utils import process_column_names
from db.constants import ID, ID_ORIGINAL
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection


def validate_json_format(data_file_content):
    try:
        data = json.load(data_file_content)
    except (JSONDecodeError, ValueError) as e:
        raise database_api_exceptions.InvalidJSONFormat(e)

    is_list_of_dicts = isinstance(data, list) and all(isinstance(val, dict) for val in data)
    if is_list_of_dicts:
        return
    if isinstance(data, dict):
        return
    raise database_api_exceptions.UnsupportedJSONFormat()
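
For illustration, a quick sketch of which shapes this validation accepts (a hypothetical snippet; io.StringIO stands in for the uploaded file object):

import io

validate_json_format(io.StringIO('[{"a": 1}, {"b": 2}]'))  # list of dicts: accepted
validate_json_format(io.StringIO('{"a": 1}'))              # single dict: accepted
validate_json_format(io.StringIO('[1, 2, 3]'))             # raises UnsupportedJSONFormat
validate_json_format(io.StringIO('not json'))              # raises InvalidJSONFormat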


def get_column_names_from_json(data_file):
    with open(data_file, 'r') as f:
        data = json.load(f)

    if isinstance(data, list):
-        return list(data[0].keys())
-    return list(data.keys())
+        all_keys = []
+        for obj in data:
+            for key in obj.keys():
+                if key not in all_keys:
+                    all_keys.append(key)
+        return all_keys
+    else:
+        return list(data.keys())
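
For example, run against the missing_keys.json fixture added below in this PR, the key union preserves first-seen order (a hypothetical call; the expected output is inferred from the fixture):

column_names = get_column_names_from_json(
    'mathesar/tests/data/json_parsing/missing_keys.json'
)
print(column_names)
# ['first_name', 'last_name', 'gender', 'friends', 'address', 'email']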


def insert_data_from_json_data_file(name, schema, column_names, engine, comment, json_filepath):
@@ -32,7 +56,8 @@ def insert_data_from_json_data_file(name, schema, column_names, engine, comment,
    insert_records_from_json(
        table,
        engine,
-        json_filepath
+        json_filepath,
+        column_names
    )
    return table

29 changes: 29 additions & 0 deletions mathesar/tests/data/json_parsing/missing_keys.json
@@ -0,0 +1,29 @@
[
    {
        "first_name":"Matt",
        "last_name":"Murdock",
        "gender":"Male",
        "friends": ["Stick", "Foggy"],
        "address": {
            "street": "210",
            "city": "NY"
        }
    },
    {
        "first_name":"John",
        "last_name":"Doe",
        "email":"jd@example.org",
        "gender":"Male",
        "friends": ["Mark", "Bill"]
    },
    {
        "first_name":"Frank",
        "last_name":"Castle",
        "email":"fc@example.org",
        "address": {
            "street": "211",
            "city": "NY"
        }
    }
]
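
Imported through the union-of-keys and normalization logic above, this fixture should produce a six-column table along these lines (an illustrative sketch, not output from an actual test; NULL marks keys absent from a given object):

first_name | last_name | gender | friends            | address                         | email
Matt       | Murdock   | Male   | ["Stick", "Foggy"] | {"street": "210", "city": "NY"} | NULL
John       | Doe       | Male   | ["Mark", "Bill"]   | NULL                            | jd@example.org
Frank      | Castle    | NULL   | NULL               | {"street": "211", "city": "NY"} | fc@example.org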

3 changes: 3 additions & 0 deletions mathesar/utils/datafiles.py
@@ -8,6 +8,7 @@

from mathesar.errors import URLDownloadError
from mathesar.imports.csv import get_sv_dialect, get_file_encoding
from mathesar.imports.json import validate_json_format
from mathesar.models.base import DataFile


@@ -56,6 +57,8 @@ def create_datafile(data):

    encoding = get_file_encoding(raw_file.file)
    text_file = TextIOWrapper(raw_file.file, encoding=encoding)
    if type == 'json':
        validate_json_format(raw_file)
    if type == 'csv' or type == 'tsv':
        dialect = get_sv_dialect(text_file)
        datafile = DataFile(
3 changes: 2 additions & 1 deletion requirements.txt
@@ -26,4 +26,5 @@ thefuzz==0.19.0
whitenoise==6.4.0
git+https://github.com/centerofci/sqlalchemy-filters@models_to_tables#egg=sqlalchemy_filters
gunicorn==20.1.0
-drf-spectacular==0.26.2
+drf-spectacular==0.26.2
+pandas==2.0.2