
Added pandas and JSON normalization code #2968

Merged
merged 13 commits on Jul 6, 2023
46 changes: 45 additions & 1 deletion db/records/operations/insert.py
@@ -1,4 +1,5 @@
import json
import pandas
import tempfile

from psycopg2 import sql
@@ -32,9 +33,52 @@ def insert_record_or_records(table, engine, record_data):
    return None


-def insert_records_from_json(table, engine, json_filepath):
+def insert_records_from_json(table, engine, json_filepath, column_names):
Contributor

Add a docstring. Include explanation for why column_names is necessary. Include explanation for why and how json_normalize is being invoked (what's max_level and why is it 0). Include explanation of the general algorithm and intent.

Contributor Author

Done. Thanks!

Contributor

Include a summary and explanation of the algorithm. You normalize into a dataframe, then convert to a JSON string, then to Python, ending up with a sequence of rows where each row is dict-like and every key-value pair is a column; then you take those values and, if they are a dict or a list, serialize them back into JSON. That takes some time to figure out, and the reason for having to do some of these things is not necessarily immediately obvious. Help the reader out with a summary and an explanation.

Contributor

Presume that the reader is not me or anyone that you've discussed JSON importing with.

Contributor Author

Hi Dom, I have added the explanation now. PTAL. Thanks!

Contributor

Awesome!

"""
Normalizes JSON data and inserts it into a table.

Args:
table: Table. The table to insert JSON data into.
engine: MockConnection. The SQLAlchemy engine.
json_filepath: str. The path to the stored JSON data file.
column_names: List[str]. List of column names.

Algorithm:
1. We convert JSON data into Python object using json.load().
2. We normalize data into a pandas dataframe using pandas.json_normalize() method.
The method takes column names as meta. We provide all possible keys as column
names, hence it adds missing keys to JSON objects and marks their values as NaN.
3. We convert the dataframe to JSON using to_json() method and then to a Python object.
This method replaces 'NaN' values in the dataframe with 'None' values in Python
object. The reason behind not using df.to_dict() method is beacuse it stringifies
'NaN' values rather than converting them to a 'None' value.
4. The processed data is now a list of dict objects. Each dict has same keys, that are
the column names of the table. We loop through each dict object, and if any value is
a dict or a list, we stringify them before inserting them into the table. This way,
our type inference logic kicks in later on converting them into
'MathesarCustomType.MATHESAR_JSON_OBJECT' and 'MathesarCustomType.MATHESAR_JSON_ARRAY'
respectively.
5. We pass data (a list of dicts) to 'insert_record_or_records()' method which inserts
them into the table.
"""

    with open(json_filepath, 'r') as json_file:
        data = json.load(json_file)

"""
data: JSON object. The data we want to normalize.
max_level: int. Max number of levels(depth of dict) to normalize.
If None, normalizes all levels. Normalizing a dict involes flattening it,
a behaviour we want to avoid since we have JSON dict as one of the data types.
Hence, max_level is kept 0.
meta: Fields to use as metadata for each record in resulting table. Without meta,
the method chooses keys from the first JSON object it encounters as column names.
We provide column names as meta, because we want all possible keys as columns in
our table and not just the keys from the first JSON object.
"""
    df = pandas.json_normalize(data, max_level=0, meta=column_names)
    data = json.loads(df.to_json(orient='records'))

    for i, row in enumerate(data):
        data[i] = {
            k: json.dumps(v)
            if isinstance(v, (dict, list))
            else v
            for k, v in row.items()
        }
    # (The diff is truncated on this page; the comprehension's tail and the call
    # below are reconstructed from the docstring above: dict and list values are
    # stringified, then the rows are handed to insert_record_or_records().)
    insert_record_or_records(table, engine, data)
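
To make the algorithm concrete, here is a minimal, self-contained sketch of the same pipeline on made-up data (the sample records and the printed output are illustrative only, not part of this PR):

import json
import pandas

records = [
    {"name": "Matt", "address": {"city": "NY"}},
    {"name": "John", "email": "jd@example.org"},
]

# max_level=0 keeps nested dicts intact as single values instead of
# flattening them into dotted columns; keys missing from an object are
# filled with NaN.
df = pandas.json_normalize(records, max_level=0, meta=["name", "address", "email"])

# Round-tripping through to_json() turns NaN into JSON null, which
# json.loads() reads back as None (df.to_dict() would keep NaN instead).
rows = json.loads(df.to_json(orient='records'))

# Stringify dict/list values so type inference can later map them to the
# Mathesar JSON column types.
rows = [
    {k: json.dumps(v) if isinstance(v, (dict, list)) else v for k, v in row.items()}
    for row in rows
]
print(rows)
# [{'name': 'Matt', 'address': '{"city": "NY"}', 'email': None},
#  {'name': 'John', 'address': None, 'email': 'jd@example.org'}]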
32 changes: 32 additions & 0 deletions mathesar/api/exceptions/database_exceptions/exceptions.py
@@ -463,6 +463,38 @@ def __init__(
        super().__init__(exception, self.error_code, message, field, details, status_code)


class InvalidJSONFormat(MathesarAPIException):
    error_code = ErrorCodes.InvalidJSONFormat.value

    def __init__(
            self,
            exception=None,
            message='Invalid JSON file.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        if exception is None:
            exception = Exception(message)
        super().__init__(exception, self.error_code, message, field, details, status_code)


class UnsupportedJSONFormat(MathesarAPIException):
    error_code = ErrorCodes.UnsupportedJSONFormat.value

    def __init__(
            self,
            exception=None,
            message='This JSON format is not supported.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        if exception is None:
            exception = Exception(message)
        super().__init__(exception, self.error_code, message, field, details, status_code)


class DynamicDefaultModificationError(Exception):
    def __init__(self, column=None):
        self.column = column
2 changes: 2 additions & 0 deletions mathesar/api/exceptions/error_codes.py
@@ -63,3 +63,5 @@ class ErrorCodes(Enum):
    DuplicateUIQueryInSchema = 4422
    IdentifierTooLong = 4423
    DynamicDefaultAlterationToStaticDefault = 4424
    InvalidJSONFormat = 4425
    UnsupportedJSONFormat = 4426
31 changes: 28 additions & 3 deletions mathesar/imports/json.py
@@ -1,24 +1,48 @@
import json
from json.decoder import JSONDecodeError

from db.tables.operations.alter import update_pk_sequence_to_latest
from mathesar.database.base import create_mathesar_engine
from db.records.operations.insert import insert_records_from_json
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
from mathesar.imports.utils import process_column_names
from db.constants import ID, ID_ORIGINAL
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection


def validate_json_format(data_file_content):
    try:
        data = json.load(data_file_content)
    except (JSONDecodeError, ValueError) as e:
        raise database_api_exceptions.InvalidJSONFormat(e)

    is_list_of_dicts = isinstance(data, list) and all(isinstance(val, dict) for val in data)
    if is_list_of_dicts:
        return
    if isinstance(data, dict):
        return
    raise database_api_exceptions.UnsupportedJSONFormat()
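
For illustration, a quick sketch of which shapes this validation accepts (a hypothetical snippet; io.StringIO stands in for the uploaded file object):

import io

validate_json_format(io.StringIO('[{"a": 1}, {"b": 2}]'))  # list of dicts: accepted
validate_json_format(io.StringIO('{"a": 1}'))              # single dict: accepted
validate_json_format(io.StringIO('[1, 2, 3]'))             # raises UnsupportedJSONFormat
validate_json_format(io.StringIO('not json'))              # raises InvalidJSONFormat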


def get_column_names_from_json(data_file):
    with open(data_file, 'r') as f:
        data = json.load(f)

    if isinstance(data, list):
-        return list(data[0].keys())
-    return list(data.keys())
+        all_keys = []
+        for obj in data:
+            for key in obj.keys():
+                if key not in all_keys:
+                    all_keys.append(key)
+        return all_keys
+    else:
+        return list(data.keys())
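
For example, run against the missing_keys.json fixture added below in this PR, the key union preserves first-seen order (a hypothetical call; the expected output is inferred from the fixture):

column_names = get_column_names_from_json(
    'mathesar/tests/data/json_parsing/missing_keys.json'
)
print(column_names)
# ['first_name', 'last_name', 'gender', 'friends', 'address', 'email']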


def insert_data_from_json_data_file(name, schema, column_names, engine, comment, json_filepath):
@@ -32,7 +56,8 @@ def insert_data_from_json_data_file(name, schema, column_names, engine, comment,
    insert_records_from_json(
        table,
        engine,
-        json_filepath
+        json_filepath,
+        column_names
    )
    return table

29 changes: 29 additions & 0 deletions mathesar/tests/data/json_parsing/missing_keys.json
@@ -0,0 +1,29 @@
[
    {
        "first_name":"Matt",
        "last_name":"Murdock",
        "gender":"Male",
        "friends": ["Stick", "Foggy"],
        "address": {
            "street": "210",
            "city": "NY"
        }
    },
    {
        "first_name":"John",
        "last_name":"Doe",
        "email":"jd@example.org",
        "gender":"Male",
        "friends": ["Mark", "Bill"]
    },
    {
        "first_name":"Frank",
        "last_name":"Castle",
        "email":"fc@example.org",
        "address": {
            "street": "211",
            "city": "NY"
        }
    }
]
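
Imported through the union-of-keys and normalization logic above, this fixture should produce a six-column table along these lines (an illustrative sketch, not output from an actual test; NULL marks keys absent from a given object):

first_name | last_name | gender | friends            | address                         | email
Matt       | Murdock   | Male   | ["Stick", "Foggy"] | {"street": "210", "city": "NY"} | NULL
John       | Doe       | Male   | ["Mark", "Bill"]   | NULL                            | jd@example.org
Frank      | Castle    | NULL   | NULL               | {"street": "211", "city": "NY"} | fc@example.org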

3 changes: 3 additions & 0 deletions mathesar/utils/datafiles.py
@@ -8,6 +8,7 @@

from mathesar.errors import URLDownloadError
from mathesar.imports.csv import get_sv_dialect, get_file_encoding
from mathesar.imports.json import validate_json_format
from mathesar.models.base import DataFile


@@ -56,6 +57,8 @@ def create_datafile(data):

    encoding = get_file_encoding(raw_file.file)
    text_file = TextIOWrapper(raw_file.file, encoding=encoding)
    if type == 'json':
        validate_json_format(raw_file)
    if type == 'csv' or type == 'tsv':
        dialect = get_sv_dialect(text_file)
        datafile = DataFile(
3 changes: 2 additions & 1 deletion requirements.txt
@@ -26,4 +26,5 @@ thefuzz==0.19.0
whitenoise==6.4.0
git+https://github.com/centerofci/sqlalchemy-filters@models_to_tables#egg=sqlalchemy_filters
gunicorn==20.1.0
-drf-spectacular==0.26.2
+drf-spectacular==0.26.2
+pandas==2.0.2