Merge pull request #2725 from centerofci/longnamecsv_bug

Reduce max column name size (during CSV import) further
mathesar-foundation · Apr 14, 2023 · 8d53d6f · 8d53d6f
2 parents 509b6ab + b22397e
commit 8d53d6f
Show file tree

Hide file tree

Showing 3 changed files with 138 additions and 78 deletions.
diff --git a/db/identifiers.py b/db/identifiers.py
@@ -1,6 +1,9 @@
 import hashlib
 
 
+POSTGRES_IDENTIFIER_SIZE_LIMIT = 63
+
+
 def truncate_if_necessary(identifier):
     """
     Takes an identifier and returns it, truncating it, if it is too long. The truncated version
@@ -30,9 +33,13 @@ def truncate_if_necessary(identifier):
 
 
 def is_identifier_too_long(identifier):
-    postgres_identifier_size_limit = 63
+    # TODO we should support POSTGRES_IDENTIFIER_SIZE_LIMIT here;
+    # Our current limit due to an unknown bug that manifests at least
+    # when importing CSVs seems to be 57 bytes. Here we're setting it even
+    # lower just in case.
+    our_temporary_identifier_size_limit = 48
     size = _get_size_of_identifier_in_bytes(identifier)
-    return size > postgres_identifier_size_limit
+    return size > our_temporary_identifier_size_limit
 
 
 def _get_truncation_hash(identifier):

diff --git a/mathesar/tests/api/test_table_api.py b/mathesar/tests/api/test_table_api.py
@@ -5,6 +5,7 @@
 from sqlalchemy import text
 
 from db.columns.operations.select import get_column_attnum_from_name, get_column_attnum_from_names_as_map
+from db.identifiers import truncate_if_necessary
 from db.types.base import PostgresType, MathesarCustomType
 from db.metadata import get_empty_metadata
 from mathesar.models.users import DatabaseRole, SchemaRole
@@ -15,6 +16,19 @@
 from mathesar.models.base import Column, Table, DataFile
 
 
+# DUPLICATE: We need a better testing organization schema. Now is not the time to fix.
+@pytest.fixture
+def long_column_data_file():
+    data_filepath = 'mathesar/tests/data/long_column_names.csv'
+    with open(data_filepath, "rb") as csv_file:
+        data_file = DataFile.objects.create(
+            file=File(csv_file),
+            created_from='file',
+            base_name='longdatafiled',
+        )
+    return data_file
+
+
 @pytest.fixture
 def schema_name():
     return 'table_tests'
@@ -158,6 +172,7 @@ def check_create_table_response(
     assert data_file.table_imported_to.id == table.id
     assert table.import_target == import_target_table
     check_table_response(response_table, table, expt_name)
+    return table
 
 
 list_clients_with_results_count = [
@@ -1768,3 +1783,117 @@ def test_table_ui_dependency(client, create_patents_table, get_uid):
         ]
     }
     assert response_data == expected_response
+
+
+@pytest.mark.parametrize(
+    'before_truncation, after_truncation',
+    [
+        [
+            "bbbbbbbbbbbbbb",
+            "bbbbbbbbbbbbbb",
+        ],
+        [
+            "cccccccccccccccccccccc",
+            "cccccccccccccccccccccc",
+        ],
+        [
+            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+        ],
+        [
+            "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+            "fffffffffffffffffffffffffffffffffffffff-7e43d30e"
+        ],
+        [
+            "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee",
+            "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-d0ccef3c",
+        ],
+        [
+            "ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg",
+            "ggggggggggggggggggggggggggggggggggggggg-2910cecf",
+        ],
+    ]
+)
+def test_truncate_if_necessary(before_truncation, after_truncation):
+    assert truncate_if_necessary(before_truncation) == after_truncation
+
+
+def test_create_table_long_name_data_file(client, long_column_data_file, schema):
+    table_name = 'My Long column name datafile'
+    # response, response_table, table = _create_table(
+    # )
+    expt_name = _get_expected_name(table_name, data_file=long_column_data_file)
+    first_row = (
+        1, 'NATION', '8.6', '4.5', '8.5', '4.3', '8.3', '4.6', '78.6', '2.22',
+        '0.88', '0.66', '1.53', '3.75', '3.26', '0.45', '0.07', '53.9', '52.3',
+        '0.8', '0.38487', '3.15796', '2.3', '33247', '14.842144', '6.172333',
+        '47.158545', '1.698662', '2.345577', '7.882694', '0.145406', '3.395302',
+        '92.085375', '14.447634', '78.873848', '1.738571', '16.161024',
+        '19.436701', '8.145643', '94.937079', '74.115131', '75.601680',
+        '22.073834', '11.791045', '1.585233',
+        '1.016932', '2023-02-01'
+    )
+    column_names = [
+        "State or Nation",
+        "Cycle 1 Total Number of Health Deficiencies",
+        "Cycle 1 Total Number of Fire Safety Deficiencies",
+        "Cycle 2 Total Number of Health Deficiencies",
+        "Cycle 2 Total Number of Fire Safety Deficiencies",
+        "Cycle 3 Total Number of Health Deficiencies",
+        "Cycle 3 Total Number of Fire Safety Deficiencies",
+        "Average Number of Residents per Day",
+        "Reported Nurse Aide Staffing Hours per Resident per Day",
+        "Reported LPN Staffing Hours per Resident per Day",
+        "Reported RN Staffing Hours per Resident per Day",
+        "Reported Licensed Staffing Hours per Resident per Day",
+        "Reported Total Nurse Staffing Hours per Resident per Day",
+        "Total number of nurse staff hours per resident per day on the weekend",
+        "Registered Nurse hours per resident per day on the weekend",
+        "Reported Physical Therapist Staffing Hours per Resident Per Day",
+        "Total nursing staff turnover",
+        "Registered Nurse turnover",
+        "Number of administrators who have left the nursing home",
+        "Case-Mix RN Staffing Hours per Resident per Day",
+        "Case-Mix Total Nurse Staffing Hours per Resident per Day",
+        "Number of Fines",
+        "Fine Amount in Dollars",
+        "Percentage of long stay residents whose need for help with daily activities has increased",
+        "Percentage of long stay residents who lose too much weight",
+        "Percentage of low risk long stay residents who lose control of their bowels or bladder",
+        "Percentage of long stay residents with a catheter inserted and left in their bladder",
+        "Percentage of long stay residents with a urinary tract infection",
+        "Percentage of long stay residents who have depressive symptoms",
+        "Percentage of long stay residents who were physically restrained",
+        "Percentage of long stay residents experiencing one or more falls with major injury",
+        "Percentage of long stay residents assessed and appropriately given the pneumococcal vaccine",
+        "Percentage of long stay residents who received an antipsychotic medication",
+        "Percentage of short stay residents assessed and appropriately given the pneumococcal vaccine",
+        "Percentage of short stay residents who newly received an antipsychotic medication",
+        "Percentage of long stay residents whose ability to move independently worsened",
+        "Percentage of long stay residents who received an antianxiety or hypnotic medication",
+        "Percentage of high risk long stay residents with pressure ulcers",
+        "Percentage of long stay residents assessed and appropriately given the seasonal influenza vaccine",
+        "Percentage of short stay residents who made improvements in function",
+        "Percentage of short stay residents who were assessed and appropriately given the seasonal influenza vaccine",
+        "Percentage of short stay residents who were rehospitalized after a nursing home admission",
+        "Percentage of short stay residents who had an outpatient emergency department visit",
+        "Number of hospitalizations per 1000 long-stay resident days",
+        "Number of outpatient emergency department visits per 1000 long-stay resident days",
+        "Processing Date"
+    ]
+    # Make sure at least some column names require truncation;
+    # 63 is the hard Postgres limit; we're also experiencing problems with ids
+    # as short as 58 characters, but I'll leave this at 63 so that it doesn't
+    # have to be updated once that's fixed.
+    assert any(
+        len(column_name) >= 63
+        for column_name
+        in column_names
+    )
+    processed_column_names = [truncate_if_necessary(col) for col in column_names]
+    table = check_create_table_response(
+        client, table_name, expt_name, long_column_data_file, schema, first_row,
+        processed_column_names, import_target_table=None
+    )
+    # This just makes sure we can get records. This was a bug with long column names.
+    client.get(f'/api/db/v0/tables/{table.id}/records/')
diff --git a/mathesar/tests/imports/test_csv.py b/mathesar/tests/imports/test_csv.py
@@ -21,14 +21,6 @@ def data_file(patents_csv_filepath):
     return data_file
 
 
-@pytest.fixture
-def long_column_data_file():
-    data_filepath = 'mathesar/tests/data/long_column_names.csv'
-    with open(data_filepath, "rb") as csv_file:
-        data_file = DataFile.objects.create(file=File(csv_file))
-    return data_file
-
-
 @pytest.fixture
 def headerless_data_file(headerless_patents_csv_filepath):
     with open(headerless_patents_csv_filepath, "rb") as csv_file:
@@ -98,74 +90,6 @@ def test_csv_upload(data_file, schema):
     )
 
 
-def test_csv_upload_long_columns(long_column_data_file, schema):
-    table_name = "long_cols"
-    table = create_table_from_csv(long_column_data_file, table_name, schema)
-
-    num_records = 54
-    expected_row = (
-        1, 'NATION', '8.6', '4.5', '8.5', '4.3', '8.3', '4.6', '78.6', '2.22',
-        '0.88', '0.66', '1.53', '3.75', '3.26', '0.45', '0.07', '53.9', '52.3',
-        '0.8', '0.38487', '3.15796', '2.3', '33247', '14.842144', '6.172333',
-        '47.158545', '1.698662', '2.345577', '7.882694', '0.145406', '3.395302',
-        '92.085375', '14.447634', '78.873848', '1.738571', '16.161024',
-        '19.436701', '8.145643', '94.937079', '74.115131', '75.601680',
-        '22.073834', '11.791045', '1.585233', '1.016932', '2023-02-01',
-    )
-    expected_cols = [
-        'id',
-        'State or Nation',
-        'Cycle 1 Total Number of Health Deficiencies',
-        'Cycle 1 Total Number of Fire Safety Deficiencies',
-        'Cycle 2 Total Number of Health Deficiencies',
-        'Cycle 2 Total Number of Fire Safety Deficiencies',
-        'Cycle 3 Total Number of Health Deficiencies',
-        'Cycle 3 Total Number of Fire Safety Deficiencies',
-        'Average Number of Residents per Day',
-        'Reported Nurse Aide Staffing Hours per Resident per Day',
-        'Reported LPN Staffing Hours per Resident per Day',
-        'Reported RN Staffing Hours per Resident per Day',
-        'Reported Licensed Staffing Hours per Resident per Day',
-        'Reported Total Nurse Staffing Hours per Resident per Day',
-        'Total number of nurse staff hours per resident per day-8cd5ab5e',
-        'Registered Nurse hours per resident per day on the weekend',
-        'Reported Physical Therapist Staffing Hours per Resident Per Day',
-        'Total nursing staff turnover',
-        'Registered Nurse turnover',
-        'Number of administrators who have left the nursing home',
-        'Case-Mix RN Staffing Hours per Resident per Day',
-        'Case-Mix Total Nurse Staffing Hours per Resident per Day',
-        'Number of Fines',
-        'Fine Amount in Dollars',
-        'Percentage of long stay residents whose need for help-5c97c88f',
-        'Percentage of long stay residents who lose too much weight',
-        'Percentage of low risk long stay residents who lose co-fc6bc241',
-        'Percentage of long stay residents with a catheter inse-ce71f22a',
-        'Percentage of long stay residents with a urinary tract-f16fbec8',
-        'Percentage of long stay residents who have depressive symptoms',
-        'Percentage of long stay residents who were physically-f30de0aa',
-        'Percentage of long stay residents experiencing one or-9f9e8f36',
-        'Percentage of long stay residents assessed and appropr-84744861',
-        'Percentage of long stay residents who received an anti-20fe5d12',
-        'Percentage of short stay residents assessed and approp-3568770f',
-        'Percentage of short stay residents who newly received-e98612b4',
-        'Percentage of long stay residents whose ability to mov-66839cb4',
-        'Percentage of long stay residents who received an anti-868593e4',
-        'Percentage of high risk long stay residents with press-b624bbba',
-        'Percentage of long stay residents assessed and appropr-999c26ef',
-        'Percentage of short stay residents who made improvemen-ebe5c21e',
-        'Percentage of short stay residents who were assessed a-26e64965',
-        'Percentage of short stay residents who were rehospital-682a4dae',
-        'Percentage of short stay residents who had an outpatie-9403ec21',
-        'Number of hospitalizations per 1000 long-stay resident days',
-        'Number of outpatient emergency department visits per 1-f0fed7b5',
-        'Processing Date'
-    ]
-    check_csv_upload(
-        table, table_name, schema, num_records, expected_row, expected_cols
-    )
-
-
 def test_headerless_csv_upload(headerless_data_file, schema):
     table_name = "NASA no headers"
     table = create_table_from_csv(headerless_data_file, table_name, schema)