Skip to content

Commit

Permalink
Merge pull request #2725 from centerofci/longnamecsv_bug
Browse files Browse the repository at this point in the history
Reduce max column name size (during CSV import) further
  • Loading branch information
dmos62 committed Apr 14, 2023
2 parents 509b6ab + b22397e commit 8d53d6f
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 78 deletions.
11 changes: 9 additions & 2 deletions db/identifiers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import hashlib


POSTGRES_IDENTIFIER_SIZE_LIMIT = 63


def truncate_if_necessary(identifier):
"""
Takes an identifier and returns it, truncating it, if it is too long. The truncated version
Expand Down Expand Up @@ -30,9 +33,13 @@ def truncate_if_necessary(identifier):


def is_identifier_too_long(identifier):
postgres_identifier_size_limit = 63
# TODO we should support POSTGRES_IDENTIFIER_SIZE_LIMIT here;
# Our current limit due to an unknown bug that manifests at least
# when importing CSVs seems to be 57 bytes. Here we're setting it even
# lower just in case.
our_temporary_identifier_size_limit = 48
size = _get_size_of_identifier_in_bytes(identifier)
return size > postgres_identifier_size_limit
return size > our_temporary_identifier_size_limit


def _get_truncation_hash(identifier):
Expand Down
129 changes: 129 additions & 0 deletions mathesar/tests/api/test_table_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sqlalchemy import text

from db.columns.operations.select import get_column_attnum_from_name, get_column_attnum_from_names_as_map
from db.identifiers import truncate_if_necessary
from db.types.base import PostgresType, MathesarCustomType
from db.metadata import get_empty_metadata
from mathesar.models.users import DatabaseRole, SchemaRole
Expand All @@ -15,6 +16,19 @@
from mathesar.models.base import Column, Table, DataFile


# DUPLICATE: We need a better testing organization schema. Now is not the time to fix.
@pytest.fixture
def long_column_data_file():
data_filepath = 'mathesar/tests/data/long_column_names.csv'
with open(data_filepath, "rb") as csv_file:
data_file = DataFile.objects.create(
file=File(csv_file),
created_from='file',
base_name='longdatafiled',
)
return data_file


@pytest.fixture
def schema_name():
return 'table_tests'
Expand Down Expand Up @@ -158,6 +172,7 @@ def check_create_table_response(
assert data_file.table_imported_to.id == table.id
assert table.import_target == import_target_table
check_table_response(response_table, table, expt_name)
return table


list_clients_with_results_count = [
Expand Down Expand Up @@ -1768,3 +1783,117 @@ def test_table_ui_dependency(client, create_patents_table, get_uid):
]
}
assert response_data == expected_response


@pytest.mark.parametrize(
'before_truncation, after_truncation',
[
[
"bbbbbbbbbbbbbb",
"bbbbbbbbbbbbbb",
],
[
"cccccccccccccccccccccc",
"cccccccccccccccccccccc",
],
[
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
],
[
"ffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
"fffffffffffffffffffffffffffffffffffffff-7e43d30e"
],
[
"eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee",
"eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-d0ccef3c",
],
[
"ggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg",
"ggggggggggggggggggggggggggggggggggggggg-2910cecf",
],
]
)
def test_truncate_if_necessary(before_truncation, after_truncation):
assert truncate_if_necessary(before_truncation) == after_truncation


def test_create_table_long_name_data_file(client, long_column_data_file, schema):
table_name = 'My Long column name datafile'
# response, response_table, table = _create_table(
# )
expt_name = _get_expected_name(table_name, data_file=long_column_data_file)
first_row = (
1, 'NATION', '8.6', '4.5', '8.5', '4.3', '8.3', '4.6', '78.6', '2.22',
'0.88', '0.66', '1.53', '3.75', '3.26', '0.45', '0.07', '53.9', '52.3',
'0.8', '0.38487', '3.15796', '2.3', '33247', '14.842144', '6.172333',
'47.158545', '1.698662', '2.345577', '7.882694', '0.145406', '3.395302',
'92.085375', '14.447634', '78.873848', '1.738571', '16.161024',
'19.436701', '8.145643', '94.937079', '74.115131', '75.601680',
'22.073834', '11.791045', '1.585233',
'1.016932', '2023-02-01'
)
column_names = [
"State or Nation",
"Cycle 1 Total Number of Health Deficiencies",
"Cycle 1 Total Number of Fire Safety Deficiencies",
"Cycle 2 Total Number of Health Deficiencies",
"Cycle 2 Total Number of Fire Safety Deficiencies",
"Cycle 3 Total Number of Health Deficiencies",
"Cycle 3 Total Number of Fire Safety Deficiencies",
"Average Number of Residents per Day",
"Reported Nurse Aide Staffing Hours per Resident per Day",
"Reported LPN Staffing Hours per Resident per Day",
"Reported RN Staffing Hours per Resident per Day",
"Reported Licensed Staffing Hours per Resident per Day",
"Reported Total Nurse Staffing Hours per Resident per Day",
"Total number of nurse staff hours per resident per day on the weekend",
"Registered Nurse hours per resident per day on the weekend",
"Reported Physical Therapist Staffing Hours per Resident Per Day",
"Total nursing staff turnover",
"Registered Nurse turnover",
"Number of administrators who have left the nursing home",
"Case-Mix RN Staffing Hours per Resident per Day",
"Case-Mix Total Nurse Staffing Hours per Resident per Day",
"Number of Fines",
"Fine Amount in Dollars",
"Percentage of long stay residents whose need for help with daily activities has increased",
"Percentage of long stay residents who lose too much weight",
"Percentage of low risk long stay residents who lose control of their bowels or bladder",
"Percentage of long stay residents with a catheter inserted and left in their bladder",
"Percentage of long stay residents with a urinary tract infection",
"Percentage of long stay residents who have depressive symptoms",
"Percentage of long stay residents who were physically restrained",
"Percentage of long stay residents experiencing one or more falls with major injury",
"Percentage of long stay residents assessed and appropriately given the pneumococcal vaccine",
"Percentage of long stay residents who received an antipsychotic medication",
"Percentage of short stay residents assessed and appropriately given the pneumococcal vaccine",
"Percentage of short stay residents who newly received an antipsychotic medication",
"Percentage of long stay residents whose ability to move independently worsened",
"Percentage of long stay residents who received an antianxiety or hypnotic medication",
"Percentage of high risk long stay residents with pressure ulcers",
"Percentage of long stay residents assessed and appropriately given the seasonal influenza vaccine",
"Percentage of short stay residents who made improvements in function",
"Percentage of short stay residents who were assessed and appropriately given the seasonal influenza vaccine",
"Percentage of short stay residents who were rehospitalized after a nursing home admission",
"Percentage of short stay residents who had an outpatient emergency department visit",
"Number of hospitalizations per 1000 long-stay resident days",
"Number of outpatient emergency department visits per 1000 long-stay resident days",
"Processing Date"
]
# Make sure at least some column names require truncation;
# 63 is the hard Postgres limit; we're also experiencing problems with ids
# as short as 58 characters, but I'll leave this at 63 so that it doesn't
# have to be updated once that's fixed.
assert any(
len(column_name) >= 63
for column_name
in column_names
)
processed_column_names = [truncate_if_necessary(col) for col in column_names]
table = check_create_table_response(
client, table_name, expt_name, long_column_data_file, schema, first_row,
processed_column_names, import_target_table=None
)
# This just makes sure we can get records. This was a bug with long column names.
client.get(f'/api/db/v0/tables/{table.id}/records/')
76 changes: 0 additions & 76 deletions mathesar/tests/imports/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,6 @@ def data_file(patents_csv_filepath):
return data_file


@pytest.fixture
def long_column_data_file():
data_filepath = 'mathesar/tests/data/long_column_names.csv'
with open(data_filepath, "rb") as csv_file:
data_file = DataFile.objects.create(file=File(csv_file))
return data_file


@pytest.fixture
def headerless_data_file(headerless_patents_csv_filepath):
with open(headerless_patents_csv_filepath, "rb") as csv_file:
Expand Down Expand Up @@ -98,74 +90,6 @@ def test_csv_upload(data_file, schema):
)


def test_csv_upload_long_columns(long_column_data_file, schema):
table_name = "long_cols"
table = create_table_from_csv(long_column_data_file, table_name, schema)

num_records = 54
expected_row = (
1, 'NATION', '8.6', '4.5', '8.5', '4.3', '8.3', '4.6', '78.6', '2.22',
'0.88', '0.66', '1.53', '3.75', '3.26', '0.45', '0.07', '53.9', '52.3',
'0.8', '0.38487', '3.15796', '2.3', '33247', '14.842144', '6.172333',
'47.158545', '1.698662', '2.345577', '7.882694', '0.145406', '3.395302',
'92.085375', '14.447634', '78.873848', '1.738571', '16.161024',
'19.436701', '8.145643', '94.937079', '74.115131', '75.601680',
'22.073834', '11.791045', '1.585233', '1.016932', '2023-02-01',
)
expected_cols = [
'id',
'State or Nation',
'Cycle 1 Total Number of Health Deficiencies',
'Cycle 1 Total Number of Fire Safety Deficiencies',
'Cycle 2 Total Number of Health Deficiencies',
'Cycle 2 Total Number of Fire Safety Deficiencies',
'Cycle 3 Total Number of Health Deficiencies',
'Cycle 3 Total Number of Fire Safety Deficiencies',
'Average Number of Residents per Day',
'Reported Nurse Aide Staffing Hours per Resident per Day',
'Reported LPN Staffing Hours per Resident per Day',
'Reported RN Staffing Hours per Resident per Day',
'Reported Licensed Staffing Hours per Resident per Day',
'Reported Total Nurse Staffing Hours per Resident per Day',
'Total number of nurse staff hours per resident per day-8cd5ab5e',
'Registered Nurse hours per resident per day on the weekend',
'Reported Physical Therapist Staffing Hours per Resident Per Day',
'Total nursing staff turnover',
'Registered Nurse turnover',
'Number of administrators who have left the nursing home',
'Case-Mix RN Staffing Hours per Resident per Day',
'Case-Mix Total Nurse Staffing Hours per Resident per Day',
'Number of Fines',
'Fine Amount in Dollars',
'Percentage of long stay residents whose need for help-5c97c88f',
'Percentage of long stay residents who lose too much weight',
'Percentage of low risk long stay residents who lose co-fc6bc241',
'Percentage of long stay residents with a catheter inse-ce71f22a',
'Percentage of long stay residents with a urinary tract-f16fbec8',
'Percentage of long stay residents who have depressive symptoms',
'Percentage of long stay residents who were physically-f30de0aa',
'Percentage of long stay residents experiencing one or-9f9e8f36',
'Percentage of long stay residents assessed and appropr-84744861',
'Percentage of long stay residents who received an anti-20fe5d12',
'Percentage of short stay residents assessed and approp-3568770f',
'Percentage of short stay residents who newly received-e98612b4',
'Percentage of long stay residents whose ability to mov-66839cb4',
'Percentage of long stay residents who received an anti-868593e4',
'Percentage of high risk long stay residents with press-b624bbba',
'Percentage of long stay residents assessed and appropr-999c26ef',
'Percentage of short stay residents who made improvemen-ebe5c21e',
'Percentage of short stay residents who were assessed a-26e64965',
'Percentage of short stay residents who were rehospital-682a4dae',
'Percentage of short stay residents who had an outpatie-9403ec21',
'Number of hospitalizations per 1000 long-stay resident days',
'Number of outpatient emergency department visits per 1-f0fed7b5',
'Processing Date'
]
check_csv_upload(
table, table_name, schema, num_records, expected_row, expected_cols
)


def test_headerless_csv_upload(headerless_data_file, schema):
table_name = "NASA no headers"
table = create_table_from_csv(headerless_data_file, table_name, schema)
Expand Down

0 comments on commit 8d53d6f

Please sign in to comment.