Merge pull request #3059 from IamEzio/perfect-excel
Added functionality to import perfect Excel
dmos62 committed Aug 3, 2023
2 parents 8c8dc44 + bcc5325 commit 6bdb38a
Showing 11 changed files with 162 additions and 30 deletions.
30 changes: 22 additions & 8 deletions db/records/operations/insert.py
@@ -33,6 +33,18 @@ def insert_record_or_records(table, engine, record_data):
return None


def get_records_from_dataframe(df):
    """
    Convert the dataframe to JSON using the to_json() method, then back to a Python
    object. This replaces 'NaN' values in the dataframe with 'None' values in the
    Python object. We avoid df.to_dict() because it keeps 'NaN' as a float value
    rather than converting it to 'None'.
    We pass 'records' as the orientation parameter because we want each record to
    contain the data of a single row, not of a single column (which is the default
    behaviour).
    """
    return json.loads(df.to_json(orient='records'))
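For illustration, a minimal sketch of the NaN handling described above (assuming the module's pandas and json imports):

    df = pandas.DataFrame({'a': [1, float('nan')]})
    df.to_dict(orient='records')               # [{'a': 1.0}, {'a': nan}], NaN stays a float
    json.loads(df.to_json(orient='records'))   # [{'a': 1.0}, {'a': None}]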


def insert_records_from_json(table, engine, json_filepath, column_names, max_level):
"""
Normalizes JSON data and inserts it into a table.
@@ -49,10 +61,7 @@ def insert_records_from_json(table, engine, json_filepath, column_names, max_level):
2. We normalize data into a pandas dataframe using the pandas.json_normalize() method.
   The method takes column names as meta. We provide all possible keys as column
   names, hence it adds missing keys to JSON objects and marks their values as NaN.
-3. We convert the dataframe to JSON using to_json() method and then to a Python object.
-   This method replaces 'NaN' values in the dataframe with 'None' values in Python
-   object. The reason behind not using df.to_dict() method is beacuse it stringifies
-   'NaN' values rather than converting them to a 'None' value.
+3. We get records from the dataframe using the method get_records_from_dataframe().
4. The processed data is now a list of dict objects. Each dict has the same keys, which
   are the column names of the table. We loop through each dict object, and if any value
   is a dict or a list, we stringify it before inserting it into the table. This way,
@@ -77,16 +86,21 @@ def insert_records_from_json(table, engine, json_filepath, column_names, max_level):
our table and not just the keys from the first JSON object.
"""
df = pandas.json_normalize(data, max_level=max_level, meta=column_names)
data = json.loads(df.to_json(orient='records'))
records = get_records_from_dataframe(df)

-    for i, row in enumerate(data):
-        data[i] = {
+    for i, row in enumerate(records):
+        records[i] = {
k: json.dumps(v)
if (isinstance(v, dict) or isinstance(v, list))
else v
for k, v in row.items()
}
-    insert_record_or_records(table, engine, data)
+    insert_record_or_records(table, engine, records)
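As a sketch of the normalization step above (hypothetical data, assuming the module's pandas import):

    data = [{'id': 1, 'name': {'first': 'Ada'}}, {'id': 2}]
    df = pandas.json_normalize(data, max_level=1)
    # df.columns: ['id', 'name.first']; the second record gets NaN for 'name.first',
    # which get_records_from_dataframe() then turns into None.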


def insert_records_from_excel(table, engine, dataframe):
    records = get_records_from_dataframe(dataframe)
    insert_record_or_records(table, engine, records)


def insert_records_from_csv(table, engine, csv_filepath, column_names, header, delimiter=None, escape=None, quote=None, encoding=None):
16 changes: 16 additions & 0 deletions mathesar/api/exceptions/database_exceptions/exceptions.py
@@ -498,6 +498,22 @@ def __init__(
super().__init__(exception, self.error_code, message, field, details, status_code)


class UnsupportedFileFormat(MathesarAPIException):
    error_code = ErrorCodes.UnsupportedFileFormat.value

    def __init__(
            self,
            exception=None,
            message='This file format is not supported.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        if exception is None:
            exception = Exception(message)
        super().__init__(exception, self.error_code, message, field, details, status_code)


class DynamicDefaultModificationError(Exception):
def __init__(self, column=None):
self.column = column
1 change: 1 addition & 0 deletions mathesar/api/exceptions/error_codes.py
@@ -66,3 +66,4 @@ class ErrorCodes(Enum):
DynamicDefaultAlterationToStaticDefault = 4424
InvalidJSONFormat = 4425
UnsupportedJSONFormat = 4426
UnsupportedFileFormat = 4427
5 changes: 5 additions & 0 deletions mathesar/imports/base.py
@@ -1,6 +1,7 @@
from mathesar.database.base import create_mathesar_engine
from mathesar.models.base import Table
from mathesar.imports.csv import create_db_table_from_csv_data_file
from mathesar.imports.excel import create_db_table_from_excel_data_file
from mathesar.imports.json import create_db_table_from_json_data_file
from db.tables.operations.select import get_oid_from_table
from mathesar.errors import InvalidTableError
@@ -19,6 +20,10 @@ def create_table_from_data_file(data_file, name, schema, comment=None):
        db_table = create_db_table_from_json_data_file(
            data_file, name, schema, comment=comment
        )
    elif data_file.type == 'excel':
        db_table = create_db_table_from_excel_data_file(
            data_file, name, schema, comment=comment
        )
    else:
        raise InvalidTableError
db_table_oid = get_oid_from_table(db_table.name, db_table.schema, engine)
23 changes: 14 additions & 9 deletions mathesar/imports/csv.py
@@ -8,8 +8,8 @@
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.errors import InvalidTableError
-from mathesar.imports.utils import process_column_names
-from db.constants import ID, ID_ORIGINAL, COLUMN_NAME_TEMPLATE
+from mathesar.imports.utils import get_alternate_column_names, process_column_names
+from db.constants import COLUMN_NAME_TEMPLATE
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection
@@ -21,6 +21,14 @@
CHECK_ROWS = 10


def is_valid_csv(data):
    try:
        # csv.reader is lazy: rows must actually be consumed for parsing
        # errors to surface.
        for _ in csv.reader(data):
            pass
    except (csv.CsvError, ValueError):
        return False
    return True


def get_file_encoding(file):
"""
Given a file, uses charset_normalizer if installed or chardet which is installed as part of clevercsv module to
@@ -109,7 +117,7 @@ def get_sv_reader(file, header, dialect=None):
return reader


-def insert_data_from_csv_data_file(name, schema, column_names, engine, comment, data_file):
+def insert_records_from_csv_data_file(name, schema, column_names, engine, comment, data_file):
dialect = csv.dialect.SimpleDialect(data_file.delimiter, data_file.quotechar,
data_file.escapechar)
encoding = get_file_encoding(data_file.file)
@@ -145,14 +153,11 @@ def create_db_table_from_csv_data_file(data_file, name, schema, comment=None):
sv_reader = get_sv_reader(sv_file, header, dialect=dialect)
column_names = process_column_names(sv_reader.fieldnames)
try:
-        table = insert_data_from_csv_data_file(name, schema, column_names, engine, comment, data_file)
+        table = insert_records_from_csv_data_file(name, schema, column_names, engine, comment, data_file)
update_pk_sequence_to_latest(engine, table)
except (IntegrityError, DataError):
drop_table(name=name, schema=schema.name, engine=engine)
-        column_names_alt = [
-            column_name if column_name != ID else ID_ORIGINAL
-            for column_name in column_names
-        ]
-        insert_data_from_csv_data_file(name, schema, column_names_alt, engine, comment, data_file)
+        column_names_alt = get_alternate_column_names(column_names)
+        insert_records_from_csv_data_file(name, schema, column_names_alt, engine, comment, data_file)
reset_reflection(db_name=db_name)
return table
45 changes: 45 additions & 0 deletions mathesar/imports/excel.py
@@ -0,0 +1,45 @@
import pandas

from db.tables.operations.alter import update_pk_sequence_to_latest
from mathesar.database.base import create_mathesar_engine
from db.records.operations.insert import insert_records_from_excel
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.imports.utils import get_alternate_column_names, process_column_names
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection


def insert_records_from_dataframe(name, schema, column_names, engine, comment, dataframe):
    table = create_string_column_table(
        name=name,
        schema=schema.name,
        column_names=column_names,
        engine=engine,
        comment=comment,
    )

    insert_records_from_excel(
        table,
        engine,
        dataframe,
    )
    return table


def create_db_table_from_excel_data_file(data_file, name, schema, comment=None):
    db_name = schema.database.name
    engine = create_mathesar_engine(db_name)
    dataframe = pandas.read_excel(data_file.file.path)
    column_names = process_column_names(dataframe.columns)
    try:
        table = insert_records_from_dataframe(name, schema, column_names, engine, comment, dataframe)
        update_pk_sequence_to_latest(engine, table)
    except (IntegrityError, DataError):
        drop_table(name=name, schema=schema.name, engine=engine)
        column_names_alt = get_alternate_column_names(column_names)
        table = insert_records_from_dataframe(name, schema, column_names_alt, engine, comment, dataframe)

    reset_reflection(db_name=db_name)
    return table
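A note on the pandas.read_excel() call above: pandas chooses a reader engine from the file extension, which is why the requirements changes below add openpyxl (for .xlsx) and pyxlsb (for .xlsb). A usage sketch with a hypothetical path:

    dataframe = pandas.read_excel('patents.xlsx', engine='openpyxl')
    # engine can be omitted; pandas infers it from the extension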
14 changes: 5 additions & 9 deletions mathesar/imports/json.py
@@ -9,8 +9,7 @@
from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
-from mathesar.imports.utils import process_column_names
-from db.constants import ID, ID_ORIGINAL
+from mathesar.imports.utils import get_alternate_column_names, process_column_names
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection
Expand Down Expand Up @@ -64,7 +63,7 @@ def get_column_names_from_json(data_file, max_level):
return get_flattened_keys(data, max_level)


-def insert_data_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level):
+def insert_records_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level):
table = create_string_column_table(
name=name,
schema=schema.name,
@@ -91,15 +90,12 @@ def create_db_table_from_json_data_file(data_file, name, schema, comment=None):
get_column_names_from_json(json_filepath, max_level)
)
try:
-        table = insert_data_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level)
+        table = insert_records_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level)
update_pk_sequence_to_latest(engine, table)
except (IntegrityError, DataError):
drop_table(name=name, schema=schema.name, engine=engine)
-        column_names_alt = [
-            fieldname if fieldname != ID else ID_ORIGINAL
-            for fieldname in column_names
-        ]
-        table = insert_data_from_json_data_file(name, schema, column_names_alt, engine, comment, json_filepath, max_level)
+        column_names_alt = get_alternate_column_names(column_names)
+        table = insert_records_from_json_data_file(name, schema, column_names_alt, engine, comment, json_filepath, max_level)

reset_reflection(db_name=db_name)
return table
9 changes: 8 additions & 1 deletion mathesar/imports/utils.py
@@ -1,5 +1,5 @@
from db.identifiers import truncate_if_necessary
from db.constants import COLUMN_NAME_TEMPLATE
from db.constants import COLUMN_NAME_TEMPLATE, ID, ID_ORIGINAL


def process_column_names(column_names):
@@ -19,3 +19,10 @@ def process_column_names(column_names):
in enumerate(column_names)
)
return list(column_names)


def get_alternate_column_names(column_names):
    return [
        fieldname if fieldname != ID else ID_ORIGINAL
        for fieldname in column_names
    ]
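A quick sketch of what this helper produces, assuming ID = 'id' and ID_ORIGINAL = 'id_original' in db.constants:

    get_alternate_column_names(['id', 'name', 'year'])
    # ['id_original', 'name', 'year'], avoiding a collision with the 'id'
    # primary key column that Mathesar adds to imported tables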
Binary file added mathesar/tests/data/patents.xlsx
47 changes: 44 additions & 3 deletions mathesar/utils/datafiles.py
@@ -1,17 +1,24 @@
import os
import pandas
from time import time
from io import TextIOWrapper

import requests
from django.core.files.base import ContentFile
from django.core.files.uploadedfile import TemporaryUploadedFile

from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
from mathesar.errors import URLDownloadError
-from mathesar.imports.csv import get_sv_dialect, get_file_encoding
+from mathesar.imports.csv import is_valid_csv, get_sv_dialect, get_file_encoding
from mathesar.imports.json import is_valid_json, validate_json_format
from mathesar.models.base import DataFile


ALLOWED_FILE_FORMATS = ['csv', 'tsv', 'json', 'xls', 'xlsx', 'xlsm', 'xlsb', 'odf', 'ods', 'odt']


def _download_datafile(url):
name = 'file_from_url'
if '/' in url:
@@ -29,6 +36,40 @@ def _download_datafile(url):
return temp_file


def _get_file_type(raw_file):
    """
    Algorithm:
    1. Get the file extension using the 'os' library.
    2. If the file extension is in ALLOWED_FILE_FORMATS, return the file type as
       'csv', 'tsv', 'json' or 'excel'.
    3. If the file does not have an extension, or does not have an allowed one,
       check the file type using a brute-force approach. A similar case can arise
       when we download a file from a URL and it does not have a file extension.
       We first try to read the file using the 'csv' library. If that fails, we
       check whether it is valid JSON (using the 'json' library) or a valid
       Excel-like file (using the 'pandas' library).
    4. If all of the above checks fail, we raise an UnsupportedFileFormat exception.
    """

    file_extension = os.path.splitext(raw_file.name)[1][1:]
    if file_extension in ALLOWED_FILE_FORMATS:
        if file_extension in ['csv', 'tsv', 'json']:
            return file_extension
        else:
            return 'excel'

    if is_valid_csv(raw_file):
        return 'csv'
    elif is_valid_json(raw_file):
        return 'json'
    else:
        try:
            pandas.read_excel(raw_file)
            return 'excel'
        except pandas.errors.ParserError:
            raise database_api_exceptions.UnsupportedFileFormat()
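For example (hypothetical file objects), a name like 'data.csv' is classified by its extension alone, while an extensionless download falls through to the content checks:

    _get_file_type(open('data.csv'))       # 'csv', from the extension
    _get_file_type(open('file_from_url'))  # no extension: try csv, then json, then excel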


def create_datafile(data):
header = data.get('header', True)

@@ -43,12 +84,12 @@ def create_datafile(data):
raw_file = _download_datafile(data['url'])
created_from = 'url'
base_name = raw_file.name
-        type = os.path.splitext(raw_file.name)[1][1:]
+        type = _get_file_type(raw_file)
elif 'file' in data:
raw_file = data['file']
created_from = 'file'
base_name = raw_file.name
-        type = os.path.splitext(raw_file.name)[1][1:]
+        type = _get_file_type(raw_file)

if base_name:
max_length = DataFile._meta.get_field('base_name').max_length
2 changes: 2 additions & 0 deletions requirements.txt
@@ -26,3 +26,5 @@ git+https://github.com/centerofci/sqlalchemy-filters@models_to_tables#egg=sqlalc
gunicorn==20.1.0
drf-spectacular==0.26.2
pandas==2.0.2
openpyxl==3.1.2
pyxlsb==1.0.10
