mathesar-foundation · dmos62 · Aug 3, 2023 · Jul 14, 2023 · Jul 18, 2023 · Jul 19, 2023
diff --git a/db/records/operations/insert.py b/db/records/operations/insert.py
@@ -89,6 +89,12 @@ def insert_records_from_json(table, engine, json_filepath, column_names, max_lev
     insert_record_or_records(table, engine, data)
 
 
+def insert_records_from_excel(table, engine, excel_filepath):
+    """Insert the rows of the Excel file at `excel_filepath` into `table`, serialising dates as ISO-8601 strings."""
+    records = json.loads(pandas.read_excel(excel_filepath).to_json(orient='records', date_format='iso'))
+    insert_record_or_records(table, engine, records)
+
+
 def insert_records_from_csv(table, engine, csv_filepath, column_names, header, delimiter=None, escape=None, quote=None, encoding=None):
     with open(csv_filepath, "r", encoding=encoding) as csv_file:
         with engine.begin() as conn:

diff --git a/mathesar/api/exceptions/database_exceptions/exceptions.py b/mathesar/api/exceptions/database_exceptions/exceptions.py
@@ -495,6 +495,22 @@ def __init__(
         super().__init__(exception, self.error_code, message, field, details, status_code)
 
 
+class UnsupportedFileFormat(MathesarAPIException):
+    """API error (HTTP 400 by default) reported when a data file's format is not supported."""
+    error_code = ErrorCodes.UnsupportedFileFormat.value
+
+    def __init__(
+            self,
+            exception=None,
+            message='This file format is not supported.',
+            field=None,
+            details=None,
+            status_code=status.HTTP_400_BAD_REQUEST
+    ):
+        wrapped = Exception(message) if exception is None else exception  # the base class always gets an exception
+        super().__init__(wrapped, self.error_code, message, field, details, status_code)
+
+
 class DynamicDefaultModificationError(Exception):
     def __init__(self, column=None):
         self.column = column
diff --git a/mathesar/api/exceptions/error_codes.py b/mathesar/api/exceptions/error_codes.py
@@ -65,3 +65,4 @@ class ErrorCodes(Enum):
     DynamicDefaultAlterationToStaticDefault = 4424
     InvalidJSONFormat = 4425
     UnsupportedJSONFormat = 4426
+    UnsupportedFileFormat = 4427
diff --git a/mathesar/imports/base.py b/mathesar/imports/base.py
@@ -1,6 +1,7 @@
 from mathesar.database.base import create_mathesar_engine
 from mathesar.models.base import Table
 from mathesar.imports.csv import create_db_table_from_csv_data_file
+from mathesar.imports.excel import create_db_table_from_excel_data_file
 from mathesar.imports.json import create_db_table_from_json_data_file
 from db.tables.operations.select import get_oid_from_table
 from mathesar.errors import InvalidTableError
@@ -19,6 +20,10 @@ def create_table_from_data_file(data_file, name, schema, comment=None):
         db_table = create_db_table_from_json_data_file(
             data_file, name, schema, comment=comment
         )
+    elif data_file.type == 'excel':
+        db_table = create_db_table_from_excel_data_file(
+            data_file, name, schema, comment=comment
+        )
     else:
         raise InvalidTableError
     db_table_oid = get_oid_from_table(db_table.name, db_table.schema, engine)

diff --git a/mathesar/imports/csv.py b/mathesar/imports/csv.py
@@ -21,6 +21,14 @@
 CHECK_ROWS = 10
 
 
+def is_valid_csv(data):
+    try:
+        csv.reader(data)  # shallow check: building the reader parses no rows, it only has to accept `data`
+    except (csv.Error, TypeError, ValueError):  # csv defines `Error`, not `CsvError`; stdlib raises TypeError for non-iterables
+        return False
+    return True
+
+
 def get_file_encoding(file):
     """
     Given a file, uses charset_normalizer if installed or chardet which is installed as part of clevercsv module to

diff --git a/mathesar/imports/excel.py b/mathesar/imports/excel.py
@@ -0,0 +1,56 @@
+import pandas
+
+from db.tables.operations.alter import update_pk_sequence_to_latest
+from mathesar.database.base import create_mathesar_engine
+from db.records.operations.insert import insert_records_from_excel
+from db.tables.operations.create import create_string_column_table
+from db.tables.operations.drop import drop_table
+from mathesar.imports.utils import process_column_names
+from db.constants import ID, ID_ORIGINAL
+from psycopg2.errors import IntegrityError, DataError
+
+from mathesar.state import reset_reflection
+
+
+def get_column_names_from_excel(data_file):
+    """Return, as a list, the column labels pandas reads from the Excel file `data_file`."""
+    return list(pandas.read_excel(data_file).columns)
+
+
+def insert_data_from_excel_data_file(name, schema, column_names, engine, comment, excel_filepath):
+    """
+    Create a table for an Excel import and load the file's rows into it.
+
+    The table is built by `create_string_column_table` in `schema` under `name`, with
+    one column per entry of `column_names`; the rows of the file at `excel_filepath`
+    are then inserted. Returns the table that was created.
+    """
+    new_table = create_string_column_table(
+        name=name, schema=schema.name, column_names=column_names, engine=engine, comment=comment,
+    )
+
+    # NOTE(review): rows are re-read from the file here; confirm their keys line up with `column_names`.
+    insert_records_from_excel(new_table, engine, excel_filepath)
+    return new_table
+
+
+def create_db_table_from_excel_data_file(data_file, name, schema, comment=None):
+    """
+    Import the Excel file behind `data_file` into a new table in `schema` and return it.
+
+    If the first attempt fails with an IntegrityError or DataError, the table is dropped
+    and rebuilt with any column named `ID` renamed to `ID_ORIGINAL`.
+    """
+    db_name = schema.database.name
+    engine = create_mathesar_engine(db_name)
+    excel_filepath = data_file.file.path
+    column_names = process_column_names(get_column_names_from_excel(excel_filepath))
+    try:
+        table = insert_data_from_excel_data_file(name, schema, column_names, engine, comment, excel_filepath)
+        update_pk_sequence_to_latest(engine, table)
+    except (IntegrityError, DataError):
+        drop_table(name=name, schema=schema.name, engine=engine)
+        renamed = [ID_ORIGINAL if col == ID else col for col in column_names]
+        table = insert_data_from_excel_data_file(name, schema, renamed, engine, comment, excel_filepath)
+    reset_reflection(db_name=db_name)
+    return table
diff --git a/mathesar/tests/data/patents.xlsx b/mathesar/tests/data/patents.xlsx
diff --git a/mathesar/utils/datafiles.py b/mathesar/utils/datafiles.py
@@ -1,17 +1,24 @@
 import os
+import pandas
 from time import time
 from io import TextIOWrapper
 
 import requests
 from django.core.files.base import ContentFile
 from django.core.files.uploadedfile import TemporaryUploadedFile
 
+from mathesar.api.exceptions.database_exceptions import (
+    exceptions as database_api_exceptions
+)
 from mathesar.errors import URLDownloadError
-from mathesar.imports.csv import get_sv_dialect, get_file_encoding
+from mathesar.imports.csv import is_valid_csv, get_sv_dialect, get_file_encoding
 from mathesar.imports.json import is_valid_json, validate_json_format
 from mathesar.models.base import DataFile
 
 
+ALLOWED_FILE_FORMATS = ['csv', 'tsv', 'json', 'xls', 'xlsx', 'xlsm', 'xlsb', 'odf', 'ods', 'odt']  # extensions trusted without probing the content; all but csv/tsv/json map to 'excel'
+
+
 def _download_datafile(url):
     name = 'file_from_url'
     if '/' in url:
@@ -29,6 +36,40 @@ def _download_datafile(url):
     return temp_file
 
 
+def _get_file_type(raw_file):
+    """
+    Work out which importer type ('csv', 'tsv', 'json' or 'excel') applies to `raw_file`.
+
+    Algorithm:
+    1.  Take the file extension from `raw_file.name`, lower-cased so that names
+        such as 'DATA.XLSX' are recognised.
+    2.  If the extension is in ALLOWED_FILE_FORMATS, return it unchanged for 'csv',
+        'tsv' and 'json', and return 'excel' for every other allowed extension.
+    3.  Otherwise (no extension or an unknown one, e.g. a file downloaded from a
+        URL) probe the content: CSV first, then JSON, then an Excel-like file
+        via pandas.
+    4.  If every probe fails, raise UnsupportedFileFormat.
+    """
+    file_extension = os.path.splitext(raw_file.name)[1][1:].lower()
+    if file_extension in ALLOWED_FILE_FORMATS:
+        return file_extension if file_extension in ('csv', 'tsv', 'json') else 'excel'
+
+    # The extension is not an allowed one at this point, so CSV-looking content is always plain 'csv'.
+    if is_valid_csv(raw_file):
+        return 'csv'
+    if is_valid_json(raw_file):
+        return 'json'
+
+    # NOTE(review): pandas.read_excel reads from `raw_file`; confirm later consumers do not
+    # depend on the stream position being left untouched.
+    try:
+        pandas.read_excel(raw_file)
+    except (ValueError, pandas.errors.ParserError) as err:
+        # pandas raises ValueError when it cannot determine the workbook format.
+        raise database_api_exceptions.UnsupportedFileFormat() from err
+    return 'excel'
+
+
 def create_datafile(data):
     header = data.get('header', True)
 
@@ -43,12 +84,12 @@ def create_datafile(data):
         raw_file = _download_datafile(data['url'])
         created_from = 'url'
         base_name = raw_file.name
-        type = os.path.splitext(raw_file.name)[1][1:]
+        type = _get_file_type(raw_file)
     elif 'file' in data:
         raw_file = data['file']
         created_from = 'file'
         base_name = raw_file.name
-        type = os.path.splitext(raw_file.name)[1][1:]
+        type = _get_file_type(raw_file)
 
     if base_name:
         max_length = DataFile._meta.get_field('base_name').max_length

diff --git a/requirements.txt b/requirements.txt
@@ -26,4 +26,5 @@ whitenoise==6.4.0
 git+https://github.com/centerofci/sqlalchemy-filters@models_to_tables#egg=sqlalchemy_filters
 gunicorn==20.1.0
 drf-spectacular==0.26.2
-pandas==2.0.2
+pandas==2.0.2
+openpyxl==3.1.2