Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added functionality to import perfect Excel #3059

Merged
merged 10 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions db/records/operations/insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ def insert_records_from_json(table, engine, json_filepath, column_names, max_lev
insert_record_or_records(table, engine, data)


def insert_records_from_excel(table, engine, excel_filepath):
    """
    Read an Excel-like file and insert its rows into ``table``.

    The sheet is loaded with ``pandas.read_excel`` and round-tripped through
    JSON so that each row becomes a plain ``dict`` keyed by column name; the
    resulting list is handed to ``insert_record_or_records``.

    Args:
        table: Target table, passed through to ``insert_record_or_records``.
        engine: Database engine, passed through to ``insert_record_or_records``.
        excel_filepath: Location of the workbook, given to ``pandas.read_excel``.
    """
    df = pandas.read_excel(excel_filepath)
    # ``to_json`` defaults to ``date_format='epoch'``, which would turn date
    # cells into integer millisecond timestamps. ISO 8601 strings keep the
    # imported values human-readable.
    records = json.loads(df.to_json(orient='records', date_format='iso'))
    insert_record_or_records(table, engine, records)


def insert_records_from_csv(table, engine, csv_filepath, column_names, header, delimiter=None, escape=None, quote=None, encoding=None):
with open(csv_filepath, "r", encoding=encoding) as csv_file:
with engine.begin() as conn:
Expand Down
16 changes: 16 additions & 0 deletions mathesar/api/exceptions/database_exceptions/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,22 @@ def __init__(
super().__init__(exception, self.error_code, message, field, details, status_code)


class UnsupportedFileFormat(MathesarAPIException):
    """API exception reporting that a file's format is not supported."""

    error_code = ErrorCodes.UnsupportedFileFormat.value

    def __init__(
            self,
            exception=None,
            message='This file format is not supported.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        """
        Build the API error, defaulting to an HTTP 400 response.

        When no underlying ``exception`` is supplied, a generic ``Exception``
        carrying ``message`` is used in its place.
        """
        wrapped = Exception(message) if exception is None else exception
        super().__init__(wrapped, self.error_code, message, field, details, status_code)


class DynamicDefaultModificationError(Exception):
def __init__(self, column=None):
self.column = column
1 change: 1 addition & 0 deletions mathesar/api/exceptions/error_codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,4 @@ class ErrorCodes(Enum):
DynamicDefaultAlterationToStaticDefault = 4424
InvalidJSONFormat = 4425
UnsupportedJSONFormat = 4426
UnsupportedFileFormat = 4427
5 changes: 5 additions & 0 deletions mathesar/imports/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from mathesar.database.base import create_mathesar_engine
from mathesar.models.base import Table
from mathesar.imports.csv import create_db_table_from_csv_data_file
from mathesar.imports.excel import create_db_table_from_excel_data_file
from mathesar.imports.json import create_db_table_from_json_data_file
from db.tables.operations.select import get_oid_from_table
from mathesar.errors import InvalidTableError
Expand All @@ -19,6 +20,10 @@ def create_table_from_data_file(data_file, name, schema, comment=None):
db_table = create_db_table_from_json_data_file(
data_file, name, schema, comment=comment
)
elif data_file.type == 'excel':
db_table = create_db_table_from_excel_data_file(
data_file, name, schema, comment=comment
)
else:
raise InvalidTableError
db_table_oid = get_oid_from_table(db_table.name, db_table.schema, engine)
Expand Down
8 changes: 8 additions & 0 deletions mathesar/imports/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@
CHECK_ROWS = 10


def is_valid_csv(data):
    """
    Report whether a CSV reader can be constructed on top of ``data``.

    Returns ``False`` when building the reader raises ``csv.CsvError`` or
    ``ValueError``, and ``True`` otherwise.
    """
    # NOTE(review): only the reader object is created here; no rows are
    # iterated, so malformed content is presumably not detected - confirm
    # whether that is intended.
    valid = True
    try:
        csv.reader(data)
    except (csv.CsvError, ValueError):
        valid = False
    return valid


def get_file_encoding(file):
"""
Given a file, uses charset_normalizer if installed or chardet which is installed as part of clevercsv module to
Expand Down
56 changes: 56 additions & 0 deletions mathesar/imports/excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pandas

from db.tables.operations.alter import update_pk_sequence_to_latest
from mathesar.database.base import create_mathesar_engine
from db.records.operations.insert import insert_records_from_excel
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.imports.utils import process_column_names
from db.constants import ID, ID_ORIGINAL
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection


def get_column_names_from_excel(data_file):
    """
    Return the column labels of an Excel-like file as a list.

    Args:
        data_file: Location of the workbook, given to ``pandas.read_excel``.

    Returns:
        The labels of the resulting DataFrame's columns, in sheet order.
    """
    # Only the header is needed, so ask pandas not to parse any data rows
    # instead of loading the whole sheet.
    df = pandas.read_excel(data_file, nrows=0)
    return list(df.columns)


def insert_data_from_excel_data_file(name, schema, column_names, engine, comment, excel_filepath):
    """
    Create a table via ``create_string_column_table`` and load an Excel file into it.

    Args:
        name: Name for the new table.
        schema: Schema object; its ``name`` attribute is used for creation.
        column_names: Column names for the new table.
        engine: Database engine used for both creation and insertion.
        comment: Comment passed to table creation.
        excel_filepath: Location of the workbook whose rows are inserted.

    Returns:
        The table returned by ``create_string_column_table``.
    """
    creation_kwargs = {
        'name': name,
        'schema': schema.name,
        'column_names': column_names,
        'engine': engine,
        'comment': comment,
    }
    new_table = create_string_column_table(**creation_kwargs)
    insert_records_from_excel(new_table, engine, excel_filepath)
    return new_table


def create_db_table_from_excel_data_file(data_file, name, schema, comment=None):
    """
    Create a database table from an Excel data file and return it.

    Column names are read from the workbook and passed through
    ``process_column_names``. A first attempt creates and fills the table,
    then updates its primary-key sequence. If that attempt raises
    ``IntegrityError`` or ``DataError``, the table is dropped and the import
    is retried with every column named ``ID`` renamed to ``ID_ORIGINAL``.
    Reflection state for the database is reset before returning.

    Args:
        data_file: Object whose ``file.path`` points at the workbook.
        name: Name for the new table.
        schema: Schema object; ``schema.database.name`` selects the engine.
        comment: Optional comment for the new table.

    Returns:
        The table produced by ``insert_data_from_excel_data_file``.
    """
    db_name = schema.database.name
    engine = create_mathesar_engine(db_name)
    excel_filepath = data_file.file.path
    raw_column_names = get_column_names_from_excel(excel_filepath)
    column_names = process_column_names(raw_column_names)

    try:
        table = insert_data_from_excel_data_file(
            name, schema, column_names, engine, comment, excel_filepath
        )
        update_pk_sequence_to_latest(engine, table)
    except (IntegrityError, DataError):
        # Start over with a fresh table in which the file's own ID column
        # is stored under a different name.
        drop_table(name=name, schema=schema.name, engine=engine)
        renamed_columns = []
        for column in column_names:
            renamed_columns.append(ID_ORIGINAL if column == ID else column)
        table = insert_data_from_excel_data_file(
            name, schema, renamed_columns, engine, comment, excel_filepath
        )

    reset_reflection(db_name=db_name)
    return table
Binary file added mathesar/tests/data/patents.xlsx
Binary file not shown.
47 changes: 44 additions & 3 deletions mathesar/utils/datafiles.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
import os
import pandas
from time import time
from io import TextIOWrapper

import requests
from django.core.files.base import ContentFile
from django.core.files.uploadedfile import TemporaryUploadedFile

from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
from mathesar.errors import URLDownloadError
from mathesar.imports.csv import get_sv_dialect, get_file_encoding
from mathesar.imports.csv import is_valid_csv, get_sv_dialect, get_file_encoding
from mathesar.imports.json import is_valid_json, validate_json_format
from mathesar.models.base import DataFile


# File extensions (without the leading dot) that are recognised directly
# when determining a data file's type.
ALLOWED_FILE_FORMATS = [
    'csv',
    'tsv',
    'json',
    'xls',
    'xlsx',
    'xlsm',
    'xlsb',
    'odf',
    'ods',
    'odt',
]
dmos62 marked this conversation as resolved.
Show resolved Hide resolved


def _download_datafile(url):
name = 'file_from_url'
if '/' in url:
Expand All @@ -29,6 +36,40 @@ def _download_datafile(url):
return temp_file


def _get_file_type(raw_file):
    """
    Determine the import type of a data file.

    Algorithm:
    1. Get the file extension using the 'os' library and lower-case it, so
       that names such as 'DATA.XLSX' are recognised as well.
    2. If the file extension is in ALLOWED_FILE_FORMATS then return the file
       type as 'csv', 'tsv', 'json' or 'excel'.
    3. If the file does not have an extension or does not have an allowed one,
       we check for the file type using a brute force approach. A similar case
       can also arise when we download a file from a URL and it does not have
       a file type. We first try to read the file using the 'csv' library. If
       it fails, we check if it is valid JSON (using the 'json' library) or a
       valid Excel-like file (using the 'pandas' library).
    4. If it fails all the above operations, we raise the
       UnsupportedFileFormat exception.

    Args:
        raw_file: File object exposing a ``name`` attribute; it is also handed
            to the CSV, JSON and Excel probes.

    Returns:
        One of 'csv', 'tsv', 'json' or 'excel'.

    Raises:
        UnsupportedFileFormat: If none of the probes accepts the file.
    """
    file_extension = os.path.splitext(raw_file.name)[1][1:].lower()
    if file_extension in ALLOWED_FILE_FORMATS:
        if file_extension in ('csv', 'tsv', 'json'):
            return file_extension
        return 'excel'

    if is_valid_csv(raw_file):
        return file_extension if file_extension in ('csv', 'tsv') else 'csv'
    if is_valid_json(raw_file):
        return 'json'

    try:
        pandas.read_excel(raw_file)
    except ValueError as e:
        # pandas reports an undeterminable Excel format with a plain
        # ValueError; pandas.errors.ParserError is a ValueError subclass, so
        # the previously handled case is still covered.
        raise database_api_exceptions.UnsupportedFileFormat() from e
    return 'excel'


def create_datafile(data):
header = data.get('header', True)

Expand All @@ -43,12 +84,12 @@ def create_datafile(data):
raw_file = _download_datafile(data['url'])
created_from = 'url'
base_name = raw_file.name
type = os.path.splitext(raw_file.name)[1][1:]
type = _get_file_type(raw_file)
elif 'file' in data:
raw_file = data['file']
created_from = 'file'
base_name = raw_file.name
type = os.path.splitext(raw_file.name)[1][1:]
type = _get_file_type(raw_file)

if base_name:
max_length = DataFile._meta.get_field('base_name').max_length
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ whitenoise==6.4.0
git+https://github.com/centerofci/sqlalchemy-filters@models_to_tables#egg=sqlalchemy_filters
gunicorn==20.1.0
drf-spectacular==0.26.2
pandas==2.0.2
pandas==2.0.2
openpyxl==3.1.2