Merge pull request #3059 from IamEzio/perfect-excel
Added functionality to import perfect Excel
dmos62 committed Aug 3, 2023
2 parents 8c8dc44 + bcc5325 commit 6bdb38a
Showing 11 changed files with 162 additions and 30 deletions.
30 changes: 22 additions & 8 deletions db/records/operations/insert.py
@@ -33,6 +33,18 @@ def insert_record_or_records(table, engine, record_data):
return None


def get_records_from_dataframe(df):
    """
    Convert the dataframe to JSON using the to_json() method, then back to a Python
    object. This replaces 'NaN' values in the dataframe with 'None' values in the
    Python object. We avoid df.to_dict() because it keeps 'NaN' as a float value
    rather than converting it to 'None'.
    We pass 'records' as the orientation parameter because we want each record to
    contain the data of a single row, not of a single column (which is the default
    behaviour).
    """
    return json.loads(df.to_json(orient='records'))
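For illustration, a minimal sketch of the NaN handling described above (assuming the module's pandas and json imports):

    df = pandas.DataFrame({'a': [1, float('nan')]})
    df.to_dict(orient='records')               # [{'a': 1.0}, {'a': nan}], NaN stays a float
    json.loads(df.to_json(orient='records'))   # [{'a': 1.0}, {'a': None}]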


def insert_records_from_json(table, engine, json_filepath, column_names, max_level):
"""
Normalizes JSON data and inserts it into a table.
@@ -49,10 +61,7 @@ def insert_records_from_json(table, engine, json_filepath, column_names, max_level):
2. We normalize data into a pandas dataframe using the pandas.json_normalize() method.
   The method takes column names as meta. We provide all possible keys as column
   names, hence it adds missing keys to JSON objects and marks their values as NaN.
-3. We convert the dataframe to JSON using to_json() method and then to a Python object.
-   This method replaces 'NaN' values in the dataframe with 'None' values in Python
-   object. The reason behind not using df.to_dict() method is beacuse it stringifies
-   'NaN' values rather than converting them to a 'None' value.
+3. We get records from the dataframe using the method get_records_from_dataframe().
4. The processed data is now a list of dict objects. Each dict has the same keys, which
   are the column names of the table. We loop through each dict object, and if any value
   is a dict or a list, we stringify it before inserting it into the table. This way,
@@ -77,16 +86,21 @@ def insert_records_from_json(table, engine, json_filepath, column_names, max_level):
our table and not just the keys from the first JSON object.
"""
df = pandas.json_normalize(data, max_level=max_level, meta=column_names)
data = json.loads(df.to_json(orient='records'))
records = get_records_from_dataframe(df)

-    for i, row in enumerate(data):
-        data[i] = {
+    for i, row in enumerate(records):
+        records[i] = {
k: json.dumps(v)
if (isinstance(v, dict) or isinstance(v, list))
else v
for k, v in row.items()
}
-    insert_record_or_records(table, engine, data)
+    insert_record_or_records(table, engine, records)
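As a sketch of the normalization step above (hypothetical data, assuming the module's pandas import):

    data = [{'id': 1, 'name': {'first': 'Ada'}}, {'id': 2}]
    df = pandas.json_normalize(data, max_level=1)
    # df.columns: ['id', 'name.first']; the second record gets NaN for 'name.first',
    # which get_records_from_dataframe() then turns into None.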


def insert_records_from_excel(table, engine, dataframe):
    records = get_records_from_dataframe(dataframe)
    insert_record_or_records(table, engine, records)


def insert_records_from_csv(table, engine, csv_filepath, column_names, header, delimiter=None, escape=None, quote=None, encoding=None):
16 changes: 16 additions & 0 deletions mathesar/api/exceptions/database_exceptions/exceptions.py
@@ -498,6 +498,22 @@ def __init__(
super().__init__(exception, self.error_code, message, field, details, status_code)


class UnsupportedFileFormat(MathesarAPIException):
    error_code = ErrorCodes.UnsupportedFileFormat.value

    def __init__(
            self,
            exception=None,
            message='This file format is not supported.',
            field=None,
            details=None,
            status_code=status.HTTP_400_BAD_REQUEST
    ):
        if exception is None:
            exception = Exception(message)
        super().__init__(exception, self.error_code, message, field, details, status_code)


class DynamicDefaultModificationError(Exception):
def __init__(self, column=None):
self.column = column
1 change: 1 addition & 0 deletions mathesar/api/exceptions/error_codes.py
@@ -66,3 +66,4 @@ class ErrorCodes(Enum):
DynamicDefaultAlterationToStaticDefault = 4424
InvalidJSONFormat = 4425
UnsupportedJSONFormat = 4426
UnsupportedFileFormat = 4427
5 changes: 5 additions & 0 deletions mathesar/imports/base.py
@@ -1,6 +1,7 @@
from mathesar.database.base import create_mathesar_engine
from mathesar.models.base import Table
from mathesar.imports.csv import create_db_table_from_csv_data_file
from mathesar.imports.excel import create_db_table_from_excel_data_file
from mathesar.imports.json import create_db_table_from_json_data_file
from db.tables.operations.select import get_oid_from_table
from mathesar.errors import InvalidTableError
@@ -19,6 +20,10 @@ def create_table_from_data_file(data_file, name, schema, comment=None):
        db_table = create_db_table_from_json_data_file(
            data_file, name, schema, comment=comment
        )
    elif data_file.type == 'excel':
        db_table = create_db_table_from_excel_data_file(
            data_file, name, schema, comment=comment
        )
    else:
        raise InvalidTableError
db_table_oid = get_oid_from_table(db_table.name, db_table.schema, engine)
23 changes: 14 additions & 9 deletions mathesar/imports/csv.py
@@ -8,8 +8,8 @@
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.errors import InvalidTableError
-from mathesar.imports.utils import process_column_names
-from db.constants import ID, ID_ORIGINAL, COLUMN_NAME_TEMPLATE
+from mathesar.imports.utils import get_alternate_column_names, process_column_names
+from db.constants import COLUMN_NAME_TEMPLATE
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection
@@ -21,6 +21,14 @@
CHECK_ROWS = 10


def is_valid_csv(data):
    try:
        # csv.reader is lazy: rows must actually be consumed for parsing
        # errors to surface.
        for _ in csv.reader(data):
            pass
    except (csv.CsvError, ValueError):
        return False
    return True


def get_file_encoding(file):
"""
Given a file, uses charset_normalizer if installed or chardet which is installed as part of clevercsv module to
@@ -109,7 +117,7 @@ def get_sv_reader(file, header, dialect=None):
return reader


-def insert_data_from_csv_data_file(name, schema, column_names, engine, comment, data_file):
+def insert_records_from_csv_data_file(name, schema, column_names, engine, comment, data_file):
dialect = csv.dialect.SimpleDialect(data_file.delimiter, data_file.quotechar,
data_file.escapechar)
encoding = get_file_encoding(data_file.file)
@@ -145,14 +153,11 @@ def create_db_table_from_csv_data_file(data_file, name, schema, comment=None):
sv_reader = get_sv_reader(sv_file, header, dialect=dialect)
column_names = process_column_names(sv_reader.fieldnames)
try:
-        table = insert_data_from_csv_data_file(name, schema, column_names, engine, comment, data_file)
+        table = insert_records_from_csv_data_file(name, schema, column_names, engine, comment, data_file)
update_pk_sequence_to_latest(engine, table)
except (IntegrityError, DataError):
drop_table(name=name, schema=schema.name, engine=engine)
-        column_names_alt = [
-            column_name if column_name != ID else ID_ORIGINAL
-            for column_name in column_names
-        ]
-        insert_data_from_csv_data_file(name, schema, column_names_alt, engine, comment, data_file)
+        column_names_alt = get_alternate_column_names(column_names)
+        insert_records_from_csv_data_file(name, schema, column_names_alt, engine, comment, data_file)
reset_reflection(db_name=db_name)
return table
45 changes: 45 additions & 0 deletions mathesar/imports/excel.py
@@ -0,0 +1,45 @@
import pandas

from db.tables.operations.alter import update_pk_sequence_to_latest
from mathesar.database.base import create_mathesar_engine
from db.records.operations.insert import insert_records_from_excel
from db.tables.operations.create import create_string_column_table
from db.tables.operations.drop import drop_table
from mathesar.imports.utils import get_alternate_column_names, process_column_names
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection


def insert_records_from_dataframe(name, schema, column_names, engine, comment, dataframe):
    table = create_string_column_table(
        name=name,
        schema=schema.name,
        column_names=column_names,
        engine=engine,
        comment=comment,
    )

    insert_records_from_excel(
        table,
        engine,
        dataframe,
    )
    return table


def create_db_table_from_excel_data_file(data_file, name, schema, comment=None):
    db_name = schema.database.name
    engine = create_mathesar_engine(db_name)
    dataframe = pandas.read_excel(data_file.file.path)
    column_names = process_column_names(dataframe.columns)
    try:
        table = insert_records_from_dataframe(name, schema, column_names, engine, comment, dataframe)
        update_pk_sequence_to_latest(engine, table)
    except (IntegrityError, DataError):
        drop_table(name=name, schema=schema.name, engine=engine)
        column_names_alt = get_alternate_column_names(column_names)
        table = insert_records_from_dataframe(name, schema, column_names_alt, engine, comment, dataframe)

    reset_reflection(db_name=db_name)
    return table
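A note on the pandas.read_excel() call above: pandas chooses a reader engine from the file extension, which is why the requirements changes below add openpyxl (for .xlsx) and pyxlsb (for .xlsb). A usage sketch with a hypothetical path:

    dataframe = pandas.read_excel('patents.xlsx', engine='openpyxl')
    # engine can be omitted; pandas infers it from the extension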
14 changes: 5 additions & 9 deletions mathesar/imports/json.py
@@ -9,8 +9,7 @@
from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
-from mathesar.imports.utils import process_column_names
-from db.constants import ID, ID_ORIGINAL
+from mathesar.imports.utils import get_alternate_column_names, process_column_names
from psycopg2.errors import IntegrityError, DataError

from mathesar.state import reset_reflection
Expand Down Expand Up @@ -64,7 +63,7 @@ def get_column_names_from_json(data_file, max_level):
return get_flattened_keys(data, max_level)


-def insert_data_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level):
+def insert_records_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level):
table = create_string_column_table(
name=name,
schema=schema.name,
@@ -91,15 +90,12 @@ def create_db_table_from_json_data_file(data_file, name, schema, comment=None):
get_column_names_from_json(json_filepath, max_level)
)
try:
-        table = insert_data_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level)
+        table = insert_records_from_json_data_file(name, schema, column_names, engine, comment, json_filepath, max_level)
update_pk_sequence_to_latest(engine, table)
except (IntegrityError, DataError):
drop_table(name=name, schema=schema.name, engine=engine)
-        column_names_alt = [
-            fieldname if fieldname != ID else ID_ORIGINAL
-            for fieldname in column_names
-        ]
-        table = insert_data_from_json_data_file(name, schema, column_names_alt, engine, comment, json_filepath, max_level)
+        column_names_alt = get_alternate_column_names(column_names)
+        table = insert_records_from_json_data_file(name, schema, column_names_alt, engine, comment, json_filepath, max_level)

reset_reflection(db_name=db_name)
return table
9 changes: 8 additions & 1 deletion mathesar/imports/utils.py
@@ -1,5 +1,5 @@
from db.identifiers import truncate_if_necessary
from db.constants import COLUMN_NAME_TEMPLATE
from db.constants import COLUMN_NAME_TEMPLATE, ID, ID_ORIGINAL


def process_column_names(column_names):
@@ -19,3 +19,10 @@ def process_column_names(column_names):
in enumerate(column_names)
)
return list(column_names)


def get_alternate_column_names(column_names):
    return [
        fieldname if fieldname != ID else ID_ORIGINAL
        for fieldname in column_names
    ]
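A quick sketch of what this helper produces, assuming ID = 'id' and ID_ORIGINAL = 'id_original' in db.constants:

    get_alternate_column_names(['id', 'name', 'year'])
    # ['id_original', 'name', 'year'], avoiding a collision with the 'id'
    # primary key column that Mathesar adds to imported tables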
Binary file added mathesar/tests/data/patents.xlsx
47 changes: 44 additions & 3 deletions mathesar/utils/datafiles.py
@@ -1,17 +1,24 @@
import os
import pandas
from time import time
from io import TextIOWrapper

import requests
from django.core.files.base import ContentFile
from django.core.files.uploadedfile import TemporaryUploadedFile

from mathesar.api.exceptions.database_exceptions import (
exceptions as database_api_exceptions
)
from mathesar.errors import URLDownloadError
-from mathesar.imports.csv import get_sv_dialect, get_file_encoding
+from mathesar.imports.csv import is_valid_csv, get_sv_dialect, get_file_encoding
from mathesar.imports.json import is_valid_json, validate_json_format
from mathesar.models.base import DataFile


ALLOWED_FILE_FORMATS = ['csv', 'tsv', 'json', 'xls', 'xlsx', 'xlsm', 'xlsb', 'odf', 'ods', 'odt']


def _download_datafile(url):
name = 'file_from_url'
if '/' in url:
@@ -29,6 +36,40 @@ def _download_datafile(url):
return temp_file


def _get_file_type(raw_file):
    """
    Algorithm:
    1. Get the file extension using the 'os' library.
    2. If the file extension is in ALLOWED_FILE_FORMATS, return the file type as
       'csv', 'tsv', 'json' or 'excel'.
    3. If the file does not have an extension, or does not have an allowed one,
       check the file type using a brute-force approach. A similar case can arise
       when we download a file from a URL and it does not have a file extension.
       We first try to read the file using the 'csv' library. If that fails, we
       check whether it is valid JSON (using the 'json' library) or a valid
       Excel-like file (using the 'pandas' library).
    4. If all of the above checks fail, we raise an UnsupportedFileFormat exception.
    """

    file_extension = os.path.splitext(raw_file.name)[1][1:]
    if file_extension in ALLOWED_FILE_FORMATS:
        if file_extension in ['csv', 'tsv', 'json']:
            return file_extension
        else:
            return 'excel'

    if is_valid_csv(raw_file):
        return 'csv'
    elif is_valid_json(raw_file):
        return 'json'
    else:
        try:
            pandas.read_excel(raw_file)
            return 'excel'
        except pandas.errors.ParserError:
            raise database_api_exceptions.UnsupportedFileFormat()
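For example (hypothetical file objects), a name like 'data.csv' is classified by its extension alone, while an extensionless download falls through to the content checks:

    _get_file_type(open('data.csv'))       # 'csv', from the extension
    _get_file_type(open('file_from_url'))  # no extension: try csv, then json, then excel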


def create_datafile(data):
header = data.get('header', True)

@@ -43,12 +84,12 @@ def create_datafile(data):
raw_file = _download_datafile(data['url'])
created_from = 'url'
base_name = raw_file.name
-        type = os.path.splitext(raw_file.name)[1][1:]
+        type = _get_file_type(raw_file)
elif 'file' in data:
raw_file = data['file']
created_from = 'file'
base_name = raw_file.name
-        type = os.path.splitext(raw_file.name)[1][1:]
+        type = _get_file_type(raw_file)

if base_name:
max_length = DataFile._meta.get_field('base_name').max_length
2 changes: 2 additions & 0 deletions requirements.txt
@@ -26,3 +26,5 @@ git+https://github.com/centerofci/sqlalchemy-filters@models_to_tables#egg=sqlalc
gunicorn==20.1.0
drf-spectacular==0.26.2
pandas==2.0.2
openpyxl==3.1.2
pyxlsb==1.0.10
