In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv,json
from IPython.display import display
import re

In [2]:
table = pa.csv.read_csv('FLOWS.csv',
                read_options=pa.csv.ReadOptions(use_threads=True, block_size=None,
                                                skip_rows=None, column_names=None,
                                                autogenerate_column_names=None),
                parse_options=pa.csv.ParseOptions(delimiter=',', quote_char='"',
                                                  double_quote=None, escape_char=None,
                                                  newlines_in_values=None, ignore_empty_lines=None),
                convert_options=pa.csv.ConvertOptions(check_utf8=None, column_types=None,
                                                      null_values=None, true_values=None,
                                                      false_values=None, strings_can_be_null=None,
                                                      include_columns=None, include_missing_columns=None))
display(table)
display(table.shape)

pyarrow.Table
Flow Type: string
Member: string
Description: string
Transaction Value Date: string
Amount: int64
Original Disbursement Date: string
Original Arrangement Date: string

(25543, 7)

In [3]:
a = pd.read_csv('FLOWS.csv')
b = pd.read_csv('FLOWS2.csv')
a.to_parquet('FLOWS.parquet')
b.to_parquet('FLOWS2.parquet')
a.to_json('FLOWS.json',orient='records',lines=True)
b.to_json('FLOWS2.json',orient='records',lines=True)
csv_pandas = pd.read_csv('FLOWS.csv')
csv_arrow = pa.csv.read_csv('FLOWS.csv').to_pandas(strings_to_categorical=True,use_threads=True)
json_arrow = pa.json.read_json('FLOWS.json').to_pandas(strings_to_categorical=True,use_threads=True)
table = pq.ParquetFile('FLOWS.parquet').read(use_pandas_metadata=True)
parquet = table.to_pandas(
    strings_to_categorical=True,
    use_threads=True)
def compare(a,b):
    return (a == b) | ((a != a) & (b != b))

In [4]:
%timeit csv_pandas = pd.read_csv('FLOWS.csv')

23.1 ms ± 759 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%timeit csv_arrow = pa.csv.read_csv('FLOWS.csv').to_pandas(strings_to_categorical=True,use_threads=True)

7.32 ms ± 82.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
pa.csv.read_csv('FLOWS.csv')

pyarrow.Table
Flow Type: string
Member: string
Description: string
Transaction Value Date: string
Amount: int64
Original Disbursement Date: string
Original Arrangement Date: string

In [7]:
%%timeit
table = pq.ParquetFile('FLOWS.parquet').read(use_pandas_metadata=True)
parquet = table.to_pandas(
          strings_to_categorical=True,
          use_threads=True)

7.09 ms ± 546 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [8]:
%timeit -c csv_arrow = pa.csv.read_csv('FLOWS.csv').to_pandas(strings_to_categorical=True,use_threads=True)

20.5 ms ± 2.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%timeit json_pandas = pd.read_json('FLOWS.json',orient='records',lines=True)

137 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%%timeit
json_arrow = pa.json.read_json('FLOWS.json').to_pandas(strings_to_categorical=True,use_threads=True)

13.7 ms ± 577 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
csv_pandas.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25543 entries, 0 to 25542
Data columns (total 7 columns):
Flow Type                     25543 non-null object
Member                        25543 non-null object
Description                   25543 non-null object
Transaction Value Date        25543 non-null object
Amount                        25543 non-null int64
Original Disbursement Date    17992 non-null object
Original Arrangement Date     16787 non-null object
dtypes: int64(1), object(6)
memory usage: 9.7 MB


In [12]:
parquet.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25543 entries, 0 to 25542
Data columns (total 7 columns):
Flow Type                     25543 non-null category
Member                        25543 non-null category
Description                   25543 non-null category
Transaction Value Date        25543 non-null category
Amount                        25543 non-null int64
Original Disbursement Date    17992 non-null category
Original Arrangement Date     16787 non-null category
dtypes: category(6), int64(1)
memory usage: 1.0 MB


In [13]:
csv_arrow.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25543 entries, 0 to 25542
Data columns (total 7 columns):
Flow Type                     25543 non-null category
Member                        25543 non-null category
Description                   25543 non-null category
Transaction Value Date        25543 non-null category
Amount                        25543 non-null int64
Original Disbursement Date    25543 non-null category
Original Arrangement Date     25543 non-null category
dtypes: category(6), int64(1)
memory usage: 1.0 MB


In [14]:
json_arrow.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25542 entries, 0 to 25541
Data columns (total 7 columns):
Flow Type                     25542 non-null category
Member                        25542 non-null category
Description                   25542 non-null category
Transaction Value Date        25542 non-null category
Amount                        25542 non-null int64
Original Disbursement Date    17991 non-null category
Original Arrangement Date     16786 non-null category
dtypes: category(6), int64(1)
memory usage: 1.0 MB


In [15]:
%timeit compare(csv_pandas,csv_pandas)

29.7 ms ± 550 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%timeit compare(csv_arrow,csv_arrow)

14.4 ms ± 107 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
%timeit csv_pandas.select_dtypes(['object'], []).astype('category')

15 ms ± 57.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
r"""
# Match one value in valid CSV string.
(?!\s*$)                            # Don't match empty last value.
\s*                                 # Strip whitespace before value.
(?:                                 # Group for value alternatives.
  '([^'\\]*(?:\\[\S\s][^'\\]*)*)'   # Either $1: Single quoted string,
| "([^"\\]*(?:\\[\S\s][^"\\]*)*)"   # or $2: Double quoted string,
| ([^,'"\s\\]*(?:\s+[^,'"\s\\]+)*)  # or $3: Non-comma, non-quote stuff.
)                                   # End group of value alternatives.
\s*                                 # Strip whitespace after value.
(?:,|$)                             # Field ends on comma or EOS.
"""
link = 'https://softwareengineering.stackexchange.com/questions/166454/can-the-csv-format-be-defined-by-a-regex'

In [19]:
re_valid = r"""(?!\s*$)\s*('(?:[^'\\]*(?:\\[\S\s][^'\\]*)*)'|"(?:[^"\\]*(?:\\[\S\s][^"\\]*)*)"|(?:[^,'"\s\\]*(?:\s+[^,'"\s\\]+)*))\s*(?:,|$)"""
def csv_parser(lines):
    column_names = re.findall(re_valid,lines[0])
    columns = [[] for c in column_names]
    for i in range(1,len(lines)):
        values = re.findall(re_valid,lines[i])
        for j in range(len(column_names)):
            if j >= len(values):
                columns[j].append('')
            else:
                columns[j].append(values[j])
    return column_names,columns

In [20]:
with open('FLOWS.csv') as f:
    lines = f.readlines()
column_names,columns = csv_parser(lines)

In [21]:
column_names

['Flow Type',
 'Member',
 'Description',
 'Transaction Value Date',
 'Amount',
 'Original Disbursement Date',
 'Original Arrangement Date']

In [22]:
columns[0][:5],columns[1][:5]

(['PRGT Disbursements',
  'PRGT Disbursements',
  'PRGT Interest',
  'PRGT Disbursements',
  'PRGT Interest'],
 ['"Afghanistan, Islamic Republic of"',
  '"Afghanistan, Islamic Republic of"',
  '"Afghanistan, Islamic Republic of"',
  '"Afghanistan, Islamic Republic of"',
  '"Afghanistan, Islamic Republic of"'])

In [23]:
%timeit df = pd.DataFrame(data={column_names[i]:columns[i] for i in range(len(column_names))})

19.3 ms ± 848 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
%%timeit
table = pa.Table.from_arrays([pa.array(c) for c in columns],names=column_names)
df = table.to_pandas(strings_to_categorical=True,use_threads=True)

17.7 ms ± 406 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [25]:
%timeit table = pa.Table.from_arrays([pa.array(c) for c in columns],names=column_names)

7.66 ms ± 264 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
table = pa.Table.from_arrays([pa.array(c) for c in columns],names=column_names)
df = table.to_pandas(strings_to_categorical=True,use_threads=True)
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25543 entries, 0 to 25542
Data columns (total 7 columns):
Flow Type                     25543 non-null category
Member                        25543 non-null category
Description                   25543 non-null category
Transaction Value Date        25543 non-null category
Amount                        25543 non-null category
Original Disbursement Date    25543 non-null category
Original Arrangement Date     25543 non-null category
dtypes: category(7)
memory usage: 1.8 MB


In [27]:
table = pa.Table.from_arrays([pa.array(c) for c in columns],names=column_names)
df_pandas = pd.DataFrame(data={column_names[i]:columns[i] for i in range(len(column_names))})

In [28]:
%timeit df_pandas.to_parquet('FLOWS_.parquet')

27.3 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%timeit pq.write_table(table, 'FLOWS_.parquet')

11.1 ms ± 357 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
!ls -lh FLOWS_.parquet FLOWS.csv

-rw-r--r--@ 1 mwojton  staff   1.8M Oct 23 12:10 FLOWS.csv
-rw-r--r--  1 mwojton  staff   231K Nov  4 16:00 FLOWS_.parquet


In [31]:
countries = ['Sweden','Norway']
populations = [{'city': 'Stockholm', 
                'population': {'seniors':1,'kids':5}},
               {'city': 'Oslo', 
                'population': {'seniors':4,'kids':20}}]

In [32]:
data = [
    pa.array(countries),
    pa.array(populations)
]
table = pa.Table.from_arrays(data, 
                             names=['country','populations'])
table

pyarrow.Table
country: string
populations: struct<city: string, population: struct<kids: int64, seniors: int64>>
  child 0, city: string
  child 1, population: struct<kids: int64, seniors: int64>
      child 0, kids: int64
      child 1, seniors: int64

In [33]:
display(table.flatten().to_pandas())

Unnamed: 0,country,populations.city,populations.population
0,Sweden,Stockholm,"{'kids': 5, 'seniors': 1}"
1,Norway,Oslo,"{'kids': 20, 'seniors': 4}"


In [34]:
display(table.flatten().flatten().to_pandas())

Unnamed: 0,country,populations.city,populations.population.kids,populations.population.seniors
0,Sweden,Stockholm,5,1
1,Norway,Oslo,20,4
