In [1]:
import sys
# Add the parent directory to the module search path before the imports below;
# presumably this is what lets the project-local `credible` package resolve
# when the notebook runs from its own sub-folder -- TODO confirm layout.
sys.path.append('..')

In [2]:
import os
import time
import timeit
import pathlib
import pandas as pd
from credible import connectors

In [3]:
# Notebook configuration.
# `glob_pattern` is the file-name suffix (before '.csv') used to select input files.
# NOTE(review): `tablename` does not appear to be referenced by the visible cells,
# which name tables after each file's stem instead -- confirm before removing.
tablename, glob_pattern = 'businesses', '_x'

# Input CSVs live in the sibling `data` folder of the parent directory.
data_folderpath = os.path.join(os.pardir, 'data')

# Database handle passed as `con` to the pandas SQL helpers; presumably a
# SQLAlchemy engine or DB-API connection for SQLite -- verify in `connectors`.
engine = connectors.connect_to_sqlite()

In [18]:
def df_to_table(df, tablename, con):
    """Store ``df`` as the SQL table ``tablename`` using connection ``con``.

    Any existing table with the same name is replaced, and the DataFrame
    index is not written out as a column.
    """
    write_options = {'if_exists': 'replace', 'index': False}
    df.to_sql(tablename, con, **write_options)

def table_to_df(tablename, con):
    """Read every row and column of the SQL table ``tablename`` into a DataFrame.

    Parameters
    ----------
    tablename : str
        Name of the table to read. It is treated as a single identifier
        (not as a ``schema.table`` pair) and quoted, so names containing
        characters such as ``-`` or spaces can be read back.
    con
        Connection or engine accepted by ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        The full contents of the table.
    """
    # Quote the identifier (doubling any embedded double quotes) so that names
    # which are not bare SQL identifiers do not produce a syntax error.
    quoted = '"' + str(tablename).replace('"', '""') + '"'
    query = f'select * from {quoted};'
    return pd.read_sql_query(sql=query, con=con)

def table_info(tablename, con):
    """Return the result of SQLite's ``pragma table_info`` for ``tablename``.

    Parameters
    ----------
    tablename : str
        Name of the table to describe. It is quoted as a single identifier,
        so names containing characters such as ``-`` or spaces are accepted.
    con
        Connection or engine accepted by ``pandas.read_sql_query``.

    Returns
    -------
    pandas.DataFrame
        One row per column of the table, as reported by the pragma.
    """
    # Quote the identifier (doubling any embedded double quotes) so that names
    # which are not bare SQL identifiers do not produce a syntax error.
    quoted = '"' + str(tablename).replace('"', '""') + '"'
    query = f'pragma table_info({quoted});'
    return pd.read_sql_query(sql=query, con=con)

def wrapper(filepath, tablename, con):
    """Round-trip a CSV file through a SQL table and check nothing was lost.

    The CSV at ``filepath`` is loaded into a DataFrame, written to the table
    ``tablename`` over ``con`` (replacing any existing table), and read back.

    Parameters
    ----------
    filepath : str or path-like
        CSV file to load.
    tablename : str
        Name of the table to write and read back.
    con
        Connection or engine used for both the write and the read.

    Raises
    ------
    AssertionError
        If the column labels or the shape differ after the round trip.
    """
    df = pd.read_csv(filepath, low_memory=False, memory_map=True)

    # Use the caller-supplied connection so the function does not silently
    # depend on a notebook-global database handle.
    df_to_table(df, tablename, con)
    df_q = table_to_df(tablename, con)

    # Compare as plain lists: an elementwise Index comparison raises ValueError
    # (instead of failing the assertion) when the column counts differ.
    assert list(df.columns) == list(df_q.columns)
    assert df.shape == df_q.shape

In [19]:
# Match every '*<glob_pattern>.csv' file in the data folder.
# Path.glob yields lazily and can be consumed only once, so re-run this cell
# before re-running any cell that iterates over `generator`.
generator = pathlib.Path(data_folderpath).glob(f'*{glob_pattern}.csv')

In [20]:
# Collect one timing record (a dict) per processed file.
time_list = []
# Order the matched files by size on disk, smallest first, and keep only the
# second and third smallest ([1:3]).
# NOTE(review): the reason for skipping the smallest file and stopping after
# two files is not visible here -- confirm the slice is intentional.
for file in sorted(generator, key=os.path.getsize, reverse=False)[1:3]:
    # Time a single CSV -> table -> DataFrame round trip; the table is named
    # after the file stem. `-o` returns the timing result object, and
    # `-n 1 -r 1` runs the statement exactly once (one loop, one repeat).
    res = %timeit -o -n 1 -r 1 wrapper(file, file.stem, engine)
    time_list.append({
        'filename': file.name, 
        # With a single run, the average and the best time are the same value.
        'avg': res.average, 
        'best': res.best, 
        'compile': res.compile_time
    })

31.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
21.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [21]:
# Show the collected timings as a table: one row per file, one column per key.
pd.DataFrame(time_list)

Unnamed: 0,avg,best,compile,filename
0,31.877452,31.877452,7.7e-05,business_x.csv
1,21.708888,21.708888,6.9e-05,tip_x.csv
