# CSV to SQLite, Streaming

This notebook shows how to `crawl` a directory of `*.csv` files and import them into SQLite.

In this example, the `*.csv` files are stored in a folder relative to the notebook (for example `./data`). Each CSV file is imported into the SQLite database as a table named after the file.

To customize this notebook, change the values of the 2 variables below.

- `dir_to_csv`
- `sqlite_db`

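Use this approach when a dataset is too large to load into memory in one go: each CSV file is read and appended to its table in chunks of lines instead of all at once.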

## Create SQLite database

In [None]:
import pandas as pd
from io import StringIO
import pathlib
import sqlite3
import os

def get_sample(file_path, max_lines=100):
    """Parse the leading lines of a CSV file into a DataFrame.

    At most ``max_lines`` physical lines of ``file_path`` (the header line
    counts as one of them) are kept and handed to ``pd.read_csv``.
    """
    head = []

    with open(file_path, 'r') as src:
        for row in src:
            # Stop as soon as the requested number of lines is buffered.
            if len(head) >= max_lines:
                break
            head.append(row)

    buffer = StringIO(''.join(head))
    return pd.read_csv(buffer)
    
def get_create_sql(table, df):
    """Build a ``CREATE TABLE`` statement matching a DataFrame's columns.

    Each column's pandas dtype name is mapped to a SQLite type: names that
    start with ``int`` or ``bool`` become ``INTEGER``, names that start with
    ``float`` become ``REAL``, and everything else becomes ``TEXT``.

    Table and column names are emitted as double-quoted identifiers so that
    names containing spaces, dashes or SQL keywords (e.g. ``order``) still
    yield a valid statement.

    Args:
        table: Name of the table to create.
        df: DataFrame whose column names and dtypes define the schema.

    Returns:
        The ``CREATE TABLE`` statement as a string.
    """
    def quote(identifier):
        # SQL identifier quoting: wrap in double quotes and double any
        # embedded double quote.
        escaped = str(identifier).replace('"', '""')
        return f'"{escaped}"'

    def get_type(t):
        if t.startswith('int') or t.startswith('bool'):
            return 'INTEGER'
        if t.startswith('float'):
            return 'REAL'
        return 'TEXT'

    fields = ', '.join(
        f'{quote(name)} {get_type(str(dtype))}'
        for name, dtype in df.dtypes.items()
    )

    return f'CREATE TABLE {quote(table)} ({fields})'

def get_headers(file_path):
    """Return the CSV header line for ``file_path``, newline-terminated.

    The column names are taken from the DataFrame returned by
    ``get_sample`` and re-serialised as one CSV record. Names that contain
    a comma or a double quote are therefore quoted rather than being split
    into extra columns when this header is prepended to a chunk of rows.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        A single line of comma-separated column names ending in ``\\n``.
    """
    import csv  # local import: only this function needs it

    df = get_sample(file_path)

    buffer = StringIO()
    # lineterminator='\n' keeps the original "...\n" ending instead of the
    # csv module's default "\r\n".
    csv.writer(buffer, lineterminator='\n').writerow(list(df.columns))
    return buffer.getvalue()

def get_csv_files(dir_path):
    """Return the ``*.csv`` files directly inside ``dir_path``, sorted by path.

    Only regular files are returned: a directory whose name happens to end
    in ``.csv`` is skipped, since callers open every returned path as a
    file. Sorting makes the processing order deterministic instead of
    depending on the order the filesystem yields entries in.

    Args:
        dir_path: Directory to search (not recursive).

    Returns:
        A list of ``pathlib.Path`` objects.
    """
    candidates = pathlib.Path(dir_path).glob('*.csv')
    return sorted(path for path in candidates if path.is_file())

def delete_file_if_exists(file_path):
    """Remove ``file_path``; do nothing when it does not exist.

    The removal is attempted directly rather than after an existence
    check, so the file cannot disappear between the check and the
    ``os.remove`` call. Any other failure (e.g. the path is a directory or
    the file is locked) still raises.

    Args:
        file_path: Path of the file to delete.
    """
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass

def _append_lines(lines, table, conn):
    """Parse buffered CSV lines (header line first) and append them to ``table``."""
    pd.read_csv(StringIO(''.join(lines)), low_memory=False) \
        .to_sql(table, conn, if_exists='append', index=False)


def create_db(input_dir, output_db, flush_size=1_000_000):
    """Create a SQLite database with one table per CSV file in ``input_dir``.

    Any existing file at ``output_db`` is deleted first. For every CSV file
    a table named after the file's stem is created from a sample of the
    file, then the file is streamed line by line and appended to that table
    in chunks, so the whole file is never held in memory at once.

    NOTE: chunks are cut on physical lines, so a quoted field that contains
    a line break can be split across two chunks.

    Args:
        input_dir: Directory containing the ``*.csv`` files.
        output_db: Path of the SQLite database file to (re)create.
        flush_size: Number of buffered lines, counting the header line, at
            which a chunk is written to the database.
    """
    from contextlib import closing  # local import: only this function needs it

    delete_file_if_exists(output_db)
    pathlib.Path(output_db).touch()

    files = get_csv_files(input_dir)

    # The connection's own context manager only commits or rolls back;
    # closing() additionally closes it so the database file is released.
    with closing(sqlite3.connect(output_db)) as conn, conn:
        cur = conn.cursor()

        for file_path in files:
            print(f'processing {file_path}')
            table = file_path.stem
            sql = get_create_sql(table, get_sample(file_path))
            headers = get_headers(file_path)

            cur.execute(sql)

            with open(file_path, 'r') as fp:
                # Skip the file's own header line; every chunk starts with
                # the header built by get_headers() instead.
                next(fp, None)
                lines = [headers]

                for line in fp:
                    lines.append(line)

                    # >= rather than == so a flush_size of 1 or less still
                    # flushes instead of buffering the whole file.
                    if len(lines) >= flush_size:
                        _append_lines(lines, table, conn)
                        lines = [headers]

                # Write the remaining rows; skip when only the header is left.
                if len(lines) > 1:
                    _append_lines(lines, table, conn)
    
                
# Directory that holds the `*.csv` files to import (one table per file).
dir_to_csv = './mimic-iv-1.0'
# SQLite database file to create; an existing file at this path is replaced.
sqlite_db = 'mimic.db'

create_db(dir_to_csv, sqlite_db)
print('done')

processing mimic-iv-1.0\admissions.csv
processing mimic-iv-1.0\chartevents.csv


  if (await self.run_code(code, result,  async_=asy)):


## Verify that it works with the SQLite driver

In [None]:
from contextlib import closing

# Count the rows of the `admissions` table to confirm the import worked.
# A bare `with sqlite3.connect(...)` only ends the transaction; closing()
# also closes the connection so the database file is not left open.
with closing(sqlite3.connect('mimic.db')) as conn:
    sql = '''
    SELECT count(*) as total
    FROM admissions
    '''
    
    cur = conn.cursor()
    cur.execute(sql)
    
    # Each fetched row is a tuple; here a single row holding the count.
    items = cur.fetchall()
    for i in items:
        print(i)