In [1]:
from pandas import pandas as pd
import numpy as np
import csv, sqlite3
import logging
import cProfile

In [87]:
### utilities

def _get_col_datatypes(fin):
    """from https://stackoverflow.com/questions/2887878/importing-a-csv-file-into-a-sqlite3-database-table-using-python """
    dr = csv.DictReader(fin) # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft: break # We're done
        for field in feildslLeft:
            data = entry[field]

            # Need data to decide
            if len(data) == 0:
                continue

            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
        # TODO: Currently there's no support for DATE in sqllite

    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")

    return fieldTypes


def escapingGenerator(f):
    """from https://stackoverflow.com/questions/2887878/importing-a-csv-file-into-a-sqlite3-database-table-using-python """
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")

class DAG():
    def __init__(self):
        self.nodes = []
        self.edges = []
    def add(self, operator):
        self.nodes.append(operator)
    def __repr__(self):
        ret = ''
        for node in self.nodes:
            ret += str(node) + '\n'
        return ret

class RA_operator():
    def __init__(self, tokens):
        self.tokens = tokens
    def __repr__(self):
        ret = ''
        for token in self.tokens:
            if isinstance(token, lazyDf):
                ret += 'df '
            else:
                ret += str(token) + ' '
        return ret
        
        
class lazyDf():
    def __init__(self):
        self.DAG = DAG()
    def add_engine(self, con):
        self.con = con
    def __getitem__(self, attribute):
        print('caught []')
        self.DAG.add( RA_operator(('project', attribute)) )
        return self
    def __eq__(self, attribute):
        print('caught eq')
        self.DAG.add( RA_operator(('predicate', 'equal', attribute)) )
        return self
    def __repr__(self):
        ret = ' lazy Df '
        ret += str(self.con) + '\n'
        ret += str(self.DAG) + '\n'
        return ret

def csvToDb(csvFile, con=None, table_name=None):
    """ from https://stackoverflow.com/questions/2887878/importing-a-csv-file-into-a-sqlite3-database-table-using-python 
    with small changes """
    if table_name is None:
        table_name = 'ads'

    with open(csvFile,mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)
        fin.seek(0)
        reader = csv.DictReader(fin)
        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []
        # Set field and type
        for f in fields:
            cols.append("%s %s" % (f, dt[f]))
        # Generate create table statement:
        stmt = "CREATE TABLE " + table_name + " (%s)" % ",".join(cols)
        if con is None:
            print('Creating engine')
            con = sqlite3.connect(":memory:")
        cur = con.cursor()
        cur.execute(stmt)
        fin.seek(0)
        reader = csv.reader(escapingGenerator(fin))
        # Generate insert statement:
        stmt = "INSERT INTO " + table_name + " VALUES(%s);" % ','.join('?' * len(cols))
        cur.executemany(stmt, reader)
        con.commit()
    ret = lazyDf()
    ret.add_engine(con)
    return ret

class Profiler():
    """ ctxtM for old python without profile"""
    def __init__(self, nlines):
        self.nlines = nlines

    def __enter__(self):
        self.pr = cProfile.Profile()
        self.pr.enable()

    def __exit__(self, *args):
        import pstats, io
        self.pr.disable()
        s = io.StringIO()
        sortby = 'cumulative'
        ps = pstats.Stats(self.pr, stream=s).sort_stats(sortby)
        ps.print_stats(self.nlines)
        print(s.getvalue())
### end of utilities

In [88]:
class csv_reader():
    def __init__(self, con, table_name):
        self.con = con
        self.table_name = table_name
    def __call__(self, filename):
        return csvToDb(filename, con=self.con, table_name=self.table_name)

class lazy_pandas():
    def __init__(self):
        self.con = None
        self.db = None
        
    def __getattribute__(self, name, *args, **kwargs):
        base = pd
        TO_OVERRIDE = ['read_csv', 'con']
        if name in TO_OVERRIDE:
            print('over', name, args, kwargs)
            if name == 'read_csv':
                self.con = sqlite3.connect(':memory:')
                return csv_reader(con=self.con, table_name='client_info')
        else:
            ret = base.__getattribute__(name)
            return ret

In [93]:
number_of_test_subjects = 4
lz = lazy_pandas()
client_info = lz.read_csv('../MOCK_DATA/mock_data_1.csv')
male_clients = client_info['gender'] == 'Male'
test_male_client = client_info[ male_clients ][:number_of_test_subjects]
print(test_male_client)

print(client_info is male_clients)

over read_csv () {}
over con () {}
Creating engine
caught []
caught eq
caught []
caught []
 lazy Df <sqlite3.Connection object at 0x7f04976b3d50>
project gender 
predicate equal Male 
project df 
project slice(None, 4, None) 


True


In [94]:
number_of_test_subjects = 8
## pandas
with Profiler(2):
    client_info = pd.read_csv('../MOCK_DATA/mock_data_1.csv')
    male_clients = client_info['gender'] == 'Male'
    test_male_client = client_info[ male_clients ][:number_of_test_subjects]
    pd_out = test_male_client.ip_address

## sqlite3
with Profiler(2):
    con = sqlite3.connect(':memory:') # or file or ?
    con = csvToDb('../MOCK_DATA/mock_data_1.csv', con=con, table_name='client_info')
    query = """
    select ip_address
    from client_info
    where gender = 'Male'
    limit """ + str(number_of_test_subjects) + """
    """
    sql_out = con.execute(query)
    sql_label = [description[0] for description in sql_out.description]
    sql_data = sql_out.fetchall()

## now check
for i, (sql_x, pd_x) in enumerate(zip(sql_data, pd_out)):
    if i < number_of_test_subjects//2:
        print(sql_x[0], '|', pd_x)
    assert sql_x[0] == pd_x
print('     PASSED')

         2633 function calls (2610 primitive calls) in 0.005 seconds

   Ordered by: cumulative time
   List reduced from 346 to 2 due to restriction <2>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.003    0.003 /home/marcello/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py:503(parser_f)
        1    0.000    0.000    0.003    0.003 /home/marcello/anaconda3/lib/python3.6/site-packages/pandas/io/parsers.py:382(_read)



         390 function calls in 0.001 seconds

   Ordered by: cumulative time
   List reduced from 33 to 2 due to restriction <2>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.001    0.001 <ipython-input-87-77f4dc9149ec>:78(csvToDb)
        1    0.000    0.000    0.001    0.001 {method 'executemany' of 'sqlite3.Cursor' objects}





AttributeError: 'lazyDf' object has no attribute 'execute'

In [None]:
client_info = pd.read_csv('../MOCK_DATA/mock_data_1.csv')
works_at = pd.read_csv('../MOCK_DATA/mock_data_2.csv')

# dumb pandas
print('email company name')
print('------------------')
res = []
with Profiler(2):
    for id_w, w in enumerate(works_at.email):
        for id_c, c in enumerate(client_info.email):
            if w == c:
                res.append((works_at.email[id_w], works_at.company[id_w], client_info.first_name[id_c]))
print(res)

In [None]:
## pandas
print('email company name')
print('------------------')
res = []
with Profiler(2):
    matches = works_at.email.isin(client_info.email)
    for email, company in zip( works_at[matches].email, works_at[matches].company):
        guy = client_info[client_info.email == email]
        res.append([email, company, guy.first_name.values[0]])
print(res)


In [None]:
## sql
con = sqlite3.connect(':memory:') # or file or ?
con = csvToDb('../MOCK_DATA/mock_data_1.csv', con=con, table_name='client_info')
con = csvToDb('../MOCK_DATA/mock_data_2.csv', con=con, table_name='works_at')
with Profiler(2):
    query = """
    select works_at.email, works_at.company, client_info.last_name
    from client_info, works_at
    where client_info.email = works_at.email
    """
    sql_out = con.execute(query)
    sql_label = [description[0] for description in sql_out.description]
    sql_data = sql_out.fetchall()
print(sql_data)