### Use SQLite for large datasets that do not fit into memory

In [1]:
import sqlite3
import pandas as pd
# Load the complete CSV into memory once for exploration; the SQLite cell
# further down reads the same file again in chunks.
df = pd.read_csv("../data/creditcard.csv")

In [2]:
# Total footprint of the frame in MiB: per-column byte counts (index included,
# object columns measured deeply) are summed and then scaled from bytes.
bytes_per_mib = 1024 ** 2
df.memory_usage(index=True, deep=True).sum() / bytes_per_mib

67.36017608642578

In [8]:
# Show the column labels of the loaded frame.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [13]:
# Preview the three columns that are later written to the `class` SQL table.
df.loc[:, ['Time', 'Amount', 'Class']]

Unnamed: 0,Time,Amount,Class
0,0.0,149.62,0
1,0.0,2.69,0
2,1.0,378.66,0
3,1.0,123.50,0
4,2.0,69.99,0
...,...,...,...
284802,172786.0,0.77,0
284803,172787.0,24.79,0
284804,172788.0,67.88,0
284805,172788.0,10.00,0


In [4]:
# Create (or open) the SQLite database file in the current working directory.
connection = sqlite3.connect("creditcard_fraud.sqlite")

# If the data frame is too large for memory, stream the CSV in chunks instead.
# The loop variable is deliberately not called `df`, so the full frame loaded in
# an earlier cell is not silently replaced by the last chunk.
for chunk_number, chunk in enumerate(pd.read_csv("../data/creditcard.csv", chunksize=1000)):
    # Recreate both tables on the first chunk and append afterwards. With a plain
    # "append" every re-run of this cell inserts all rows a second time, and the
    # id-based join further down then returns duplicated matches.
    # Note: "replace" drops the table together with its indexes, so the
    # index-creation cell has to be run again after this one.
    write_mode = "replace" if chunk_number == 0 else "append"

    # The row index is stored as column `id` in both tables; it is the key the
    # two tables are joined on later.
    chunk[['Time', 'Amount', 'Class']].to_sql(
        "class", connection, if_exists=write_mode, index=True, index_label='id')
    chunk.drop(['Time', 'Amount', 'Class'], axis=1).to_sql(
        "variables", connection, if_exists=write_mode, index=True, index_label='id')

In [5]:
cursor = connection.cursor()
# List every table in the database. 'table' is written with single quotes:
# in SQL, double quotes denote identifiers, and SQLite only treats "table" as a
# string through a legacy fallback that can be switched off.
cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
print(cursor.fetchall())

[('class',), ('variables',)]


In [6]:
# Run a small query against `class`; only the result-set metadata is used here.
cursor.execute('SELECT * FROM class LIMIT 2;')
# The first field of every description entry is the column name.
names = [column_name for column_name, *_ in cursor.description]
print(names)

['id', 'Time', 'Amount', 'Class']


In [7]:
# Index the `id` column of both tables (the column the later query joins on).
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE INDEX raises
# "index ... already exists" the second time it is executed.
cursor.execute("CREATE INDEX IF NOT EXISTS c_id ON class(id)")

cursor.execute("CREATE INDEX IF NOT EXISTS v_id ON variables(id)")

<sqlite3.Cursor at 0x7efc8aabcf10>

In [12]:
# Per class: mean Amount and mean V5 over all rows with Amount > 100, computed
# inside SQLite by joining the two tables on their shared `id` column.
mean_by_class_sql = """\
            SELECT Class, AVG(Amount) AS mean_amount, AVG(V5) AS mean_v5
            FROM variables v
            INNER JOIN class c
            ON v.id = c.id
            WHERE c.Amount > 100
            GROUP BY Class
            """
pd.read_sql(mean_by_class_sql, connection)

Unnamed: 0,Class,mean_amount,mean_v5
0,0,348.227734,-0.626036
1,1,408.528538,-3.28223


### Flatten (unlist) a list of lists

In [None]:
import random
import string
import numpy as np
# Seed both generators so a fresh kernel reproduces the same sample:
# `random` picks the characters, NumPy picks the string lengths.
# (Outputs saved from an earlier, unseeded run will differ.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

ALPHABET = string.ascii_uppercase + string.digits
# 1000 rows of 4 strings each; every string has a random length from 1 to 5
# (the upper bound of np.random.randint is exclusive).
list_of_lists_of_strings = [
    [''.join(random.choices(ALPHABET, k=length)) for length in np.random.randint(1, 6, 4)]
    for _ in range(1000)
]
display(list_of_lists_of_strings[0:5])

[['9CHT', 'P4K', 'L5FI', 'RDS3N'],
 ['QF', '8', 'S0U0', '04E'],
 ['FQ', 'ZB', 'SAZ9', 'TEZL'],
 ['JO', 'L3EG', '7', '2J9'],
 ['0K', 'MN', 'MS', 'M']]

In [None]:
from itertools import chain
# Flatten with a nested comprehension: the outer loop walks the sub-lists,
# the inner loop walks the strings of each sub-list.
unlisted_list_comprehension = [
    text for sublist in list_of_lists_of_strings for text in sublist
]
# Same result via itertools. from_iterable walks the outer list lazily, whereas
# chain(*list_of_lists_of_strings) first unpacks it into one call argument per
# sub-list.
unlisted_itertools = list(chain.from_iterable(list_of_lists_of_strings))
display(unlisted_list_comprehension[50:55])
display(unlisted_itertools[50:55])

['RTRZ9', '0W27', 'EHV', '34', 'TVFB']

['RTRZ9', '0W27', 'EHV', '34', 'TVFB']

In [None]:
# Time both flattening approaches on the same input: first the nested list
# comprehension, then itertools.chain with argument unpacking.
%timeit [s for l in list_of_lists_of_strings for s  in l]
%timeit list(chain(*list_of_lists_of_strings))

185 µs ± 111 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
91.7 µs ± 11.7 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
