In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [8]:
# Stolen from StackOverflow
# https://stackoverflow.com/questions/14996453/python-libraries-to-calculate-human-readable-filesize-from-bytes
suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
def humansize(nbytes):
    i = 0
    while nbytes >= 1024 and i < len(suffixes)-1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])


## CSV Generation
`generate(rows, generators)` generates a csv with `1000*rows` rows to `./data/out.csv` creating the directory and file if necessary and overwriting the file if it exists

Across 8 processes, workers generate 1000 lines from the generators, joining the result of each generator for each column to create a line. A listener accepts output from the workers and writes it to a file.

All columns are random integers from 1 to 999

In [16]:
from server.Generator import Generator


In [17]:
# Generate './data/out.csv' with 1000*1000 lines
Generator.generate(1000, Generator.generate_random_int_cols())


# FixedWidth CSV files
FixedWidth is a class that normalizes a csv to have columns of fixed width. It performs column operations by seeking through the fixed-width csv file.

It can compute the maximum width for each column if `tmp_dir+'widths.pickle'` does not exist, otherwise it will just load the widths using pickle.

Writing a new fixed width CSV requires a call to `write(output_csv)`, where `output_csv` is the filename of the output.

A column comparison is performed using `column_operation(self, csv, first_column, second_column, operation)`. Currently, `csv` is required because we don't keep a standard location for the normalized csv location (we could). Also, we are ignoring `operation` and just performing `==` at the moment



In [1]:
from server.FixedTable import FixedTable
import os

In [27]:
# Read the widths from scratch
if os.path.exists('./data/tmp/widths.pickle'):
    os.remove('./data/tmp/widths.pickle')
csv = FixedTable('./data/out.csv', './data/tmp/')

Splitting 1124751382 bytes between 8 processes in chunks of 70296961
Got widths in 7.65 seconds


In [5]:
# Write a normalized csv file
if os.path.exists('./data/fixed_out.csv'):
    os.remove('./data/fixed_out.csv')
csv = FixedTable('./data/out.csv', './data/tmp/')
csv.write('./data/fixed_out.csv')

Splitting 1124751382 bytes between 8 processes in chunks of 70296961
Got widths in 7.06 seconds
Wrote fixed width csv in 74.23 seconds


In [3]:
# Perform a column operation on fixed_out.csv
csv = FixedTable('./data/out.csv', './data/tmp/')
results = csv.column_operation('./data/fixed_out.csv', 'aa', 'ab', '=')

Got widths in 0.00 seconds
Performed operation in 9.709 seconds


# Scratch
Cells for reading csv line by line

In [21]:
f = open('./data/out.csv', 'rb')

In [22]:
f.readline()

b'aa,ab,ac,ad,ae,af,ah,ai,aj,ak,al,am,an,ao,ap,aq,ar,ba,bb,bc,bd,be,bf,bh,bi,bj,bk,bl,bm,bn,bo,bp,bq,br,ca,cb,cc,cd,ce,cf,ch,ci,cj,ck,cl,cm,cn,co,cp,cq,cr,da,db,dc,dd,de,df,dh,di,dj,dk,dl,dm,dn,do,dp,dq,dr,ea,eb,ec,ed,ee,ef,eh,ei,ej,ek,el,em,en,eo,ep,eq,er,fa,fb,fc,fd,fe,ff,fh,fi,fj,fk,fl,fm,fn,fo,fp,fq,fr,ha,hb,hc,hd,he,hf,hh,hi,hj,hk,hl,hm,hn,ho,hp,hq,hr,ia,ib,ic,id,ie,if,ih,ii,ij,ik,il,im,in,io,ip,iq,ir,ja,jb,jc,jd,je,jf,jh,ji,jj,jk,jl,jm,jn,jo,jp,jq,jr,ka,kb,kc,kd,ke,kf,kh,ki,kj,kk,kl,km,kn,ko,kp,kq,kr,la,lb,lc,ld,le,lf,lh,li,lj,lk,ll,lm,ln,lo,lp,lq,lr,ma,mb,mc,md,me,mf,mh,mi,mj,mk,ml,mm,mn,mo,mp,mq,mr,na,nb,nc,nd,ne,nf,nh,ni,nj,nk,nl,nm,nn,no,np,nq,nr,oa,ob,oc,od,oe,of,oh,oi,oj,ok,ol,om,on,oo,op,oq,or,pa,pb,pc,pd,pe,pf,ph,pi,pj,pk,pl,pm,pn,po,pp,pq,pr,qa,qb,qc,qd,qe,qf,qh,qi,qj,qk,ql,qm,qn,qo,qp,qq,qr,ra,rb,rc,rd,re,rf,rh,ri,rj,rk,rl,rm,rn,ro,rp,rq,rr\n'

In [23]:
f.tell()

867

In [25]:
f.close()


In [14]:
f.seek(0)
l = len(f.readline())
l1 = len(f.readline())
size = os.path.getsize('./data/fixed_out.csv') - l
rows = size / l1
print(size, l1, size / l1, rows / 16, rows / 16 * l1)

ValueError: seek of closed file

In [11]:
f = open('./data/fixed_out.csv', 'rb')

In [12]:
f.readline()

b'aa,ab,ac,ad,ae,af,ah,ai,aj,ak,al,am,an,ao,ap,aq,ar,ba,bb,bc,bd,be,bf,bh,bi,bj,bk,bl,bm,bn,bo,bp,bq,br,ca,cb,cc,cd,ce,cf,ch,ci,cj,ck,cl,cm,cn,co,cp,cq,cr,da,db,dc,dd,de,df,dh,di,dj,dk,dl,dm,dn,do,dp,dq,dr,ea,eb,ec,ed,ee,ef,eh,ei,ej,ek,el,em,en,eo,ep,eq,er,fa,fb,fc,fd,fe,ff,fh,fi,fj,fk,fl,fm,fn,fo,fp,fq,fr,ha,hb,hc,hd,he,hf,hh,hi,hj,hk,hl,hm,hn,ho,hp,hq,hr,ia,ib,ic,id,ie,if,ih,ii,ij,ik,il,im,in,io,ip,iq,ir,ja,jb,jc,jd,je,jf,jh,ji,jj,jk,jl,jm,jn,jo,jp,jq,jr,ka,kb,kc,kd,ke,kf,kh,ki,kj,kk,kl,km,kn,ko,kp,kq,kr,la,lb,lc,ld,le,lf,lh,li,lj,lk,ll,lm,ln,lo,lp,lq,lr,ma,mb,mc,md,me,mf,mh,mi,mj,mk,ml,mm,mn,mo,mp,mq,mr,na,nb,nc,nd,ne,nf,nh,ni,nj,nk,nl,nm,nn,no,np,nq,nr,oa,ob,oc,od,oe,of,oh,oi,oj,ok,ol,om,on,oo,op,oq,or,pa,pb,pc,pd,pe,pf,ph,pi,pj,pk,pl,pm,pn,po,pp,pq,pr,qa,qb,qc,qd,qe,qf,qh,qi,qj,qk,ql,qm,qn,qo,qp,qq,qr,ra,rb,rc,rd,re,rf,rh,ri,rj,rk,rl,rm,rn,ro,rp,rq,rr\r\n'

In [6]:
f.tell()

868

In [7]:
f.seek(0)
headers = len(f.readline())
row = len(f.readline())
print(headers + row + 2, f.tell())
row1 = len(f.readline())
headers, row, row1

2025 2025


(867, 1156, 1156)

In [9]:
import os
datasize = os.path.getsize('./data/fixed_out.csv') - headers
datasize


1126729410

In [38]:
import os
datasize = os.path.getsize('./data/fixed_out.csv') - headers
chunksize = datasize // 16
chunksize = chunksize // row * row


70420052

In [24]:
f.seek(1126728924)
f.readline()

'32,623,153,729, 51,702,478,774, 50,208,757,380, 66,417, 12,929,587,908,926,246,917,943,476,697,881, 55, 95,192, 69,583,419,383,370,292, 87,201,112, 15,258,193,282,163,424,592, 50,760,313,572,181\n'