In [1]:
# %load_ext autoreload

# All notebook-wide imports live in this first cell so a fresh kernel fails fast
# on a missing dependency: standard library, then third-party, then the library
# being demonstrated.
import pprint as _pprint

import numpy as np
import pandas as pd
from IPython.display import HTML, display

from tafra import Tafra

# Shorthand pretty-printer (indent=1) used later for nested tuples and lists.
# Note that this deliberately binds the name `pprint` to a function, not the module.
pprint = _pprint.PrettyPrinter(indent=1).pprint

# %autoreload 2

In [56]:
# Build the small example Tafra used by the cells below: an integer column `x`,
# an object (string) column `y`, and an integer column `z` used for grouping.
# NOTE(review): this cell's recorded execution count (56) is out of sequence with
# its neighbours (1 and 3); re-run with Restart & Run All before sharing.
t = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

# Bare last expression: display the Tafra as a table.
t

Unnamed: 0,x,y,z
,int,object,int
0.0,1,one,0
1.0,2,two,0
2.0,3,one,0
3.0,4,two,1
4.0,5,one,1
5.0,6,two,1


In [3]:
# Print a text representation of the Tafra showing its data arrays and dtypes.
t.pprint()

Tafra(
data = {
  'x': array([1, 2, 3, 4, 5, 6]),
 'y': array(['one', 'two', 'one', 'two', 'one', 'two'], dtype=object),
 'z': array([0, 0, 0, 1, 1, 1])},
dtypes = {
  'x': 'int', 'y': 'object', 'z': 'int'}
)


In [4]:
# to_list() gives one array per column; to_records() gives one tuple per row
# (collected into a tuple here so all rows are printed at once).
print('List:\n', t.to_list())
print('Record:\n', tuple(t.to_records()))

List:
 [array([1, 2, 3, 4, 5, 6]), array(['one', 'two', 'one', 'two', 'one', 'two'], dtype=object), array([0, 0, 0, 1, 1, 1])]
Record:
 ((1, 'one', 0), (2, 'two', 0), (3, 'one', 0), (4, 'two', 1), (5, 'one', 1), (6, 'two', 1))


In [5]:
# Helper that returns an object's pretty-printed form as a string (indent=2).
# NOTE(review): `pformat` is not referenced by any other cell visible in this notebook.
pformat = _pprint.PrettyPrinter(indent=2).pformat

Group By

In [6]:
# Aggregate per unique (y, z) pair: `x` is reduced with sum, and the third
# argument adds a `count` column computed with len (per the output, the number
# of rows in each group). The result has one row per group.
gb = t.group_by(
    ['y', 'z'], {'x': sum}, {'count': len}
)
print('Group By:')
gb

Group By:


Unnamed: 0,y,z,x,count
,object,int,int,int
0.0,one,0,4,2
1.0,two,0,2,1
2.0,two,1,10,2
3.0,one,1,5,1


Transform

In [7]:
# Same grouping as the group_by cell, but per the output the result keeps one row
# per input row: each row carries its group's sum of `x`, and the `id` column
# holds the same value for every row belonging to a group.
tr = t.transform(
    ['y', 'z'], {'x': sum}, {'id': max}
)
print('Transform:')
tr

Transform:


Unnamed: 0,y,z,x,id
,object,int,int,int
0.0,one,0,4,0
1.0,two,0,2,1
2.0,one,0,4,0
3.0,two,1,10,2
4.0,one,1,5,3
5.0,two,1,10,2


We can set a custom attribute, but it will no longer point to the `_data` item of the same name.

In [8]:
# Reproduce the transform by hand with iterate_by.
# `t2.id = ...` sets a custom attribute while `t2['id'] = ...` creates a column;
# they are two separate arrays, which is why they end up holding different values
# below (attribute: group size, column: group maximum of `x`).
t2 = t.copy()
t2.id = np.empty_like(t2.x)
t2['id'] = np.empty_like(t2.x)
# Each iteration yields a 3-tuple; `ix` indexes the group's rows in the full table
# and `grouped` is the sub-Tafra taken from the unmodified `t`, so the writes into
# `t2` never feed back into later groups.
for u, ix, grouped in t.iterate_by(['y', 'z']):
    t2.x[ix] = sum(grouped.x)
    t2.id[ix] = len(grouped.x)
    t2['id'][ix] = max(grouped.x)

# Store the attribute's array as a real column so it appears in the display.
t2['attr_id'] = t2.id
print('Transformed without magic:')
t2

Transformed without magic:


Unnamed: 0,x,y,z,id,attr_id
,int,object,int,int,int
0.0,4,one,0,3,2
1.0,2,two,0,2,1
2.0,4,one,0,3,2
3.0,10,two,1,6,2
4.0,5,one,1,5,1
5.0,10,two,1,6,2


Iterate By

In [9]:
# iterate_by yields one 3-tuple per unique value of `y`; the third item is the
# sub-Tafra holding that group's rows, displayed here one group at a time.
print('Iterate By:')
for u, ix, grouped in t.iterate_by(['y']):
    display(grouped)

Iterate By:


Unnamed: 0,x,y,z
,int,object,int
0.0,1,one,0
1.0,3,one,0
2.0,5,one,1


Unnamed: 0,x,y,z
,int,object,int
0.0,2,two,0
1.0,4,two,1
2.0,6,two,1


Group By in Iterate By

In [10]:
# Each group produced by iterate_by is itself a Tafra, so it can be aggregated
# again: here every `y` group is summed over `x` per value of `z`.
print('Group By in Iterate By:')
for u, ix, grouped in t.iterate_by(['y']):
    display(grouped.group_by(['z'], {'x': sum}))

Group By in Iterate By:


Unnamed: 0,z,x
,int,int
0.0,0,4
1.0,1,5


Unnamed: 0,z,x
,int,int
0.0,0,2
1.0,1,10


In [11]:
# HTML wraps a markup string so the notebook renders it. Import it from the
# public IPython.display module rather than the internal IPython.core.display path.
from IPython.display import HTML

In [12]:
# Rebuild the example Tafra (same values as the first example cell) so later
# cells start from known data.
t = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

# Render the table explicitly from the markup string returned by to_html().
HTML(t.to_html())

Unnamed: 0,x,y,z
,int,object,int
0.0,1,one,0
1.0,2,two,0
2.0,3,one,0
3.0,4,two,1
4.0,5,one,1
5.0,6,two,1


In [13]:
# Work on a copy and add an `id` column taken from a transform over `y`
# (per the output, the same value for every row of a `y` group).
_t = t.copy()
_t.update(_t.transform(['y'], {}, {'id': max}))

# `ix` indexes each group's rows in the full table, so it can be used to write a
# group-wise result back: here `x` minus the group's mean of `x`.
# NOTE(review): the output shows `x` stays int, so a non-integer mean would
# presumably be truncated on assignment — confirm that is acceptable.
for u, ix, it in t.iterate_by(['y']):
    _t['x'][ix] = it['x'] - np.mean(it['x'])

print('Index Use in Iterate By:')
_t

Index Use in Iterate By:


Unnamed: 0,x,y,z,id
,int,object,int,int
0.0,-2,one,0,0
1.0,-2,two,0,1
2.0,0,one,0,0
3.0,0,two,1,1
4.0,2,one,1,0
5.0,2,two,1,1


In [14]:
# Overwrite the first `x` of each group yielded by iterate_by, then show `t`.
# Per the recorded output the groups display 9 in their first row while `t` is
# unchanged, i.e. writes to the yielded groups are not reflected in `t`.
# NOTE(review): the printed label looks copied from an earlier cell; this cell
# does not call group_by.
print('Group By in Iterate By:')
for u, ix, it in t.iterate_by(['y']):
    it['x'][0] = 9
    display(it)

t

Group By in Iterate By:


Unnamed: 0,x,y,z
,int,object,int
0.0,9,one,0
1.0,3,one,0
2.0,5,one,1


Unnamed: 0,x,y,z
,int,object,int
0.0,9,two,0
1.0,4,two,1
2.0,6,two,1


Unnamed: 0,x,y,z
,int,object,int
0.0,1,one,0
1.0,2,two,0
2.0,3,one,0
3.0,4,two,1
4.0,5,one,1
5.0,6,two,1


In [15]:
import pandas as pd

In [16]:
# pandas reference case for the Tafra join cells that follow: 1,000 distinct
# keys on the left, each repeated 1,000 times on the right.
l = pd.DataFrame({'x': np.arange(1000)})
r = pd.DataFrame({'a': np.tile(np.arange(1000), 1000)})

# Left join on x == a; every left key matches 1,000 right rows.
l = pd.merge(l, r, how='left', left_on='x', right_on='a')

In [17]:
# Same data as the pandas merge in cell In [16], this time joined with Tafra.
# NOTE(review): `l` and `r` are rebound here from DataFrames to Tafras.
l = Tafra({
    'x': np.arange(1000),
})

r = Tafra({
    'a': np.tile(np.arange(1000), 1000),
})

# Join condition is a list of (left column, right column, operator) tuples; the
# last argument appears to select the output columns (see the later join outputs).
l = l.left_join(r, [('x', 'a', '==')], ['x', 'a'])

In [18]:
# Cross join of a 100-row Tafra with a 10,000-row Tafra (10 values tiled 1,000
# times). No output is recorded for this cell.
l = Tafra({
    'x': np.arange(100),
})

r = Tafra({
    'a': np.tile(np.arange(10), 1000),
})

# No join condition is passed; the list names the columns requested in the result.
l = l.cross_join(r, ['x', 'a'])

In [19]:
# Left join where every left key has exactly one match on the right.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 2, 3, 4, 5, 6]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Join on x == a and keep only x, y, a, b. Per the output there are 6 rows and
# `a` keeps its int dtype.
l = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,int,object
0.0,1,one,1,one
1.0,2,two,2,two
2.0,3,one,3,one
3.0,4,two,4,two
4.0,5,one,5,one
5.0,6,two,6,two


In [20]:
# Left join where the right side only has keys 1 and 2, each repeated three times.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Per the output: matched left rows are repeated once per matching right row,
# unmatched left rows keep empty `a`/`b`, and `a` is shown with dtype object.
l = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,object,object
0.0,1,one,1,one
1.0,1,one,1,two
2.0,1,one,1,one
3.0,2,two,2,two
4.0,2,two,2,one
5.0,2,two,2,two
6.0,3,one,,
7.0,4,two,,
8.0,5,one,,


In [21]:
# Left join with two conditions against two different right-hand columns.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    '_a': np.array([1, 1, 2, 2, 3, 3]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# The recorded output is consistent with both conditions having to hold
# (x == a AND x == _a): x=1 matches two right rows, x=2 matches one, and the
# remaining left rows are unmatched.
l = l.left_join(r, [('x', 'a', '=='), ('x', '_a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,object,object
0.0,1,one,1,one
1.0,1,one,1,two
2.0,2,two,2,two
3.0,3,one,,
4.0,4,two,,
5.0,5,one,,
6.0,6,two,,


In [22]:
# Left join using an inequality operator instead of equality.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 2, 2, 3, 3]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Condition is x < a: per the output, x=1 pairs with the rows where a is 2 or 3,
# x=2 pairs with the rows where a is 3, and larger x values have no match.
l = l.left_join(r, [('x', 'a', '<')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,object,object
0.0,1,one,2,one
1.0,1,one,2,two
2.0,1,one,3,one
3.0,1,one,3,two
4.0,2,two,3,one
5.0,2,two,3,two
6.0,3,one,,
7.0,4,two,,
8.0,5,one,,


In [23]:
# NOTE(review): this cell uses the same inputs and the same left_join call as
# cell In [20]; the recorded output is identical as well.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Left join on x == a, keeping x, y, a, b.
l = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,object,object
0.0,1,one,1,one
1.0,1,one,1,two
2.0,1,one,1,one
3.0,2,two,2,two
4.0,2,two,2,one
5.0,2,two,2,two
6.0,3,one,,
7.0,4,two,,
8.0,5,one,,


In [24]:
# Inner join where every left key has exactly one match on the right.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 2, 3, 4, 5, 6]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Same arguments as the left_join cells; per the output all 6 rows match.
l = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,int,object
0.0,1,one,1,one
1.0,2,two,2,two
2.0,3,one,3,one
3.0,4,two,4,two
4.0,5,one,5,one
5.0,6,two,6,two


In [25]:
# Inner join where the right side has keys 1, 2 and 3, each appearing twice.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 2, 2, 3, 3]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Per the output: each of x=1,2,3 appears twice, left rows with x=4,5,6 are
# dropped, and `a` keeps its int dtype (no empty values are introduced).
l = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,int,object
0.0,1,one,1,one
1.0,1,one,1,two
2.0,2,two,2,one
3.0,2,two,2,two
4.0,3,one,3,one
5.0,3,one,3,two


In [26]:
# Inner join where the right side only has keys 1 and 2, each repeated three times.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Per the output only x=1 and x=2 survive, three rows each; unlike the left join
# on the same data there are no unmatched rows.
l = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,int,object
0.0,1,one,1,one
1.0,1,one,1,two
2.0,1,one,1,one
3.0,2,two,2,two
4.0,2,two,2,one
5.0,2,two,2,two


In [27]:
# Inner join using the `<=` operator instead of equality.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

# Condition is x <= a: per the output x=1 pairs with all six right rows and x=2
# pairs with the three rows where a is 2.
l = l.inner_join(r, [('x', 'a', '<=')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
,int,object,int,object
0.0,1,one,1,one
1.0,1,one,1,two
2.0,1,one,1,one
3.0,1,one,2,two
4.0,1,one,2,one
5.0,1,one,2,two
6.0,2,two,2,two
7.0,2,two,2,one
8.0,2,two,2,two


In [28]:
# `_t` gained an extra `id` column in cell In [13], so its column count differs
# from `t`; the exception is caught only to display its message.
try:
    t.union(_t)
except Exception as e:
    print(e)

This `Tafra` column count does not match other `Tafra` column count.


In [29]:
# Prepare two copies for the union demos: `t2` is left as copied from `t`, while
# `t3` has its `x` column's dtype changed to float.
t2 = t.copy()
t3 = t.copy()
t3.update_dtypes({'x': float})

In [30]:
# Union with a Tafra of matching columns and dtypes; per the output the rows of
# `t2` follow the rows of `t` in the result.
t.union(t2)

Unnamed: 0,x,y,z
,int,object,int
0.0,1,one,0
1.0,2,two,0
2.0,3,one,0
3.0,4,two,1
4.0,5,one,1
5.0,6,two,1
6.0,1,one,0
7.0,2,two,0
8.0,3,one,0


In [31]:
# `t3.x` was changed to float in cell In [29]; per the printed message, union
# rejects a column whose dtype differs between the two Tafras.
try:
    t.union(t3)
except Exception as e:
    print(e)

This `Tafra` column `x` dtype `int32` does not match other `Tafra` dtype `float64`.


In [32]:
# Reset `t3` to a fresh copy of `t` (undoing the float dtype) and union it into `t2`.
# inplace=True presumably modifies `t2` itself instead of returning a new Tafra.
# NOTE(review): the recorded output lists only 6 rows for `t2` after the union;
# confirm whether the display or the in-place union is at fault.
t3 = t.copy()
t2.union(t3, inplace=True)
t2

Unnamed: 0,x,y,z
,int,object,int
0.0,1,one,0
1.0,2,two,0
2.0,3,one,0
3.0,4,two,1
4.0,5,one,1
5.0,6,two,1


In [33]:
# Writing through a slice of the Tafra: per the output the change shows up in
# `t['x']`, i.e. the slice shares its data with `t`.
x = t[:3]
x['x'][0] = 0
t['x']

array([0, 2, 3, 4, 5, 6])

In [34]:
# The same experiment with an explicit slice object; per the output the write
# again shows up in `t['x']`.
x = t[slice(0, 3)]
x['x'][0] = 7
t['x']

array([7, 2, 3, 4, 5, 6])

In [35]:
# Calling .copy() on the slice decouples it: per the output `t['x']` still starts
# with 7 after the write to `z`.
z = t[:3].copy()
z['x'][0] = 9
t['x']

array([7, 2, 3, 4, 5, 6])

In [36]:
# Selecting rows with a boolean mask: per the output the write to `a` is not
# reflected in `t['x']`, unlike the plain-slice cells above.
a = t[t['x'] <= 4]
a['x'][1] = 15
print(a['x'])
print(t['x'])

[ 2 15  4]
[7 2 3 4 5 6]


In [37]:
# Change the dtype of `x`, specified with the Python type object.
t.update_dtypes({'x': float})

In [38]:
# The same request with the dtype given as a string name.
t.update_dtypes({'x': 'float'})

In [39]:
# Misspelled dtype names are rejected; per the output the message reports every
# invalid entry, one per line.
try:
    t.update_dtypes({'x': 'flot', 'y': 'st'})
except Exception as e:
    print(e)

`flot` is not a valid dtype for `x.`
`st` is not a valid dtype for `y.`



In [40]:
# Cast `x` back to int, then update `t` from an identical copy of itself; per the
# output the values and dtypes are unchanged.
t.update_dtypes({'x': int})
o = t.copy()
t.update(o)
t

Unnamed: 0,x,y,z
,int,object,int
0.0,7,one,0
1.0,2,two,0
2.0,3,one,0
3.0,4,two,1
4.0,5,one,1
5.0,6,two,1


In [41]:
# Change the dtype on the other Tafra only, then update `t` from it; per the
# output `t.x` is float afterwards.
o.update_dtypes({'x': float})
t.update(o)
t

Unnamed: 0,x,y,z
,float,object,int
0.0,7.0,one,0
1.0,2.0,two,0
2.0,3.0,one,0
3.0,4.0,two,1
4.0,5.0,one,1
5.0,6.0,two,1


Direct assignment to the `_data` `dict()` is not recommended as it bypasses validation.

In [42]:
# Write a 5-element array straight into `_data`, bypassing validation, while the
# other columns still have 6 rows.
o._data['x'] = np.arange(5)
# Calling __post_init__ by hand surfaces the inconsistency as an exception,
# which is caught only to display its message.
try:
    o.__post_init__()
except Exception as e:
    print(e)

`Tafra` must have consistent row counts.


In [43]:
# Updating from the now-inconsistent `o` fails with a row-count message; per the
# output `t` is displayed with its previous values.
try:
    t.update(o)
except Exception as e:
    print(e)
t

Other `Tafra` must have consistent row count. This `Tafra` has 6 rows, other `Tafra` has 5 rows.


Unnamed: 0,x,y,z
,float,object,int
0.0,7.0,one,0
1.0,2.0,two,0
2.0,3.0,one,0
3.0,4.0,two,1
4.0,5.0,one,1
5.0,6.0,two,1


In [44]:
# Assign a plain Python list as a column. No exception message appears in the
# recorded output, and `x` is displayed as an int column holding 0..5.
try:
    t['x'] = list(range(6))
except Exception as e:
    print(e)
t

Unnamed: 0,x,y,z
,int,object,int
0.0,0,one,0
1.0,1,two,0
2.0,2,one,0
3.0,3,two,1
4.0,4,one,1
5.0,5,two,1


In [45]:
# Assign a column vector of shape (6, 1). No output is recorded for this cell.
# NOTE(review): presumably squeezed to 1-D on assignment, going by the
# `np.squeeze` wording of the error message in cell In [49] — confirm.
t['x'] = np.arange(6)[:, None]



In [46]:
# Assign a row vector of shape (1, 6). No exception message appears in the
# recorded output and `x` is displayed as a regular int column.
try:
    t['x'] = np.atleast_2d(np.arange(6))
except Exception as e:
    print(e)
t



Unnamed: 0,x,y,z
,int,object,int
0.0,0,one,0
1.0,1,two,0
2.0,2,one,0
3.0,3,two,1
4.0,4,one,1
5.0,5,two,1


In [47]:
# Transposing the (1, 6) array gives shape (6, 1); per the output it is stored
# as a regular int column.
t['x'] = np.atleast_2d(np.arange(6)).T
t



Unnamed: 0,x,y,z
,int,object,int
0.0,0,one,0
1.0,1,two,0
2.0,2,one,0
3.0,3,two,1
4.0,4,one,1
5.0,5,two,1


In [48]:
# NOTE(review): repeats the (1, 6) assignment from cell In [46], without
# displaying `t` afterwards; nothing is printed in the recorded output.
try:
    t['x'] = np.atleast_2d(np.arange(6))
except Exception as e:
    print(e)



In [49]:
# A genuinely 2-D array of shape (6, 2) cannot be reduced to one dimension, so
# the assignment is rejected with the ndim message shown in the output.
try:
    t['x'] = np.repeat(np.arange(6)[:, None], repeats=2, axis=1)
except Exception as e:
    print(e)

`ndarray` or `np.squeeze(ndarray)` must have ndim == 1.


In [50]:
# Rebuild the example Tafra with its original values.
t = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

# Add a day-resolution datetime64 column: integers 0..5 interpreted as days,
# which per the output are 1970-01-01 through 1970-01-06 with dtype `date`.
t['d'] = np.array([np.datetime64(_, 'D') for _ in range(6)])

t

Unnamed: 0,x,y,z,d
,int,object,int,date
0.0,1,one,0,1970-01-01
1.0,2,two,0,1970-01-02
2.0,3,one,0,1970-01-03
3.0,4,two,1,1970-01-04
4.0,5,one,1,1970-01-05
5.0,6,two,1,1970-01-06


In [51]:
# Row tuples including the date column; per the output the dates appear as
# 'YYYY-MM-DD' strings in the records.
pprint(tuple(t.to_records()))

((1, 'one', 0, '1970-01-01'),
 (2, 'two', 0, '1970-01-02'),
 (3, 'one', 0, '1970-01-03'),
 (4, 'two', 1, '1970-01-04'),
 (5, 'one', 1, '1970-01-05'),
 (6, 'two', 1, '1970-01-06'))


In [52]:
# Column arrays including the date column; per the output it is kept as a
# datetime64[D] array here rather than strings.
pprint(t.to_list())

[array([1, 2, 3, 4, 5, 6]),
 array(['one', 'two', 'one', 'two', 'one', 'two'], dtype=object),
 array([0, 0, 0, 1, 1, 1]),
 array(['1970-01-01', '1970-01-02', '1970-01-03', '1970-01-04',
       '1970-01-05', '1970-01-06'], dtype='datetime64[D]')]


In [53]:
# A column mixing integers and None; per the output it is stored with dtype object.
t = Tafra({'x': np.array([1, 2, None, 4, None])})
t

Unnamed: 0,x
,object
0.0,1
1.0,2
2.0,
3.0,4
4.0,


In [54]:
# Fill the None entries of `x` from two candidate lists passed to coalesce.
# NOTE(review): the recorded output shows 1 in rows 2 and 4, whereas the first
# candidate list holds 3 and 5 at those positions — confirm the intended
# coalesce semantics or whether this output reflects a bug.
t['x'] = t.coalesce('x', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
t

Unnamed: 0,x
,object
0.0,1
1.0,2
2.0,1
3.0,4
4.0,1
