In [1]:
# %load_ext autoreload

import numpy as np
import pandas as pd
from tafra import Tafra

# %autoreload 2

In [2]:
# Build the demo Tafra from a dict of equal-length NumPy arrays: a float
# column, an object (string) column and an int column.
t = Tafra({
    'x': np.array([1., 2., 3., 4., 5., 6.]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

# Bare last expression so the notebook renders the table.
t

Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


In [3]:
# Hand the Tafra's column mapping (`t.data`, a dict of name -> array) to
# pandas to build an equivalent DataFrame.
df = pd.DataFrame(t.data)

df

Unnamed: 0,x,y,z
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


In [4]:
# Convert the pandas DataFrame back into a Tafra.
Tafra.as_tafra(df)

Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


Print Options

In [5]:
# print() shows the single-line text form: data, dtypes and row count.
print(t)

Tafra(data={'x': array([1., 2., 3., 4., 5., 6.]), 'y': array(['one', 'two', 'one', 'two', 'one', 'two'], dtype=object), 'z': array([0, 0, 0, 1, 1, 1])}, dtypes={'x': 'float', 'y': 'object', 'z': 'int'}, rows=6)


In [6]:
# Multi-line pretty-print using the default indent and width.
t.pprint()

Tafra(data = {
 'x': array([1., 2., 3., 4., 5., 6.]),
 'y': array(['one', 'two', 'one', 'two', 'one', 'two']),
 'z': array([0, 0, 0, 1, 1, 1])},
dtypes = {
 'x': 'float', 'y': 'object', 'z': 'int'},
rows = 6)


In [7]:
# Same pretty-print with a 4-space indent and a 140-character line width.
t.pprint(indent=4, width=140)

Tafra(data = {
    'x': array([1., 2., 3., 4., 5., 6.]), 'y': array(['one', 'two', 'one', 'two', 'one', 'two']), 'z': array([0, 0, 0, 1, 1, 1])},
dtypes = {
    'x': 'float', 'y': 'object', 'z': 'int'},
rows = 6)


In [8]:
# Show only the first five rows.
t.head(5)

Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1


Group By:

In [9]:
# Group on the unique (y, z) pairs. The first mapping aggregates column `x`
# with sum; the second mapping adds a `count` column produced with len.
gb = t.group_by(
    ['y', 'z'], {'x': sum}, {'count': len}
)

gb

Unnamed: 0,y,z,x,count
dtype,object,int,float,int
0,one,0,4.0,2
1,two,0,2.0,1
2,two,1,10.0,2
3,one,1,5.0,1


In [10]:
# Same grouping, plus the tuple form `(function, source_column)`, which
# writes the aggregate of `x` to a differently named output column `new_x`.
gb = t.group_by(
    ['y', 'z'], {
        'x': sum,
        'new_x': (sum, 'x')
    }, {'count': len}
)

gb

Unnamed: 0,y,z,x,new_x,count
dtype,object,int,float,float,int
0,one,0,4.0,4.0,2
1,two,0,2.0,2.0,1
2,two,1,10.0,10.0,2
3,one,1,5.0,5.0,1


Transform

In [11]:
# transform takes the same arguments as group_by, but (as the output shows)
# the result keeps one row per original row, with each group's aggregate
# repeated for every member of that group.
tr = t.transform(
    ['y', 'z'], {'x': sum}, {'id': max}
)

tr

Unnamed: 0,y,z,x,id
dtype,object,int,float,int
0,one,0,4.0,0
1,two,0,2.0,1
2,one,0,4.0,0
3,two,1,10.0,2
4,one,1,5.0,3
5,two,1,10.0,2


We can set a custom attribute, but it does not point to the `data` item of the same name.

In [12]:
# Contrast attribute assignment with item assignment: `t2.id` is a plain
# attribute set on the object, while `t2['id']` is a data column. The two
# are filled with different values below to show they are independent.
t2 = t.copy()
t2.id = np.empty(t2.rows, dtype=int)
t2['id'] = np.empty(t2.rows, dtype=int)
# The loop position is not needed, so iterate over the groups directly.
for u, ix, grouped in t.iterate_by(['y', 'z']):
    t2['x'][ix] = sum(grouped['x'])
    t2.id[ix] = len(grouped['x'])      # attribute array: number of rows in the group
    t2['id'][ix] = max(grouped['x'])   # data column: largest x in the group

print(f't2.id = array({t2.id})')
t2

t2.id = array([2 1 2 2 1 2])


Unnamed: 0,x,y,z,id
dtype,float,object,int,int
0,4.0,one,0,3
1,2.0,two,0,2
2,4.0,one,0,3
3,10.0,two,1,6
4,5.0,one,1,5
5,10.0,two,1,6


Iterate By

In [13]:
# iterate_by yields a 3-tuple per unique value of `y`; the third item is the
# sub-Tafra holding that group's rows, which is displayed here.
print('Iterate By:')
for u, ix, grouped in t.iterate_by(['y']):
    display(grouped)

Iterate By:


Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,3.0,one,0
2,5.0,one,1


Unnamed: 0,x,y,z
dtype,float,object,int
0,2.0,two,0
1,4.0,two,1
2,6.0,two,1


Group By in Iterate By

In [14]:
# Each sub-Tafra yielded by iterate_by is itself a Tafra, so it can be
# grouped again: here by `z`, summing `x`, once per value of `y`.
print('Group By in Iterate By:')
for u, ix, grouped in t.iterate_by(['y']):
    display(grouped.group_by(['z'], {'x': sum}))

Group By in Iterate By:


Unnamed: 0,z,x
dtype,int,float
0,0,4.0
1,1,5.0


Unnamed: 0,z,x
dtype,int,float
0,0,2.0
1,1,10.0


In [15]:
# Work on a copy and merge in an `id` column produced by transform.
_t = t.copy()
_t.update(_t.transform(['y'], {}, {'id': max}))

# `ix` indexes the full-length column, so each group's de-meaned `x` values
# are written back to the rows of `_t` that the group came from.
for u, ix, it in t.iterate_by(['y']):
    _t['x'][ix] = it['x'] - np.mean(it['x'])

print('Index Use in Iterate By:')
_t

Index Use in Iterate By:


Unnamed: 0,x,y,z,id
dtype,float,object,int,int
0,-2.0,one,0,0
1,-2.0,two,0,1
2,0.0,one,0,0
3,0.0,two,1,1
4,2.0,one,1,0
5,2.0,two,1,1


In [16]:
# NOTE(review): the label printed below repeats an earlier cell's heading,
# but this cell does no grouping; it mutates each yielded sub-Tafra and then
# shows `_t`, which the recorded output shows unchanged. Consider relabelling.
print('Group By in Iterate By:')
_t = t.copy()
for u, ix, it in _t.iterate_by(['y']):
    # Overwrite the first `x` value of this group's sub-Tafra only.
    it['x'][0] = 9
    display(it)

_t

Group By in Iterate By:


Unnamed: 0,x,y,z
dtype,float,object,int
0,9.0,one,0
1,3.0,one,0
2,5.0,one,1


Unnamed: 0,x,y,z
dtype,float,object,int
0,9.0,two,0
1,4.0,two,1
2,6.0,two,1


Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


In [17]:
# pandas baseline for the Tafra join that follows: a 1,000-row left frame is
# left-merged against a right frame in which every key occurs 1,000 times.
l = pd.DataFrame({'x': np.arange(1000)})
r = pd.DataFrame({'a': np.tile(np.arange(1000), 1000)})

l = pd.merge(l, r, how='left', left_on=['x'], right_on=['a'])

In [18]:
# Tafra counterpart of the pandas merge: same 1,000-row left side and
# 1,000,000-row right side.
l = Tafra({
    'x': np.arange(1000),
})

r = Tafra({
    'a': np.tile(np.arange(1000), 1000),
})

# Join condition is given as (left column, right column, operator); the last
# argument lists the columns to keep in the result.
l = l.left_join(r, [('x', 'a', '==')], ['x', 'a'])

In [19]:
# Cross join of a 100-row left Tafra with a 10,000-row right Tafra,
# keeping columns `x` and `a`.
l = Tafra({
    'x': np.arange(100),
})

r = Tafra({
    'a': np.tile(np.arange(10), 1000),
})

l = l.cross_join(r, ['x', 'a'])

In [20]:
# Left join on x == a where every key in `a` is unique and matches one `x`;
# only columns x, y, a, b are kept.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 2, 3, 4, 5, 6]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,int,object
0,1,one,1,one
1,2,two,2,two
2,3,one,3,one
3,4,two,4,two
4,5,one,5,one
5,6,two,6,two


In [21]:
# Left join on x == a where the right keys repeat (three 1s, three 2s) and
# cover only some of the left keys, so unmatched left rows get empty a/b.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,object,object
0,1,one,1,one
1,1,one,1,two
2,1,one,1,one
3,2,two,2,two
4,2,two,2,one
5,2,two,2,two
6,3,one,,
7,4,two,,
8,5,one,,


In [22]:
# Left join with two equality conditions: a right row matches only when
# both x == a and x == _a hold.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    '_a': np.array([1, 1, 2, 2, 3, 3]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.left_join(r, [('x', 'a', '=='), ('x', '_a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,object,object
0,1,one,1,one
1,1,one,1,two
2,2,two,2,two
3,3,one,,
4,4,two,,
5,5,one,,
6,6,two,,


In [23]:
# Left join on an inequality: per the output, the '<' operator pairs each
# left row with the right rows where x < a.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 2, 2, 3, 3]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.left_join(r, [('x', 'a', '<')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,object,object
0,1,one,2,one
1,1,one,2,two
2,1,one,3,one
3,1,one,3,two
4,2,two,3,one
5,2,two,3,two
6,3,one,,
7,4,two,,
8,5,one,,


In [24]:
# Left join on x == a with repeated right keys.
# NOTE(review): inputs and join are identical to an earlier left-join cell
# in this notebook; possibly redundant, confirm before removing.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.left_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,object,object
0,1,one,1,one
1,1,one,1,two
2,1,one,1,one
3,2,two,2,two
4,2,two,2,one
5,2,two,2,two
6,3,one,,
7,4,two,,
8,5,one,,


In [25]:
# Inner join on x == a where every key is unique on both sides.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 2, 3, 4, 5, 6]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,int,object
0,1,one,1,one
1,2,two,2,two
2,3,one,3,one
3,4,two,4,two
4,5,one,5,one
5,6,two,6,two


In [26]:
# Inner join on x == a where each right key (1, 2, 3) occurs twice, so each
# matching left row appears twice and unmatched left rows are dropped.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 2, 2, 3, 3]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,int,object
0,1,one,1,one
1,1,one,1,two
2,2,two,2,one
3,2,two,2,two
4,3,one,3,one
5,3,one,3,two


In [27]:
# Inner join on x == a where the right keys are three 1s and three 2s, so
# only left rows with x in {1, 2} survive, three times each.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.inner_join(r, [('x', 'a', '==')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,int,object
0,1,one,1,one
1,1,one,1,two
2,1,one,1,one
3,2,two,2,two
4,2,two,2,one
5,2,two,2,two


In [28]:
# Inner join on an inequality: per the output, '<=' pairs each left row
# with the right rows where x <= a.
l = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

r = Tafra({
    'a': np.array([1, 1, 1, 2, 2, 2]),
    'b': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'c': np.array([0, 0, 0, 1, 1, 1])
})

l = l.inner_join(r, [('x', 'a', '<=')], ['x', 'y', 'a', 'b'])
l

Unnamed: 0,x,y,a,b
dtype,int,object,int,object
0,1,one,1,one
1,1,one,1,two
2,1,one,1,one
3,1,one,2,two
4,1,one,2,one
5,1,one,2,two
6,2,two,2,two
7,2,two,2,one
8,2,two,2,two


In [29]:
# Attempt a union inside try/except so that a failure is printed rather
# than raised.
# NOTE(review): `_t` was last rebound to a plain `t.copy()` earlier in the
# notebook and no message was recorded for this cell; confirm this is the
# case the cell was meant to demonstrate.
t0 = t.copy()
try:
    t0.union(_t)
except Exception as e:
    print(e)

In [30]:
# Two fresh copies of `t` for the union examples that follow.
t2 = t.copy()
t3 = t.copy()
# NOTE(review): `x` was built from float literals, so requesting float here
# presumably leaves the dtype unchanged; confirm the intended target dtype.
t3.update_dtypes({'x': float})

In [31]:
# Union without `inplace`: the result is displayed, not stored.
t0.union(t2)

Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1
6,1.0,one,0
7,2.0,two,0
8,3.0,one,0


In [32]:
# Union with `t3` after its dtype update, guarded so an error would be
# printed. NOTE(review): no message was recorded for this cell.
try:
    t0.union(t3)
except Exception as e:
    print(e)

In [33]:
# Union with inplace=True, which asks for `t2` itself to be modified;
# `t2` is then displayed.
t3 = t.copy()
t2.union(t3, inplace=True)
t2

Unnamed: 0,x,y,z
dtype,float,object,int
0,1.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


In [34]:
# Write through a row slice: the output shows the new value in `t0` too,
# i.e. the slice shares its data with the original.
x = t0[:3]
x['x'][0] = 0
t0['x']

array([0., 2., 3., 4., 5., 6.])

In [35]:
# An explicit slice object behaves the same way: the write shows up in `t0`.
x = t0[slice(0, 3)]
x['x'][0] = 7
t0['x']

array([7., 2., 3., 4., 5., 6.])

In [36]:
# Copying the slice first decouples it: the output shows `t0` unchanged.
z = t0[:3].copy()
z['x'][0] = 9
t0['x']

array([7., 2., 3., 4., 5., 6.])

In [37]:
# Selecting rows with a boolean mask: the write lands in `a` only, and the
# second print shows `t0` unchanged.
a = t0[t0['x'] <= 4]
a['x'][1] = 15
print(a['x'])
print(t0['x'])

[ 2. 15.  4.]
[7. 2. 3. 4. 5. 6.]


In [38]:
# The target dtype can be given as a Python type...
t0.update_dtypes({'x': float})

In [39]:
# ...or as a dtype name string.
t0.update_dtypes({'x': 'float'})

In [40]:
# Misspelled dtype names are rejected; the printed message reports each
# invalid column/dtype pair.
try:
    t0.update_dtypes({'x': 'flot', 'y': 'st'})
except Exception as e:
    print(e)

`flot` is not a valid dtype for `x.`
`st` is not a valid dtype for `y.`



In [41]:
# Cast `x` to int, then update `t0` from a copy of itself and display it.
t0.update_dtypes({'x': int})
o = t0.copy()
t0.update(o)
t0

Unnamed: 0,x,y,z
dtype,int,object,int
0,7,one,0
1,2,two,0
2,3,one,0
3,4,two,1
4,5,one,1
5,6,two,1


In [42]:
# Cast the copy's `x` back to float and update `t0` from it; the output
# shows `t0` taking on the float dtype.
o.update_dtypes({'x': float})
t0.update(o)
t0

Unnamed: 0,x,y,z
dtype,float,object,int
0,7.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


Direct assignment to the `_data` `dict()` is not recommended as it bypasses validation.

In [43]:
# Put a 5-element array straight into the backing dict, then re-run the
# validation hook by hand to show the row-count check failing.
o._data['x'] = np.arange(5)
try:
    o.__post_init__()
except Exception as e:
    print(e)

`Tafra` must have consistent row counts.


In [44]:
# Updating from the now-inconsistent `o` is rejected with a row-count
# message; `t0` is then displayed.
try:
    t0.update(o)
except Exception as e:
    print(e)
t0

Other `Tafra` must have consistent row count. This `Tafra` has 6 rows, other `Tafra` has 5 rows.


Unnamed: 0,x,y,z
dtype,float,object,int
0,7.0,one,0
1,2.0,two,0
2,3.0,one,0
3,4.0,two,1
4,5.0,one,1
5,6.0,two,1


In [45]:
try:
    t0['x'] = list(range(6))
except Exception as e:
    print(e)
t0

Unnamed: 0,x,y,z
dtype,int,object,int
0,0,one,0
1,1,two,0
2,2,one,0
3,3,two,1
4,4,one,1
5,5,two,1


In [46]:
# Assign a (6, 1) column vector rather than a 1-D array.
t0['x'] = np.arange(6)[:, None]



In [47]:
# np.atleast_2d turns the 1-D range into a (1, 6) row vector; assign it
# inside try/except and display the result.
try:
    t0['x'] = np.atleast_2d(np.arange(6))
except Exception as e:
    print(e)
t0



Unnamed: 0,x,y,z
dtype,int,object,int
0,0,one,0
1,1,two,0
2,2,one,0
3,3,two,1
4,4,one,1
5,5,two,1


In [48]:
# Transposing the row vector gives a (6, 1) array; assign it and display.
t0['x'] = np.atleast_2d(np.arange(6)).T
t0



Unnamed: 0,x,y,z
dtype,int,object,int
0,0,one,0
1,1,two,0
2,2,one,0
3,3,two,1
4,4,one,1
5,5,two,1


In [49]:
# Same (1, 6) assignment as an earlier cell, without displaying `t0`.
# NOTE(review): looks redundant with that cell; confirm before removing.
try:
    t0['x'] = np.atleast_2d(np.arange(6))
except Exception as e:
    print(e)



In [50]:
# A (6, 2) array cannot be squeezed to one dimension, so the assignment is
# rejected and the error message is printed.
try:
    t0['x'] = np.repeat(np.arange(6)[:, None], repeats=2, axis=1)
except Exception as e:
    print(e)

`ndarray` or `np.squeeze(ndarray)` must have ndim == 1.


In [51]:
# Fresh Tafra with int, object and int columns for the date/export examples.
t4 = Tafra({
    'x': np.array([1, 2, 3, 4, 5, 6]),
    'y': np.array(['one', 'two', 'one', 'two', 'one', 'two'], dtype='object'),
    'z': np.array([0, 0, 0, 1, 1, 1])
})

# Six consecutive days starting at the Unix epoch. Casting the integer range
# to datetime64[D] interprets each value as a day offset from 1970-01-01,
# giving the same array as building one np.datetime64 per element.
t4['d'] = np.arange(6).astype('datetime64[D]')

t4

Unnamed: 0,x,y,z,d
dtype,int,object,int,date
0,1,one,0,1970-01-01
1,2,two,0,1970-01-02
2,3,one,0,1970-01-03
3,4,two,1,1970-01-04
4,5,one,1,1970-01-05
5,6,two,1,1970-01-06


In [52]:
# Materialise the records as a tuple: one tuple of column values per row.
tuple(t4.to_records())

((1, 'one', 0, '1970-01-01'),
 (2, 'two', 0, '1970-01-02'),
 (3, 'one', 0, '1970-01-03'),
 (4, 'two', 1, '1970-01-04'),
 (5, 'one', 1, '1970-01-05'),
 (6, 'two', 1, '1970-01-06'))

In [53]:
# Column-wise export: a list with one array per column.
t4.to_list()

[array([1, 2, 3, 4, 5, 6]),
 array(['one', 'two', 'one', 'two', 'one', 'two'], dtype=object),
 array([0, 0, 0, 1, 1, 1]),
 array(['1970-01-01', '1970-01-02', '1970-01-03', '1970-01-04',
        '1970-01-05', '1970-01-06'], dtype='datetime64[D]')]

In [54]:
# A column mixing ints and None is stored with object dtype.
t5 = Tafra({'x': np.array([1, 2, None, 4, None])})
t5

Unnamed: 0,x
dtype,object
0,1
1,2
2,
3,4
4,


In [55]:
# Fill the None entries of `x` from the fallback lists; per the output, the
# missing positions take the values 3 and 5 from the first list.
t5['x'] = t5.coalesce('x', [[1, 2, 3, None, 5], [None, None, None, None, 'five']])
t5

Unnamed: 0,x
dtype,object
0,1
1,2
2,3
3,4
4,5
