# Tables
Module to simplify handling of input and output tables (as .csv files).  

In [1]:
from __future__ import print_function

from salib import extend
import pandas as pd
import os, os.path
import StringIO
import hashlib
from IPython.core.magic import register_cell_magic
import re

## class Table

In [2]:
class Table(pd.DataFrame):

    """A Table is just like a pandas DataFrame except that it can also
    carry a table name, a data set name, and a file name - the latter two
    describing the source of the data.

    The three extra attributes are optional and are only set on the
    instance when a non-None value is supplied to the constructor:

    * ``dsname``    - data set name; listed in ``_metadata``.
    * ``tablename`` - table name; registered as an internal name.
    * ``filename``  - source file name; registered as an internal name.

    Per the pandas subclassing conventions, names in ``_metadata`` are
    handed on to objects derived from this one, while names in
    ``_internal_names`` are not.
    """

    # Registering the names lets them be assigned as plain attributes
    # instead of being mistaken for column assignments.
    _internal_names = pd.DataFrame._internal_names + ['filename','tablename']
    _internal_names_set = set(_internal_names)

    _metadata = ['dsname']

    def __init__(self,*args,**kwargs):
        """Create a Table.

        Accepts every positional and keyword argument of pd.DataFrame,
        plus the optional keywords `dsname`, `tablename` and `filename`,
        which are removed from `kwargs` before DataFrame sees them.
        """
        dsname = kwargs.pop('dsname',None)
        tablename = kwargs.pop('tablename',None)
        filename = kwargs.pop('filename',None)
        # Name the class explicitly: super(self.__class__, self) resolves to
        # this very __init__ again whenever Table is subclassed, which
        # recurses until the interpreter gives up.
        super(Table,self).__init__(*args,**kwargs)
        if dsname is not None:
            self.dsname = dsname
        if tablename is not None:
            self.tablename = tablename
        if filename is not None:
            self.filename = filename

    @property
    def _constructor(self):
        """Class pandas should use when building new objects from this one."""
        return self.__class__

In [3]:
##test:
# Build a small three-column Table.  `tablename` and `dsname` are the
# Table-specific keywords; `data` and `columns` go through to DataFrame.
t = Table(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],
          columns=['I','F','S'],tablename='Test',dsname='Notebook')
t

Unnamed: 0,I,F,S
0,10,20,a
1,11,22,b
2,12,23,c


In [4]:
##test:
t.dtypes

I      int64
F    float64
S     object
dtype: object

In [5]:
##test:
t.tablename, t.dsname

('Test', 'Notebook')

In [6]:
##test:
t2 = t[['S','I']]
t2

Unnamed: 0,S,I
0,a,10
1,b,11
2,c,12


In [7]:
##test:
# t2 was derived from t by column selection; check which of the extra
# attributes were carried over to the derived object.
hasattr(t2,'tablename'), hasattr(t2,'dsname')

(False, True)

In [8]:
##test:
t2.dsname

'Notebook'

In [9]:
##test:
# Wrap an existing DataFrame with copy=False.  The next cells modify u and
# then display t, to see whether the two share the same underlying data.
t = pd.DataFrame(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],columns=['I','F','S'])
u = Table(data=t,dsname='foo',copy=False)
u

Unnamed: 0,I,F,S
0,10,20,a
1,11,22,b
2,12,23,c


In [10]:
##test:
u['F'] *= 3
u

Unnamed: 0,I,F,S
0,10,60,a
1,11,66,b
2,12,69,c


In [11]:
##test:
t

Unnamed: 0,I,F,S
0,10,60,a
1,11,66,b
2,12,69,c


In [12]:
##test:
u.dsname

'foo'

## class DataSource
Class to unify the source of tables.  For now, this assumes that a table:
 * has been specified directly via '`.set_table`', or
 * has been provided in CSV form using the cell magic '`%%Table`', or
 * is in a CSV file available in a directory '`<root>/xyz.d`',  where '`xyz`' is the 
   'data set name'.  
   
Eventually, we will have a way of archiving sets of files in .zip files.

In [13]:
class DataSource(object):

    # Class-level defaults, copied onto the instance when it is created.
    ROOT = 'data'     # default root directory
    DSNAME = None     # default data set name
    # Source type: 'dir' for now.  'cell' (CSV text from the %%Table cell
    # magic) and 'data' (dataframes given directly) are the other values
    # in use; 'zip' archives may be allowed someday.
    DSTYPE = 'dir'
    CELLDATA = {}     # csv text from %%Table magic cells, indexed by table name
    TABLES = {}       # dataframes directly provided by client, indexed by table name

    DATASOURCE = None # the one and only data source

    def __init__(self):
        """Create the single DataSource instance and record it on the class.

        Raises ValueError if an instance has already been recorded in
        DATASOURCE.
        """
        klass = type(self)
        if klass.DATASOURCE is not None:
            raise ValueError("Can only create one instance of class '{}'".format(klass.__name__))
        self.root, self.dsname = klass.ROOT, klass.DSNAME
        self.prefix = None
        self.dstype = klass.DSTYPE
        # The instance starts out sharing the class-level dictionaries.
        self.celldata, self.tables = klass.CELLDATA, klass.TABLES
        klass.DATASOURCE = self

In [14]:
##test:
d = DataSource()
vars(d)

{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [15]:
##test:
# A DataSource already exists (d), so a second construction is expected to
# raise; the message is printed and d2 is set to None in that case.
try:
    d2 = DataSource()
except Exception as e:
    print('*'*5,e)
    d2 = None
d2

***** Can only create one instance of class 'DataSource'


In [26]:
@extend
class DataSource:

    # Further DataSource methods.  The `extend` decorator (from salib) is
    # presumably what attaches them to the DataSource class defined
    # earlier - confirm against salib.  The classmethods all operate on
    # the single instance recorded in DataSource.DATASOURCE.

    @classmethod
    def set_root(cls,newroot):
        """Set the root directory to `newroot`.

        Raises ValueError if `newroot` does not exist; the current root is
        left unchanged in that case.
        """
        self = cls.DATASOURCE
        if not os.path.exists(newroot):
            raise ValueError("Root '{}' does not exist.".format(newroot))
        self.root = newroot

    @classmethod
    def set_source(cls,dsname,dstype=None):
        """Select data set `dsname` and forget previously supplied tables.

        If `dstype` is not given it becomes 'dir' when the directory
        '<root>/<dsname>.d' exists.  Otherwise it is 'unknown', which is
        then rejected like any other invalid type.

        Raises ValueError if `dstype` is not one of 'dir', 'cell', 'data'.
        """
        self = cls.DATASOURCE
        if dstype is None:
            dirname = self.root + '/' + dsname + '.d'
            if os.path.exists(dirname):
                dstype = 'dir'
            else:
                dstype = 'unknown'
        if dstype not in ['dir','cell','data']:
            raise ValueError("dstype '{}' is invalid.".format(dstype))
        self.dsname = dsname
        self.dstype = dstype
        # Fresh dictionaries: tables and cell text belonging to the
        # previous source must not leak into the new one.
        self.celldata = {}
        self.tables = {}

    @classmethod
    def set_table(cls,tablename,table):
        """Supply `table` directly as the data for `tablename`.

        The object is stored as given.  Any cell text registered under the
        same name is discarded so the two sources cannot disagree.
        """
        self = cls.DATASOURCE
        self.tables[tablename] = table
        if tablename in self.celldata:
            del self.celldata[tablename]

    @classmethod
    def set_celldata(cls,tablename,celltext):
        """Supply CSV text `celltext` as the data for `tablename`.

        Any directly supplied table of the same name is discarded.
        """
        self = cls.DATASOURCE
        self.celldata[tablename] = celltext
        if tablename in self.tables:
            del self.tables[tablename]

    def _file_name(self,tablename,prefix=None):
        """Return the CSV path for `tablename` in the current data set.

        The path is '<root>/<dsname>.d/<tablename>.csv', or
        '<root>/<dsname>.d/<prefix>/<tablename>.csv' when a non-empty
        `prefix` is given.
        """
        n = tablename
        if prefix:
            n = prefix + '/' + tablename
        return self.root + '/' + self.dsname + '.d/' + n + '.csv'

In [27]:
##test:
# Clear the class-level slot first; otherwise DataSource() would refuse to
# create another instance for the tests that follow.
DataSource.DATASOURCE = None
ds = DataSource()
vars(ds)

{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [28]:
##test:
try:
    DataSource.set_root('foo')
except Exception as e:
    print('*'*5,e)
vars(ds)

***** Root 'foo' does not exist.


{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [32]:
##test:
DataSource.set_root('img')
vars(ds)

{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'img',
 'tables': {}}

In [34]:
##test:
DataSource.set_root('data')

In [38]:
##test:
DataSource.set_source('frame-1')
vars(ds)

{'celldata': {},
 'dsname': 'frame-1',
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [39]:
##test:
# set_table stores the object exactly as given (here a list of dicts),
# as the 'tables' entry in the vars() display shows.
DataSource.set_table('joints',[dict(NODEID='A',X=10,Y=20),dict(NODEID='B',Y=20,X=30)])
vars(ds)

{'celldata': {},
 'dsname': 'frame-1',
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {'joints': [{'NODEID': 'A', 'X': 10, 'Y': 20},
   {'NODEID': 'B', 'X': 30, 'Y': 20}]}}

In [41]:
##test:
DataSource.set_celldata('joints','NODEID,X,Y\nA,10,20\nB,30,20')
vars(ds)

{'celldata': {'joints': 'NODEID,X,Y\nA,10,20\nB,30,20'},
 'dsname': 'frame-1',
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [42]:
ds._file_name('joints')

'data/frame-1.d/joints.csv'

In [43]:
ds._file_name('joints',prefix='lcase1')

'data/frame-1.d/lcase1/joints.csv'

In [None]:
@extend
class DataSource:

    @classmethod
    def read_table(cls,tablename,optional=False,prefix=None,columns=None,extrasok=True):
        """Return the table named `tablename` from the current data source.

        The table is looked up, in order, among:

        1. tables supplied directly via set_table,
        2. CSV text supplied via set_celldata,
        3. the CSV file given by _file_name(tablename, prefix), but only
           if a data set has been selected.

        Parameters:
          tablename - name of the table.
          optional  - if true, a table that cannot be found yields an empty
                      table (with `columns`) instead of an error.
          prefix    - optional sub-directory of the data set directory.
          columns   - list of required column names, or None for no check.
          extrasok  - if true, columns not in `columns` are dropped from
                      the result; if false they are an error.

        Returns a Table, except for a directly supplied table, which is
        returned as it was stored.

        Raises ValueError if the table does not exist and is not optional,
        if required columns are missing, or if extra columns are present
        and `extrasok` is false.
        """
        self = cls.DATASOURCE
        filename = None
        if tablename in self.tables:
            # NOTE(review): handed back exactly as stored, not wrapped in a
            # Table; the column checks below assume it has `.columns`.
            t = self.tables[tablename]
        else:
            source = None       # CSV source: text buffer or file path
            if tablename in self.celldata:
                source = StringIO.StringIO(self.celldata[tablename])
            elif self.dsname is not None:
                # Without a data set name no file path can be formed, so
                # the table is simply treated as not found.
                filename = self._file_name(tablename,prefix=prefix)
                if os.path.exists(filename):
                    # Give pandas the path and let it open *and close* the
                    # file, rather than leaving an open handle behind.
                    source = filename
            if source is None:
                if not optional:
                    raise ValueError("Table '{}' does not exist.".format(tablename))
                d = pd.DataFrame(columns=columns)
            else:
                d = pd.read_csv(source,index_col=None,skipinitialspace=True)
            t = Table(d,dsname=self.dsname,tablename=tablename,filename=filename)

        if columns is None:
            return t
        prov = set(t.columns)
        reqd = set(columns)
        if reqd-prov:
            raise ValueError("Columns missing for table '{}': {}. Required columns are: {}"
                             .format(tablename,list(reqd-prov),columns))
        if prov-reqd:
            if not extrasok:
                raise ValueError("Extra columns for table '{}': {}. Required columns are: '{}'"
                                .format(tablename,list(prov-reqd),columns))
            # NOTE(review): the selection builds a new object; going by the
            # attribute checks earlier in this notebook, `tablename` and
            # `filename` are presumably not carried over - confirm.
            t = t[columns]
        return t

In [None]:
##test:
# NOTE(review): this cell and the test cells after it (read, write,
# signature, .data, DSNAME) appear to exercise an earlier Table API.  The
# Table class defined in this notebook does not define any of those
# members - confirm, then port these tests to DataSource or remove them.
Table.DSNAME = 'frame-6'
t = Table('nodes',columns=['NODEID','X','Y'])

In [None]:
##test:
t.read()

In [None]:
##test:
len(t)

In [None]:
##test:
t.data.loc[:,['X','Y']] /= 3.
t.data

In [None]:
##test:
t.write(precision=7,prefix='out',makedir=True)

In [None]:
##test:
t.signature()

In [None]:
##test:
vars(t)

In [None]:
##test:
t.read()

In [None]:
##test:
vars(t)

In [None]:
@register_cell_magic('Table')
def cell_table(line,cell):
    """Cell magic ``%%Table tablename``: use the cell body as CSV text.

    `line` must consist of exactly one whitespace-free table name; `cell`
    is registered as the CSV text for that table via
    DataSource.set_celldata.

    Raises ValueError if `line` is not a single table name.
    """
    mo = re.match(r'\s*(\S+)\s*$',line)
    if not mo:
        raise ValueError('Usage: %%Table tablename')
    table_name = mo.group(1)
    # The cell text belongs to the data source, not to the Table class
    # (which has no CELLDATA attribute).  Create the single DataSource on
    # first use so the magic also works before one has been set up.
    if DataSource.DATASOURCE is None:
        DataSource()
    DataSource.set_celldata(table_name,cell)

In [None]:
%%Table nodes
NODEID,X,Y,Z
A,0,0,5000
B,0,4000,5000
C,8000,4000,5000
D,8000,0,5000

In [None]:
##test:
Table.DSTYPE

In [None]:
##test:
Table.CELLDATA

In [None]:
##test:
t = Table('nodes',columns=['NODEID','Y','Z'])
t.read()

In [None]:
##test:
# NOTE(review): set_source, set_data, read and .data are not defined on the
# Table class in this notebook.  DataSource offers set_source and
# set_table; presumably this test predates that split - confirm and update.
Table.set_source('xxx','data')
Table.set_data('nodes',t.data)
tt = Table('nodes',columns=['NODEID','Y','Z'])
tt.read()

In [None]:
##test:
tt.file_name

In [None]:
##test:
vars(tt)

In [None]:
@extend
class DataSource:
    
    # NOTE(review): the methods below read self.ds_name, self.table_name,
    # self.data and self.file_name.  DataSource.__init__ sets none of these
    # (it sets `dsname`), so they look carried over from an earlier
    # per-table class - confirm where they are meant to live before use.

    def write(self,ds_name=None,precision=None,index=False,prefix=None,makedir=False):
        """Write self.data as CSV and return the name of the file written.

        The file is 'data/<ds_name>.d/<table_name>.csv', with '<prefix>/'
        inserted before the table name when `prefix` is given.  The name
        is also stored in self.file_name.

        ds_name   - data set name; defaults to self.ds_name.
        precision - if given, floats are written with the '%.<precision>g'
                    format.
        index     - passed on to to_csv.
        prefix    - optional sub-directory of the data set directory.
        makedir   - if true, create the data set directory (and the prefix
                    sub-directory) when missing.
        """
        if ds_name is None:
            ds_name = self.ds_name
        # NOTE(review): the root is hard-coded to 'data/' here rather than
        # taken from self.root - confirm whether that is intended.
        dirname = 'data/' + ds_name + '.d'
        if makedir and not os.path.exists(dirname):
            os.mkdir(dirname)
        if prefix is not None:
            dirname = dirname + '/' + prefix
            if makedir and not os.path.exists(dirname):
                os.mkdir(dirname)
        self.file_name = file_name = dirname + '/' + self.table_name + '.csv'
        float_format = None
        if precision is not None:
            float_format = '%.{:d}g'.format(precision)
        self.data.to_csv(file_name,index=index,float_format=float_format)
        return file_name
        
    def basename(self,file_name=None):
        """Return the final path component of `file_name` (default: self.file_name)."""
        if file_name is None:
            file_name = self.file_name
        return os.path.basename(file_name)
    
    def signature(self):
        """Return (table name, file name, SHA-256 hex digest of the file)."""
        file_name = self.file_name
        # The inner call is the module-level signature() function, not
        # this method.
        return (self.table_name,file_name,signature(file_name))
    
def signature(file_name):
    """Return the SHA-256 hex digest of the contents of file `file_name`.

    The file is read in binary mode, in fixed-size chunks so that large
    files need not fit in memory, and is closed even if reading fails.
    """
    digest = hashlib.sha256()
    with open(file_name,mode='rb') as f:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(65536),b''):
            digest.update(chunk)
    return digest.hexdigest()