## Tables
Module to simplify the handling of input and output tables (as .csv files). For now, this assumes that all files are
archived in a directory 'data/xyz.d', where 'xyz' is the 'data set name'; tables may also be supplied as CSV text via
the `%%Table` cell magic, or directly as dataframes. Eventually, we will have a way of archiving sets of files in .zip files.

In [1]:
from __future__ import print_function

import pandas as pd
import os, os.path
import StringIO
import hashlib
from IPython.core.magic import register_cell_magic
import re

In [2]:
class Table(object):
    """A named table whose contents are held in a pandas DataFrame (``self.data``).

    Where the table is read from is selected class-wide by ``DSTYPE``:

    * ``'dir'``  - a CSV file ``ROOT/<ds_name>.d/[<prefix>/]<table_name>.csv``
    * ``'cell'`` - CSV text stored in ``CELLDATA`` under the table name
    * ``'data'`` - a dataframe stored in ``DATAFRAMES`` under the table name

    Instance attributes: ``ds_name``, ``table_name``, ``prefix``, ``file_name``,
    ``columns``, ``index_col``, ``optional`` and ``data``.
    """

    ROOT = 'data'     # directory that holds the '<dsname>.d' data set directories
    DSNAME = None     # default data set name
    DSTYPE = 'dir'    # someday we will allow 'zip' for zip archives
    #DSTYPE = 'cell'  # for CSV data provided via %%Table cell magic
    #DSTYPE = 'data'  # for dataframe data provided directly
    CELLDATA = {}     # csv text from %%Table magic cells, indexed by table name
    DATAFRAMES = {}   # dataframes directly provided by client, indexed by table name

    @classmethod
    def set_source(cls,dsname,dstype=None):
        """Select the default data set name and source type for all tables.

        If `dstype` is None the type is 'dir', provided the directory
        ``ROOT/<dsname>.d`` exists.  Any previously stored cell text and
        dataframes are discarded.

        Raises AssertionError if the directory cannot be found (when `dstype`
        is None) or if `dstype` is not one of 'dir', 'cell', 'data'.
        """
        if dstype is None:
            dirname = cls.ROOT + '/' + dsname + '.d'
            assert os.path.exists(dirname), \
                "Data set directory '{}' does not exist; specify dstype explicitly".format(dirname)
            dstype = 'dir'
        assert dstype in ('dir','cell','data'), "Invalid DS Type: {}".format(dstype)
        cls.DSNAME = dsname
        cls.DSTYPE = dstype
        # rebind (rather than clear) so each new source starts with fresh stores
        cls.CELLDATA = {}
        cls.DATAFRAMES = {}

    @classmethod
    def set_data(cls,tablename,data):
        """Register dataframe `data` under `tablename`; only valid when DSTYPE is 'data'."""
        assert cls.DSTYPE == 'data', \
            "set_data() requires DSTYPE 'data', not '{}'".format(cls.DSTYPE)
        cls.DATAFRAMES[tablename] = data

    def __init__(self,tablename,dsname=None,columns=None,indexcol=None,optional=False,data=None):
        """Create a table object; nothing is read until read() is called.

        tablename - name of the table (base name of the CSV file)
        dsname    - data set name; defaults to the class-wide DSNAME
        columns   - columns to keep when reading (passed to read_csv as `usecols`)
        indexcol  - column to use as the index (passed to read_csv as `index_col`)
        optional  - if true, read() returns the initial data when the table is absent
        data      - initial contents, anything accepted by pd.DataFrame (default: empty)
        """
        if dsname is None and self.DSNAME is not None:
            dsname = self.DSNAME
        self.ds_name = dsname
        self.table_name = tablename
        self.prefix = None
        self.file_name = None
        self.columns = columns
        self.index_col = indexcol
        self.optional = optional
        self.data = pd.DataFrame([] if data is None else data,columns=columns)

    def _file_name(self,prefix=None):
        """Return the CSV path for this table, remembering `prefix` in self.prefix."""
        self.prefix = prefix
        n = self.table_name
        if prefix:
            n = prefix + '/' + n
        return self.ROOT + '/' + self.ds_name + '.d/' + n + '.csv'

    def read(self,file_name=None,optional=None):
        """Load the table from the current source, store it in self.data and return it.

        file_name - explicit CSV path ('dir' source only); default is the standard path
        optional  - overrides self.optional when not None; an optional table that is
                    absent from the source leaves self.data unchanged and returns it

        Raises ValueError for an invalid DSTYPE, or when a requested column or the
        index column is not present in the CSV data.  A non-optional table missing
        from CELLDATA / DATAFRAMES raises KeyError.
        """
        if optional is None:
            optional = self.optional
        dstype = self.DSTYPE

        if dstype == 'data':
            if optional and self.table_name not in self.DATAFRAMES:
                return self.data
            self.data = self.DATAFRAMES[self.table_name]
            return self.data

        if dstype == 'dir':
            if not file_name:
                file_name = self._file_name()
            self.file_name = file_name
            if optional and not os.path.exists(file_name):
                return self.data
            source = file_name
            stream = open(file_name,'r')
        elif dstype == 'cell':
            if optional and self.table_name not in self.CELLDATA:
                return self.data
            # local import keeps this working on both Python 2 and Python 3
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
            # label used in error messages, as there is no real file in this mode
            source = file_name if file_name else '%%Table ' + self.table_name
            stream = StringIO(self.CELLDATA[self.table_name])
        else:
            raise ValueError("Invalid DS Type: {}".format(self.DSTYPE))

        try:
            self.data = pd.read_csv(stream,usecols=self.columns,index_col=self.index_col)
        except ValueError as err:
            # translate the messages some pandas versions emit into friendlier ones;
            # anything unrecognised is re-raised unchanged
            msg = str(err)
            if msg.endswith('is not in list'):
                parts = msg.split("'")
                c = parts[1] if len(parts) > 1 else msg[:-len(' is not in list')]
                raise ValueError("'{}' is not in the set of columns in file '{}'".format(c,source))
            if msg.startswith('Index') and msg.endswith('invalid'):
                raise ValueError("Index column '{}' is not in the set of columns in file '{}'".format(self.index_col,source))
            raise
        finally:
            # close the stream whether or not parsing succeeded
            stream.close()
        return self.data

    def write(self,ds_name=None,precision=None,index=False,prefix=None,makedir=False):
        """Write self.data as ``ROOT/<ds_name>.d/[<prefix>/]<table_name>.csv``.

        ds_name   - data set to write into; defaults to self.ds_name
        precision - if given, floats are formatted with '%.<precision>g'
        index     - whether to write the dataframe index
        prefix    - optional sub-directory of the data set directory
        makedir   - create missing directories instead of failing

        The path is stored in self.file_name and returned.
        """
        if ds_name is None:
            ds_name = self.ds_name
        dirname = self.ROOT + '/' + ds_name + '.d'
        if prefix is not None:
            dirname = dirname + '/' + prefix
        if makedir and not os.path.exists(dirname):
            # makedirs also creates any missing parent directories
            os.makedirs(dirname)
        self.file_name = file_name = dirname + '/' + self.table_name + '.csv'
        float_format = None
        if precision is not None:
            float_format = '%.{:d}g'.format(precision)
        self.data.to_csv(file_name,index=index,float_format=float_format)
        return file_name

    def basename(self,file_name=None):
        """Return the final path component of `file_name` (default: self.file_name)."""
        if file_name is None:
            file_name = self.file_name
        return os.path.basename(file_name)

    def signature(self):
        """Return (table name, file name, digest) for the file last read or written.

        The digest comes from the module-level signature() function.
        """
        file_name = self.file_name
        return (self.table_name,file_name,signature(file_name))

    def __len__(self):
        """Number of rows currently held in self.data."""
        return len(self.data)
    
def signature(file_name):
    """Return the SHA-256 hex digest of the contents of the file `file_name`.

    The file is opened in binary mode and hashed in fixed-size chunks, so it
    never has to fit in memory at once; the `with` block guarantees the file
    is closed even if reading fails.
    """
    digest = hashlib.sha256()
    with open(file_name,mode='rb') as f:
        # iter() with a sentinel stops at the empty read that marks end of file
        for chunk in iter(lambda: f.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()

In [3]:
##test:
# Make 'frame-6' the class-wide default data set, then create a table object
# for 'nodes' limited to three columns (the constructor itself reads no file).
Table.DSNAME = 'frame-6'
t = Table('nodes',columns=['NODEID','X','Y'])

In [4]:
##test:
t.read()

Unnamed: 0,NODEID,X,Y
0,A,0,0
1,B,0,4000
2,C,8000,4000
3,D,8000,0


In [5]:
##test:
len(t)

4

In [6]:
##test:
# Divide the X and Y columns by 3 in place, then display the modified frame.
t.data.loc[:,['X','Y']] /= 3.
t.data

Unnamed: 0,NODEID,X,Y
0,A,0.0,0.0
1,B,0.0,1333.333333
2,C,2666.666667,1333.333333
3,D,2666.666667,0.0


In [7]:
##test:
# Write the table into the 'out' sub-directory of the data set directory, formatting
# floats as '%.7g' and creating missing directories; the call returns the path written.
t.write(precision=7,prefix='out',makedir=True)

'data/frame-6.d/out/nodes.csv'

In [8]:
##test:
t.signature()

('nodes',
 'data/frame-6.d/out/nodes.csv',
 '71080f20c6f926bb9ef71cfe01103ed5e3ba618bb305f713e26194198220ecce')

In [9]:
##test:
vars(t)

{'columns': ['NODEID', 'X', 'Y'], 'data':   NODEID            X            Y
 0      A     0.000000     0.000000
 1      B     0.000000  1333.333333
 2      C  2666.666667  1333.333333
 3      D  2666.666667     0.000000, 'ds_name': 'frame-6', 'file_name': 'data/frame-6.d/out/nodes.csv', 'index_col': None, 'optional': False, 'prefix': None, 'table_name': 'nodes'}

In [10]:
##test:
t.read()

Unnamed: 0,NODEID,X,Y
0,A,0,0
1,B,0,4000
2,C,8000,4000
3,D,8000,0


In [11]:
##test:
vars(t)

{'columns': ['NODEID', 'X', 'Y'], 'data':   NODEID     X     Y
 0      A     0     0
 1      B     0  4000
 2      C  8000  4000
 3      D  8000     0, 'ds_name': 'frame-6', 'file_name': 'data/frame-6.d/nodes.csv', 'index_col': None, 'optional': False, 'prefix': None, 'table_name': 'nodes'}

In [12]:
@register_cell_magic('Table')
def cell_table(line,cell):
    """Cell magic ``%%Table tablename``: store the cell body as CSV text for a table.

    The magic line must consist of a single whitespace-free token, the table
    name, optionally surrounded by whitespace; otherwise ValueError is raised.
    As a side effect the Table class is switched to the 'cell' source type,
    and the cell text is stored in Table.CELLDATA under the table name.
    """
    matched = re.match(r'\s*(\S+)\s*$',line)
    if matched is None:
        raise ValueError('Usage: %%Table tablename')
    # Table is only mutated, never rebound, so no 'global' declaration is needed
    Table.DSTYPE = 'cell'
    Table.CELLDATA[matched.group(1)] = cell

In [13]:
%%Table nodes
NODEID,X,Y,Z
A,0,0,5000
B,0,4000,5000
C,8000,4000,5000
D,8000,0,5000

In [14]:
##test:
Table.DSTYPE

'cell'

In [15]:
##test:
Table.CELLDATA

{u'nodes': u'NODEID,X,Y,Z\nA,0,0,5000\nB,0,4000,5000\nC,8000,4000,5000\nD,8000,0,5000'}

In [16]:
##test:
# DSTYPE is now 'cell', so read() parses the CSV text stored by the %%Table cell
# rather than a file; only the three requested columns are kept.
t = Table('nodes',columns=['NODEID','Y','Z'])
t.read()

Unnamed: 0,NODEID,Y,Z
0,A,0,5000
1,B,4000,5000
2,C,4000,5000
3,D,0,5000


In [17]:
##test:
# Switch to the 'data' source type (this also empties CELLDATA and DATAFRAMES),
# register t's dataframe under 'nodes', and check that a new Table reads it back.
Table.set_source('xxx','data')
Table.set_data('nodes',t.data)
tt = Table('nodes',columns=['NODEID','Y','Z'])
tt.read()

Unnamed: 0,NODEID,Y,Z
0,A,0,5000
1,B,4000,5000
2,C,4000,5000
3,D,0,5000


In [19]:
##test:
tt.file_name

In [20]:
##test:
vars(tt)

{'columns': ['NODEID', 'Y', 'Z'], 'data':   NODEID     Y     Z
 0      A     0  5000
 1      B  4000  5000
 2      C  4000  5000
 3      D     0  5000, 'ds_name': 'xxx', 'file_name': None, 'index_col': None, 'optional': False, 'prefix': None, 'table_name': 'nodes'}