In [207]:
%matplotlib inline

In [208]:
# imports
import pandas as pd
import numpy as np

import re
import json
import yaml

import os

In [209]:
# hierarchical apply function on mixed dict/list struct
def map_nested_struct_modify(ob, func):
    """Apply ``func`` in place to every leaf of a nested list/dict structure.

    Containers (``list`` and ``dict``) are traversed recursively; every other
    value is treated as a leaf and replaced by ``func(value)``.

    Parameters:
        ob: a list or dict, possibly nested. It is modified in place.
            Any other object is left untouched, because a bare value cannot
            be replaced in place from inside this function.
        func: callable taking a leaf value and returning its replacement.

    Returns:
        None. The result is the mutated ``ob``.
    """
    if isinstance(ob, list):
        # Assign through the index: rebinding the loop variable would leave
        # the list itself unchanged.
        for i, v in enumerate(ob):
            if isinstance(v, (list, dict)):
                map_nested_struct_modify(v, func)
            else:
                ob[i] = func(v)
    elif isinstance(ob, dict):
        # Only values are replaced, never keys, so the dict size is stable
        # while iterating.
        for k, v in ob.items():
            if isinstance(v, (list, dict)):
                map_nested_struct_modify(v, func)
            else:
                ob[k] = func(v)

In [224]:
# Base directory: '/srv/notebooks' when the JUPYTER_GATEWAY environment
# variable is set to a non-empty value, otherwise the current directory.
path = '/srv/notebooks' if os.environ.get('JUPYTER_GATEWAY') else '.'

# Directory the dataset files are read from.
datapath = "{}/datasets".format(path)

In [225]:
# Default request: a JSON string with empty 'path' and 'args' objects.
# The request cells below parse REQUEST with json.loads.
REQUEST = json.dumps(dict(path={}, args={}))

In [226]:
def response_dict(d, content_type='application/json'):
    """Print ``d`` serialized according to ``content_type``.

    * 'application/json' -> ``json.dumps(d)``
    * 'application/xml' or 'text/xml' -> ``yaml.dump(d)``
      (note: the emitted text is YAML, not XML)
    * any other content type -> nothing is printed
    """
    if content_type == 'application/json':
        serialized = json.dumps(d)
    elif content_type in ('application/xml', 'text/xml'):
        serialized = yaml.dump(d)
    else:
        return
    print(serialized)

def response_meta(status=200, content_type='application/json'):
    """Print the response metadata as a JSON object.

    The object has a "headers" entry carrying the Content-Type and a
    "status" entry carrying the given status code.
    """
    meta = {
        "headers": {"Content-Type": content_type},
        "status": status,
    }
    print(json.dumps(meta))

In [227]:
def uniquify(seq):
    """Return a list of unique names derived from ``seq``, preserving order.

    Intended for strings (column names):

    * the first occurrence of a non-empty name is kept as is;
    * repeated names get a numeric suffix: ``name_1``, ``name_2``, ...;
    * empty names become ``_1``, ``_2``, ...

    A suffixed name that is already taken (for example the input
    ``['a', 'a', 'a_1']``) is skipped and the next free suffix is used, so
    the returned names never repeat.
    """
    last_suffix = {}   # name -> highest suffix number handed out so far
    used = set()       # every name already emitted
    uniq = []
    for e in seq:
        if e and e not in used:
            name = e
        else:
            n = last_suffix.get(e, 0)
            # Advance until the candidate does not clash with anything
            # emitted before, including names that came from the input.
            while True:
                n += 1
                name = '{}_{}'.format(e, n)
                if name not in used:
                    break
            last_suffix[e] = n
        used.add(name)
        uniq.append(name)
    return uniq

def to_alphanum(s):
    """Drop every character except ASCII letters, digits, '.' and '_', then lowercase."""
    kept = re.sub(r'[^0-9A-Za-z._]+', '', s)
    return kept.lower()

def prep_names(seq):
    """Normalize each name with to_alphanum, then de-duplicate the result with uniquify."""
    cleaned = list(map(to_alphanum, seq))
    return uniquify(cleaned)

def dtype_to_string(x):
    """Map a dtype string to a readable type name.

    Only the second character of ``x`` (the kind code, e.g. the 'i' in
    '<i8') is inspected. get_typestats passes ``sr.dtype.str`` here.
    Kind codes that are not in the table yield 'unknown'.
    """
    kind_names = dict([
        ('b', 'bool'),
        ('i', 'long'),
        ('u', 'long'),
        ('f', 'double'),
        ('c', 'complex'),
        ('O', 'object'),
        ('S', 'char'),
        ('a', 'char'),
        ('U', 'string'),
        ('V', 'raw'),
    ])
    kind = x[1]
    if kind in kind_names:
        return kind_names[kind]
    return 'unknown'

def is_number(s):
    """Return True if ``s`` can be converted with ``complex()``.

    This covers ints, floats, complex values and their string forms.
    Values that ``complex()`` rejects, either by value (ValueError, e.g.
    'kkk' or '') or by type (TypeError, e.g. None or a list), yield False.
    """
    try:
        complex(s) # for int, long, float and complex
    except (TypeError, ValueError):
        return False
    return True

def is_float(s):
    """Return True if ``s`` can be converted with ``float()``.

    Values that ``float()`` rejects, either by value (ValueError) or by
    type (TypeError, e.g. None or a list), yield False.
    """
    try:
        float(s) # for int, long, float
    except (TypeError, ValueError):
        return False
    return True

def is_int(s):
    """Return True if ``float(s)`` is a whole number.

    Values that ``float()`` rejects, either by value (ValueError) or by
    type (TypeError, e.g. None or a list), yield False.
    """
    try:
        return float(s).is_integer() # for int, long, float
    except (TypeError, ValueError):
        return False

def is_infinite(s):
    """Return a truthy value if ``float(s)`` is positive or negative infinity.

    Values that ``float()`` rejects, either by value (ValueError) or by
    type (TypeError, e.g. None or a list), yield False.
    """
    try:
        return np.isinf(float(s)) # for int, long, float
    except (TypeError, ValueError):
        return False

def is_zero(s):
    """Return True when ``s`` is falsy (e.g. 0, 0.0, '' or None).

    Note that the string '0' is truthy and therefore yields False.
    """
    return False if s else True

def vector_purity(x):
    """Compute a purity score for a vector of fractions.

    Every component is first clamped into [0, 1] (hard compressor). The
    score is ``max(x) * (1 - (sum(x) - 1) / (len(x) - 1))`` on the clamped
    values.

    The input is not modified. For a single-element vector the clamped
    value itself is returned, since the general formula would divide by
    zero.

    Raises:
        ValueError: if ``x`` is empty.
    """
    # hard compressor, applied to a copy so the caller's list stays intact
    clamped = [max(0, min(1, v)) for v in x]
    if not clamped:
        raise ValueError("vector_purity() requires at least one value")

    peak = max(clamped)
    if len(clamped) == 1:
        return peak
    return peak * (1 - (sum(clamped) - 1) / (len(clamped) - 1))

def get_typestats(sr):
    """Collect type statistics for a pandas Series.

    Returns a dict with the keys:
        infinite : number of values for which is_infinite is true
        numeric  : values accepted by is_number, minus the integer and
                   null counts
        integer  : number of values for which is_int is true
        nan      : number of null values (``sr.isnull()``)
        zero     : number of values for which is_zero is true
        string   : ``len(sr)`` minus the numeric and integer counts
        unique   : number of distinct values (``sr.unique()``)
        valid    : number of non-null values
        quality  : valid / count
        descrete : True when unique / valid is below 0.1
        tcoerce  : 'integer', 'numeric' or 'string', whichever share of
                   the valid values first exceeds 0.5 (default 'string')
        tpurity  : vector_purity of the integer/numeric/string shares
        type     : dtype_to_string of the series dtype

    For an empty or all-null series every ratio is taken as 0 instead of
    dividing by zero.
    """
    def ratio(part, whole):
        # A zero denominator occurs for empty or all-null series.
        return part / whole if whole else 0.0

    infinite= sum(sr.apply(is_infinite))
    numeric = sum(sr.apply(is_number))
    integer = sum(sr.apply(is_int))
    nan     = sum(sr.isnull())
    zero    = sum(sr.apply(is_zero))
    unique  = len(sr.unique())

    count   = len(sr)
    valid   = count-nan
    quality = ratio(valid, count)

    # The coercion target is decided on the raw counts, before the numeric
    # count is reduced below.
    t = 'string'
    if ratio(integer, valid) > 0.5:
        t = 'integer'
    elif ratio(numeric, valid) > 0.5:
        t = 'numeric'

    # log vs linear?
    # Few distinct values relative to the valid ones -> flagged as discrete.
    cat = bool(valid and (unique/valid) < 0.1)

    numeric = numeric - integer - nan
    string = count - numeric - integer

    d = { 'infinite': infinite,
          'numeric' : numeric,
          'integer' : integer,
          'nan'     : nan,
          'zero'    : zero,
          'string'  : string,
          'unique'  : unique,
          'valid'   : valid,
          'quality' : quality,
          'descrete': cat,
          'tcoerce' : t,
          'tpurity' : vector_purity([ratio(integer, valid),
                                     ratio(numeric, valid),
                                     ratio(string, valid)]),
          'type'    : dtype_to_string(sr.dtype.str)
    }

    return d

def numpy2py(ob):
    """Convert a NumPy scalar to the equivalent built-in Python value.

    Anything that is not an ``np.generic`` instance is returned unchanged.
    Uses ``ob.item()`` because ``np.asscalar`` has been removed from NumPy.
    """
    return ob.item() if isinstance(ob, np.generic) else ob

def format_float(ob):
    """Round floats to two decimals via '.2f' formatting; return other values unchanged."""
    if not isinstance(ob, float):
        return ob
    return float(format(ob, '.2f'))

def numpy_tojson(ob):
    """Prepare a nested structure for JSON output, in place.

    Runs numpy2py and then format_float over ``ob`` through
    map_nested_struct_modify, and returns the same ``ob`` object.
    """
    for convert in (numpy2py, format_float):
        map_nested_struct_modify(ob, convert)
    return ob

In [228]:
# Mixed sample values: numbers, empty strings, inf, NaN, numeric strings and text.
sr = pd.Series([1,0.0, '', '', np.inf, np.nan, 2.9, '0', '111', 'kkk'])

In [229]:
# Display the type statistics computed for the series `sr`.
get_typestats(sr)

{'descrete': False,
 'infinite': 1,
 'integer': 4,
 'nan': 1,
 'numeric': 2,
 'quality': 0.90000000000000002,
 'string': 4,
 'tcoerce': 'numeric',
 'tpurity': 0.41975308641975306,
 'type': 'object',
 'unique': 9,
 'valid': 9,
 'zero': 3}

In [230]:
# helper functions
def rows_na_any(df):
    """Count the columns of ``df`` that contain at least one null value.

    Note: despite the name, the reduction runs along axis=0, so there is
    one flag per column.
    """
    has_null = df.isnull().any(axis=0)
    return int(has_null.sum())

def col_na_any(df):
    """Count the rows of ``df`` that contain at least one null value.

    Note: despite the name, the reduction runs along axis=1, so there is
    one flag per row.
    """
    has_null = df.isnull().any(axis=1)
    return int(has_null.sum())

In [243]:
# Module-level state shared by the request cells: the currently loaded
# DataFrame and its metadata (dataset id, file name and the mapping from
# column alias to original column name).
df = pd.DataFrame()
df_meta = {
    'id'   : '',
    'name' : '',
    'cols' : {}
} 

def load_dataset(id):
    """Load the dataset registered under ``id`` into the global ``df``.

    On success ``df`` holds the CSV contents with its columns renamed to
    the aliases produced by prep_names, and ``df_meta`` records the id,
    the file name and the alias -> original column name mapping. For an
    unknown id, ``df`` is reset to an empty DataFrame and ``df_meta`` is
    cleared.

    If the requested file is the one already recorded in ``df_meta``, the
    file is not read again.

    Parameters:
        id: dataset key as a string ('0' .. '3').

    Returns:
        True if a dataset is loaded for ``id`` (freshly or from the
        previous call), False if ``id`` is unknown.
    """
    # Only df is rebound here; df_meta is mutated in place.
    global df

    files = {
        '0' : 'titanic.csv',
        '1' : 'iris.csv',
        '2' : 'pokemon.csv',
        '3' : 'boston.csv'
    }

    filename = files.get(id, None)

    # Already in the requested state: skip the reload, but still report
    # whether a dataset is available so callers do not treat a cache hit
    # as a failure.
    if filename == df_meta['name']:
        df_meta['id'] = id
        return filename is not None

    if filename is None:
        df = pd.DataFrame()
        df_meta['id'] = id
        df_meta['name'] = filename
        df_meta['cols'] = {}
        return False

    df = pd.read_csv(
        "{}/{}".format(datapath,filename), 
        sep=None, 
        engine='python', 
        true_values=['True', 'true'], 
        false_values=['False','false']
    )

    # Compute the aliases once and use them for both the metadata and the
    # renamed DataFrame columns.
    aliases = prep_names(df.columns)

    #dataset id
    df_meta['id'] = id
    df_meta['name'] = filename
    df_meta['cols'] = dict(zip(aliases, df.columns))

    #rename df columns
    df.columns = aliases

    return True
    

    

In [244]:
# GET /datasets/:id

# Outside the gateway there is no incoming request, so fake one for dataset '1'.
if not os.environ.get('JUPYTER_GATEWAY'):
    REQUEST = json.dumps({'path': {'id':'1'}})

request = json.loads(REQUEST)
dataset_id = request['path'].get('id')

success = load_dataset(dataset_id)

if not success:
    response_dict('', 'application/json')
else:
    # One entry per column: the alias and the original column name.
    dv = [
        {
            'alias' : alias,
            'name'  : name
        }
        for alias, name in df_meta['cols'].items()
    ]

    response_dict(dv, 'application/json')


[{"name": "Species", "alias": "species"}, {"name": "Unnamed: 0", "alias": "unnamed0"}, {"name": "Petal.Length", "alias": "petal.length"}, {"name": "Petal.Width", "alias": "petal.width"}, {"name": "Sepal.Length", "alias": "sepal.length"}, {"name": "Sepal.Width", "alias": "sepal.width"}]


In [245]:
# ResponseInfo GET /datasets/:id

# 200 when df_meta records a dataset file name, otherwise 404.
status = 200 if df_meta['name'] else 404
response_meta(status)

{"status": 200, "headers": {"Content-Type": "application/json"}}


In [246]:
# Display the metadata currently held in df_meta.
df_meta

{'cols': {'petal.length': 'Petal.Length',
  'petal.width': 'Petal.Width',
  'sepal.length': 'Sepal.Length',
  'sepal.width': 'Sepal.Width',
  'species': 'Species',
  'unnamed0': 'Unnamed: 0'},
 'id': '1',
 'name': 'iris.csv'}

In [254]:
# GET /datasets/:id/stats

# Outside the gateway there is no incoming request, so fake one for dataset '0'.
if not os.environ.get('JUPYTER_GATEWAY'):
    REQUEST = json.dumps({'path': {'id':'0'}})

request = json.loads(REQUEST)
dataset_id = request['path'].get('id')

success = load_dataset(dataset_id)

if not success:
    response_dict('', 'application/json')
else:

    #cell data
    shape = df.shape

    dv = []

    #extract types
    for alias,name in df_meta['cols'].items():
        sr = df[alias]
        # Sampling without replacement fails when n exceeds the number of
        # rows, so cap the sample size for datasets shorter than 10 rows.
        sample_size = min(10, len(sr))
        dv.append(
            {
                'alias' : alias,
                'name'  : name,
                'type'  : get_typestats(sr),
                'sample': [str(x) for x in sr.sample(n=sample_size).tolist()]
            }
        )
    
    d = {
        'name': df_meta['name'],
        'id': df_meta['id'],
        'dims': len(shape),
        'rows': shape[0],
        'cols': shape[1],
        # rows_na_any counts columns and col_na_any counts rows (their
        # names are swapped relative to what they compute), hence the
        # crossed assignment.
        'na': {
            'cols': rows_na_any(df),
            'rows': col_na_any(df)
        },
        'variables': dv
    }

    #output
    response_dict(numpy_tojson(d), 'application/json')

{"name": "titanic.csv", "cols": 6, "id": "0", "rows": 32, "na": {"cols": 0, "rows": 0}, "variables": [{"name": "Class", "type": {"zero": 0, "infinite": 0, "string": 32, "type": "object", "numeric": 0, "tcoerce": "string", "unique": 4, "valid": 32, "quality": 1.0, "descrete": false, "tpurity": 1.0, "nan": 0, "integer": 0}, "sample": ["2nd", "3rd", "3rd", "2nd", "Crew", "1st", "3rd", "2nd", "1st", "3rd"], "alias": "class"}, {"name": "Survived", "type": {"zero": 0, "infinite": 0, "string": 32, "type": "object", "numeric": 0, "tcoerce": "string", "unique": 2, "valid": 32, "quality": 1.0, "descrete": true, "tpurity": 1.0, "nan": 0, "integer": 0}, "sample": ["Yes", "No", "Yes", "Yes", "No", "Yes", "Yes", "No", "No", "No"], "alias": "survived"}, {"name": "Freq", "type": {"zero": 8, "infinite": 0, "string": 0, "type": "long", "numeric": 0, "tcoerce": "integer", "unique": 22, "valid": 32, "quality": 1.0, "descrete": false, "tpurity": 1.0, "nan": 0, "integer": 32}, "sample": ["14", "192", "0", "

In [255]:
# ResponseInfo GET /datasets/:id/stats

# 404 when no dataset file name is recorded in df_meta, otherwise 200.
# The status has to be passed explicitly; response_meta() defaults to 200.
status = 404 if not df_meta['name'] else 200
response_meta(status)

{"status": 200, "headers": {"Content-Type": "application/json"}}
