In [1]:
%matplotlib inline

In [48]:
# imports
import pandas as pd
import numpy as np

import re
import json
import yaml

import os

In [49]:
# hierarchical apply function on mixed dict/list struct
def map_nested_struct_modify(ob, func):
    if isinstance(ob,list):
        for v in ob:
            if isinstance(v,list) or isinstance(v,dict):
                map_nested_struct_modify(v, func)
            else:
                v = func(v)    
    elif isinstance(ob,dict):
        for k, v in ob.items():
            if isinstance(ob[k],list) or isinstance(ob[k],dict):
                map_nested_struct_modify(v, func)
            else:
                ob[k] = func(v)
    else:
        ob = func(ob)

In [50]:
if os.environ.get('JUPYTER_GATEWAY'):
    path='/srv/notebooks/'
else:
    path=''

In [51]:
def response_dict(d, content_type):
    if content_type == 'application/json':
        print(json.dumps(d))
    elif content_type == 'application/xml' or content_type == 'text/xml':
        print(yaml.dump(d))

def response_meta(status=200, content_type='application/json'):
    print(json.dumps({
        "headers" : { "Content-Type" : content_type},
        "status" : status
    }))

In [52]:
df = pd.read_csv(path+'pokemon.csv', sep=',', dtype='str', engine='c', true_values=['True', 'true'], false_values=['False','false'])

In [53]:
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [84]:
def uniquify(seq): 
    checked = dict()
    uniq = []
    for e in seq:
        if e not in checked.keys():
            if e:
                checked[e]=0
                uniq.append(e)
            else:
                checked[e]=1
                uniq.append('_1')
        else:
            checked[e] += 1
            uniq.append('{}_{}'.format(e,checked[e]))
    return uniq

def to_alphanum(s):
    return re.sub(r'\W+', '', s).lower()

def prep_names(seq):
    names = [ to_alphanum(x)  for x in seq]
    return uniquify(names)

def dtype_to_string(x):
    return {
        'b': 'bool',
        'i': 'long',
        'u': 'long',
        'f': 'double',
        'c': 'complex',
        'O': 'object',
        'S': 'char',
        'a': 'char',
        'U': 'string',
        'V': 'raw'
    }.get(x[1], 'unknown')

def is_number(s):
    try:
        complex(s) # for int, long, float and complex
    except ValueError:
        return False
    return True

def is_float(s):
    try:
        float(s) # for int, long, float
    except ValueError:
        return False
    return True

def is_int(s):
    try:
        return float(s).is_integer() # for int, long, float
    except ValueError:
        return False

def is_infinite(s):
    try:
        return np.isinf(float(s)) # for int, long, float
    except ValueError:
        return False

def is_zero(s):
    return not s

def is_string(s):
    return isinstance(s, str)

def vector_purity(x):
    for i in list(range(len(x))):
        # hard compressor
        x[i] = max(0, min(1, x[i]))
    return max(x)*(1 - (sum(x)-1)/(len(x)-1))

def get_typestats(sr):
    infinite  = sum(sr.apply(is_infinite))
    numeric = sum(sr.apply(is_number))
    integer = sum(sr.apply(is_int))
    nan     = sum(sr.isnull())
    zero    = sum(sr.apply(is_zero))
    string  = sum(sr.apply(is_string))
    unique  = len(sr.unique())
    
    count   = len(sr)
    valid   = count-nan
    quality = valid/count
    
    t = 'string' 
    if (integer/valid)>0.5 :
        t = 'integer'
    elif (numeric/valid)>0.5 :
        t = 'numeric'
    
    # log vs linear?
    cat = False
    if (unique/valid)<0.1 :
        cat = True
    
    numeric = numeric - integer - nan
    string = string - numeric - integer
    
    d = { 'infinite': infinite,
          'numeric' : numeric,
          'integer' : integer,
          'nan'     : nan,
          'zero'    : zero,
          'string'  : string,
          'unique'  : unique,
          'valid'   : valid,
          'quality' : quality,
          'descrete': cat,
          'tcoerce' : t,
          'tpurity' : vector_purity([integer/valid, numeric/valid, string/valid])}
    
    return d

def numpy2py(ob):
    return np.asscalar(ob) if isinstance(ob, np.generic) else ob

def format_float(ob):
    return float(format(ob,'.2f')) if isinstance(ob, float) else ob

def numpy_tojson(ob):
    map_nested_struct_modify(ob, numpy2py)
    map_nested_struct_modify(ob, format_float)
    return ob

In [85]:
#metadata: variables
variables = [{'alias':x[0],'name': x[1]} for x in zip(prep_names(df.columns),df.columns)]
variables

#rename columns
df.columns = prep_names(df.columns)

In [86]:
df.head()

Unnamed: 0,_1,name,type1,type2,total,hp,attack,defense,spatk,spdef,speed,generation,legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [87]:
sr = pd.Series([1,0.0, '', '', np.inf, np.nan, 2.9, '0', '111', 'kkk'])

In [88]:
get_typestats(sr)

{'descrete': False,
 'infinite': 1,
 'integer': 4,
 'nan': 1,
 'numeric': 2,
 'quality': 0.90000000000000002,
 'string': -1,
 'tcoerce': 'numeric',
 'tpurity': 0.51851851851851849,
 'unique': 9,
 'valid': 9,
 'zero': 3}

In [89]:
pd.DataFrame(df.apply(lambda x: pd.Series(get_typestats(x)), axis=0))


Unnamed: 0,_1,name,type1,type2,total,hp,attack,defense,spatk,spdef,speed,generation,legendary
descrete,False,False,True,True,False,False,False,False,False,False,False,True,True
infinite,0,0,0,0,0,0,0,0,0,0,0,0,0
integer,800,0,0,0,800,800,800,800,800,800,800,800,0
,0,0,0,386,0,0,0,0,0,0,0,0,0
numeric,0,0,0,0,0,0,0,0,0,0,0,0,0
quality,1,1,1,0.5175,1,1,1,1,1,1,1,1,1
string,0,800,800,414,0,0,0,0,0,0,0,0,800
tcoerce,integer,string,string,numeric,integer,integer,integer,integer,integer,integer,integer,integer,string
tpurity,1,1,1,1,1,1,1,1,1,1,1,1,1
unique,721,800,18,19,200,94,111,103,105,92,108,6,2


In [90]:
#df = pd.read_csv(server+'pokemon.csv', sep=',', engine='python', true_values=['True', 'true'], false_values=['False','false'])

In [92]:
#extract types
for v in variables:
    t = dtype_to_string(df[v['alias']].dtype.str)
    v['type'] = t
    v['sample'] = [str(x) for x in df[v['alias']].sample(n=10).tolist()]
    v.update(get_typestats(df[v['alias']]))

In [93]:
#cell data
shape = df.shape

# help functions
def rows_na_any(df):
    na_df = pd.isnull(df)
    d = na_df.apply(np.any, axis=0)
    return len(d[d==True])

def col_na_any(df):
    na_df = pd.isnull(df)
    d = na_df.apply(np.any, axis=1)
    return len(d[d==True])

dataset_dict = {
    'dims': len(shape),
    'rows': shape[0],
    'cols': shape[1],
    'na': {
        'cols': rows_na_any(df),
        'rows': col_na_any(df)
    },
    'variables': variables
}

# discard cell data
del shape

In [94]:
# GET /datasets/:id
response_dict(numpy_tojson(dataset_dict), 'application/json')

{"variables": [{"integer": 800, "string": 0, "tpurity": 1.0, "tcoerce": "integer", "sample": ["524", "465", "482", "277", "303", "719", "491", "688", "388", "118"], "alias": "_1", "numeric": 0, "quality": 1.0, "unique": 721, "valid": 800, "type": "object", "infinite": 0, "nan": 0, "name": "_1", "zero": 0, "descrete": false}, {"integer": 0, "string": 800, "tpurity": 1.0, "tcoerce": "string", "sample": ["VenusaurMega Venusaur", "Leavanny", "Sentret", "Croagunk", "Cherrim", "Weepinbell", "Milotic", "Bronzong", "Serperior", "Tympole"], "alias": "name", "numeric": 0, "quality": 1.0, "unique": 800, "valid": 800, "type": "object", "infinite": 0, "nan": 0, "name": "name", "zero": 0, "descrete": false}, {"integer": 0, "string": 800, "tpurity": 1.0, "tcoerce": "string", "sample": ["Fairy", "Psychic", "Psychic", "Ground", "Fire", "Psychic", "Ghost", "Water", "Electric", "Dragon"], "alias": "type1", "numeric": 0, "quality": 1.0, "unique": 18, "valid": 800, "type": "object", "infinite": 0, "nan": 0

In [64]:
# ResponseInfo GET /datasets/:id
response_meta()

{"status": 200, "headers": {"Content-Type": "application/json"}}
