In [41]:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import Q
from itertools import islice
import pandas as pd
import numpy as np
from misc import utils

from collections import defaultdict


In [42]:
# Shared Elasticsearch client, constructed with no arguments.
es = Elasticsearch()


def s():
    """Return a new ``Search`` over the 'samples_dev' index, bound to the shared client ``es``."""
    return Search(index='samples_dev', using=es)

In [43]:
def itersearch(search: Search, page_size: int=1000):
    """Lazily yield every hit matched by ``search``.

    Hits are streamed with ``Search.scan()`` (the scroll-based helper)
    instead of ``from``/``size`` slices. Slicing an unsorted search does not
    guarantee stable pages, and Elasticsearch rejects requests whose
    ``from + size`` exceeds ``index.max_result_window`` (10,000 by default).

    Parameters
    ----------
    search : Search
        The search to run. NOTE(review): per the elasticsearch-dsl docs,
        results returned by ``scan()`` are not sorted.
    page_size : int
        Batch size forwarded to the scan helper as ``size``; must be >= 1.

    Yields
    ------
    One hit at a time, in whatever order the scan helper returns them.

    Raises
    ------
    ValueError
        If ``page_size`` is smaller than 1 (raised on first iteration,
        because this is a generator).
    """
    if page_size < 1:
        raise ValueError('page_size must be a positive integer, got %r' % (page_size,))
    # The total is still printed so the notebook keeps showing how many hits
    # are about to be streamed.
    count = search.count()
    print(count)
    for res in search.params(size=page_size).scan():
        yield res

In [None]:
# Samples whose raw characteristics text matches 'age'; `_source` is limited
# to the three fields used below.
search = s().query(Q('match', characteristics_raw='age'))
search = search.extra(_source=['accession', 'characteristics', 'characteristics_raw'])
samples = itersearch(search)



In [None]:
# Count how often each characteristics key occurs across the samples that
# carry a 'characteristics' field.
keys = utils.flatten(
    sample.characteristics.to_dict().keys()
    for sample in samples
    if 'characteristics' in sample
)
counts = defaultdict(int)
for key in keys:
    counts[key] = counts[key] + 1

In [None]:
# Turn the key -> count mapping into a one-column DataFrame indexed by key.
index, data = zip(*counts.items())
counts_s = pd.Series(data=data, index=index)
counts_df = pd.DataFrame({'counts': counts_s})

In [None]:
%store counts_df

In [1]:
%store -r counts_df

In [2]:
import qgrid

In [6]:
# NOTE(review): presumably installs qgrid's notebook extension, replacing any
# existing copy — confirm against the qgrid version in use.
qgrid.nbinstall(overwrite=True)
# Set qgrid's display defaults (remote_js=True, precision=4).
qgrid.set_defaults(remote_js=True, precision=4)

In [13]:
import re


def tokenize(s):
    """Split ``s`` on runs of non-word characters and return the pieces.

    A separator at the start or end of ``s`` produces an empty-string token,
    e.g. ``'age(yrs)'`` gives ``['age', 'yrs', '']``.
    """
    non_word_run = r'[^\w]+'
    tokens = re.split(non_word_run, s)
    return tokens


tokenize('age(yrs)')

['age', 'yrs', '']

In [27]:
# Characteristic keys treated as age fields by the cells below
# (order is preserved when the list is used for label selection).
age_fields = [
    'age',
    'age (years)',
    'age (yrs)',
    'age at diagnosis',
    'age (y)',
    'age in years',
    'age (yrs) at rrp',
    'patient age',
    'age at surgery',
    'sample age',
    'age(years)',
    'subject age',
]

In [29]:
# Characteristic keys that contain the token 'age' and occur more than 300
# times, restricted to the labels in `age_fields`, most frequent first.
# NOTE(review): on recent pandas `.loc[age_fields]` raises KeyError if any
# listed label did not survive the two filters above — confirm against the data.
(
    counts_df
    [counts_df.index.map(lambda key: any('age' == token.lower() for token in tokenize(key)))]
    .query('counts > 300')
    .loc[age_fields]
    # DataFrame.sort() has been removed from pandas; sort_values() replaces it.
    .sort_values('counts', ascending=False)
)

Unnamed: 0,counts
age,240309
age (years),9939
age (yrs),9923
age at diagnosis,7596
age (y),6226
age in years,1386
age (yrs) at rrp,1192
patient age,1104
age at surgery,1062
sample age,1011


## Age values research

In [217]:
# `samples` is a one-shot generator, so the search is rebuilt here for a
# second pass over the hits matching 'age' in the raw characteristics text.
search = s().query(Q('match', characteristics_raw='age'))
search = search.extra(_source=['accession', 'characteristics', 'characteristics_raw'])
samples = itersearch(search)

In [215]:
def get_nondigits(s, repl=''):
    """Return ``s`` with every run of digits and/or dots replaced by ``repl``.

    With the default ``repl`` the runs are simply removed, leaving only the
    non-numeric part of the string.
    """
    numeric_run = re.compile(r'[\d\.]+')
    return numeric_run.sub(repl, s)



In [216]:
def update_counts(characteristics):
    """Tally the value pattern of every age field present in ``characteristics``.

    For each name in the module-level ``age_fields`` that is a key of
    ``characteristics``, the value is reduced to a pattern with
    ``get_nondigits(..., repl='#')`` (an empty pattern is recorded as
    ``'#AGEOK'``) and the module-level ``counts[field][pattern]`` tally is
    incremented in place.
    """
    for field in age_fields:
        if field not in characteristics:
            continue
        pattern = get_nondigits(characteristics[field], repl='#')
        pattern = '#AGEOK' if pattern == '' else pattern
        per_field = counts.setdefault(field, {})
        per_field[pattern] = per_field.get(pattern, 0) + 1

In [218]:
# Lazily yield the characteristics (as plain dicts) of every sample that has them.
chars = (
    sample.characteristics.to_dict()
    for sample in samples
    if 'characteristics' in sample
)

In [219]:
# Start from an empty tally; update_counts() fills it in place as a
# field -> {value pattern -> occurrences} mapping.
counts = dict()
for char in chars:
    update_counts(char)
    

502493


In [221]:
# One Series of pattern counts per age field, keyed by field name.
serieses = {}
for field_name, value_counts in counts.items():
    print(field_name)
    index, data = zip(*value_counts.items())
    serieses[field_name] = pd.Series(data=data, index=index, name=field_name)

age at diagnosis
age (yrs)
age in years
age (years)
patient age
age (yrs) at rrp
sample age
age
age(years)
age (y)
age at surgery
subject age


In [147]:
# Age fields that have at least one recorded value pattern in `counts`.
list(counts.keys())

['age at diagnosis',
 'age (yrs)',
 'age in years',
 'age (years)',
 'patient age',
 'age (yrs) at rrp',
 'sample age',
 'age',
 'age(years)',
 'age (y)',
 'age at surgery',
 'subject age']

In [246]:
# 'age' value patterns that either mention years ('year' / 'yr') or are a
# bare number ('#' / '#.#'), limited to patterns seen more than 100 times,
# most frequent first.
q = serieses['age']
(
    q
    [q.index.map(lambda pattern: 'year' in pattern or 'yr' in pattern or pattern in ('#', '#.#')) & (q > 100)]
    # Series.order() has been removed from pandas; sort_values() replaces it.
    .sort_values(ascending=False)
)


#                            88551
#.#                          11113
# years                       4362
# yrs                         1362
#-# years                     1093
# yrs, when sample taken.      878
# years old                    630
#.# years                      467
postnatal # years              398
# yr                           366
# year                         296
# year old                     162
#.# years old                  152
#-# years old                  147
about # years                  141
#-# yrs                        135
# years adult                  128
adult (#yr)                    120
#.#yrs                         118
<# year-old                    106
# year-old or older            101
#-year-old                     101
Name: age, dtype: int64

In [210]:
# Write the 'age' value-pattern counts to a CSV file under ../data.
serieses['age'].to_csv('../data/tmp.age.str-values.csv')

In [211]:
! gzip ../data/tmp.age.str-values.csv

In [214]:
! ls -lh ../data/tmp.age.str-values.csv.gz

-rw-rw-r-- 1 ubuntu ubuntu 13K Nov  4 16:45 ../data/tmp.age.str-values.csv.gz


In [159]:
# Value patterns of 'age at diagnosis' seen more than 300 times, most frequent first.
q = serieses['age at diagnosis']
# Series.order() has been removed from pandas; sort_values() replaces it.
q[q > 300].sort_values(ascending=False)

#AGEOK    5504
.         1698
Name: age at diagnosis, dtype: int64

In [160]:
# Value patterns of 'patient age' seen more than 300 times, most frequent first.
q = serieses['patient age']
# Series.order() has been removed from pandas; sort_values() replaces it.
q[q > 300].sort_values(ascending=False)

#AGEOK    973
Name: patient age, dtype: int64

In [161]:
# Value patterns of 'subject age' seen more than 300 times, most frequent first.
q = serieses['subject age']
# Series.order() has been removed from pandas; sort_values() replaces it.
q[q > 300].sort_values(ascending=False)

#AGEOK    574
Name: subject age, dtype: int64

In [162]:
# Value patterns of 'age in years' seen more than 300 times, most frequent first.
q = serieses['age in years']
# Series.order() has been removed from pandas; sort_values() replaces it.
q[q > 300].sort_values(ascending=False)

#AGEOK    1333
Name: age in years, dtype: int64

In [164]:
# Value patterns of 'age (yrs)' seen more than 300 times, most frequent first.
q = serieses['age (yrs)']
# Series.order() has been removed from pandas; sort_values() replaces it.
q[q > 300].sort_values(ascending=False)

#AGEOK    7555
.         2092
Name: age (yrs), dtype: int64

In [None]:
def has_dot(item):
    """Return True when ``item`` has an 'age' characteristic containing a dot.

    ``item`` must provide ``to_dict()``. The result is False when the
    resulting dict has no 'characteristics' entry, when that entry has no
    'age' key, or when the age value contains no '.'.
    """
    record = item.to_dict()
    return (
        'characteristics' in record
        and 'age' in record['characteristics']
        and '.' in record['characteristics']['age']
    )

In [168]:
# Lazily yield the characteristics (as plain dicts) of the samples accepted by has_dot().
chars = (
    sample.characteristics.to_dict()
    for sample in filter(has_dot, samples)
)

In [195]:
# Peek at the next record from `chars` (advances the iterator by one).
next(chars)

{'age': '43.201', 'gender': 'm'}