## Importing all samples into an Elasticsearch index

In [1]:
import sys
# Add the parent directory to the import path — presumably so the
# project-local `lib` package imported in the next cell resolves (confirm).
sys.path.append('..')

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch_dsl import Mapping, String, Integer, Nested, Date, Object
from elasticsearch.helpers import bulk
import pymongo
from time import sleep
from lib.utils import iter_bucket
import json

In [3]:
# Elasticsearch client created with the library's default connection settings.
es = Elasticsearch()
# Index-administration client (exists/create/delete/close/open) built on `es`.
ies = IndicesClient(es)

In [4]:
# Name of the target index; later cells also reuse it as the mapping /
# document type name.
index_name = 'series_dev'

## Dropping and creating index

In [17]:
# Start from a clean slate: delete the index if it already exists, then
# create it again.
if ies.exists(index_name):
    ies.delete(index_name)
ies.create(index_name)
# Short pause before the next cell touches the index (presumably to let the
# creation settle — TODO confirm it is still required).
sleep(1)

## Creating mapping and settings

In [18]:
# The index is closed while the mapping is saved and re-opened at the end of
# this cell.
ies.close(index_name)
# Mapping for the document type named after the index.
m = Mapping(index_name)

# Field declarations, in three flavours:
#   * String(index='not_analyzed') for identifier-like values,
#   * String(index='no') / Object(enabled=False) for fields declared without
#     indexing,
#   * String() with default options for free text.
m.field('accession', String(index='not_analyzed'))
m.field('contact', Object(enabled=False))
m.field('contributor', String(index='no'))
m.field('data_source', String(index='not_analyzed'))
m.field('email', String(index='no'))

# `meta` is an object field with a single integer sub-field, `geo_id`.
meta = Object()
meta.field('geo_id', Integer(index='not_analyzed'))

m.field('meta', meta)
m.field('organism', String(index='not_analyzed'))
m.field('overall_design', String())
m.field('platforms', String(index='not_analyzed', multi=True))
m.field('pubmed_id', String(index='no'))
m.field('relations', Object(enabled=False))
m.field('samples', String(index='not_analyzed', multi=True))
# NOTE(review): both date fields are declared as non-indexed strings rather
# than Date — confirm this is intentional.
m.field('scrap_date', String(index='no'))
m.field('status', String(index='not_analyzed'))
m.field('submission_date', String(index='no'))
m.field('summary', String())
# `supplementary_files` is a nested field with `type` and `name` sub-fields.
supplementary_files = Nested()
supplementary_files.field("type", String(index='not_analyzed'))
supplementary_files.field("name", String(index='not_analyzed'))

m.field('supplementary_files', supplementary_files)

m.field('title', String())
m.field('type', String(index='not_analyzed', multi=True))
m.field('web_link', String(index='not_analyzed'))

# Push the mapping to the index through the `es` client.
m.save(index_name, using=es)

# Disabled: custom default analyzer settings (standard tokenizer with
# standard/lowercase/stop/kstem filters). Left commented out, so these
# settings are not applied.
# ies.put_settings(index=index_name, body={
#     "analysis":{
#       "analyzer":{
#         "default":{
#           "type":"custom",
#           "tokenizer":"standard",
#           "filter":[ "standard", "lowercase", "stop", "kstem" ]
#         }
#       }
#     }
# })
# Short pause, then re-open the index that was closed at the top of the cell.
sleep(1)
ies.open(index_name)

{'acknowledged': True}

In [7]:
def fields(fields_list):
    """Return a dict that maps every name in *fields_list* to ``1``.

    The result has the include-flag shape that is later passed as the
    second argument of a ``find`` call.
    """
    return {name: 1 for name in fields_list}

def del_id(item):
    """Drop the ``_id`` key from *item* in place (if present) and return it."""
    item.pop('_id', None)
    return item
    

def make_raw_chars(item):
    """Replace ``item['characteristics']`` with a flat ``characteristics_raw`` list.

    Each key/value pair of the ``characteristics`` dict becomes one
    ``"key: value"`` string. List values are joined with commas, surrounding
    spaces are stripped, and pairs whose value ends up empty are dropped.

    The item is modified in place and returned. It is returned untouched
    when it already has ``characteristics_raw`` or when ``characteristics``
    is missing or empty.
    """
    if 'characteristics_raw' in item or not item.get('characteristics'):
        return item

    raw_lines = []
    for name, value in item['characteristics'].items():
        text = ','.join(value) if isinstance(value, list) else value
        text = text.strip(" ")
        if text:
            raw_lines.append('{}: {}'.format(name, text))

    item['characteristics_raw'] = raw_lines
    del item['characteristics']
    return item


## Inserting data

### Using only fields in mapping

In [8]:
def read_dump(dump_file):
    """Lazily yield one decoded JSON document per line of *dump_file*.

    Args:
        dump_file: Path to a newline-delimited JSON file (one document per
            line), such as the export produced earlier in this notebook.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which differs between machines and can corrupt or reject
    # non-ASCII text.
    with open(dump_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines carry no document; skipping them avoids a decode
            # error on stray empty lines.
            if not line:
                continue
            yield json.loads(line)

In [9]:
# Running document counter (reset again by the import cells).
b = 0
# Names of all properties declared by the mapping `m`.
fields_list = list(m.to_dict()[index_name]['properties'])
# Include-flag dict for those fields, with `_id` explicitly excluded.
fs = dict(fields(fields_list), _id=0)

### Creating dump

In [19]:
# Shell escape: export the `series` collection of the `scraper_test_dev`
# database to a timestamped JSON file under /data/rawdata/snapshots/mongo/.
!mongoexport -d  scraper_test_dev -c series \
--out /data/rawdata/snapshots/mongo/scraper_test_dev.series.$(date +%Y%m%d.%H%M%S).json

2015-10-29T16:52:48.084+0000	connected to: localhost
2015-10-29T16:52:59.081+0000	exported 66380 records


### Importing from dump

In [10]:
def convert_date(item):
    """Unwrap ``{"$date": ...}`` wrappers on the known date fields of *item*.

    The fields ``submission_date``, ``scrap_date`` and ``last_update_date``
    are inspected. When one holds a truthy value containing the key
    ``$date``, the field is replaced by that inner value, e.g.::

        "submission_date": {"$date": "2000-09-28T00:00:00.000Z"}

    becomes::

        "submission_date": "2000-09-28T00:00:00.000Z"

    Missing or empty fields are left alone. The item is modified in place
    and returned.
    """
    for name in ('submission_date', 'scrap_date', 'last_update_date'):
        value = item.get(name)
        if value and '$date' in value:
            item[name] = value['$date']
    return item

In [20]:
# Bulk-index the exported dump: one bulk request per bucket of documents,
# with `b` counting how many documents were submitted.
b = 0
samples_iter = read_dump('/data/rawdata/snapshots/mongo/scraper_test_dev.series.20151029.165247.json')
for bucket in iter_bucket(map(convert_date, samples_iter)):
    # Index name doubles as the document type, matching the mapping.
    actions = [
        {'_index': index_name, '_type': index_name, '_source': s}
        for s in bucket
    ]

    b += len(actions)
    bulk(es, actions)

In [21]:
# Show how many documents the dump import loop submitted.
b

66380

### Importing from array-express data source

In [63]:
# NOTE(review): `db` is not assigned in any cell visible here — presumably a
# pymongo database handle created elsewhere; confirm before re-running.
b = 0
# Also fetch `characteristics`, which make_raw_chars flattens before indexing.
fs['characteristics'] = 1
for bucket in iter_bucket(db.samples.find({'data_source': 'array-express'}, fs)):
    actions = [
        {'_index': index_name, '_type': index_name, '_source': make_raw_chars(s)}
        for s in bucket
    ]
    b += len(actions)
    bulk(es, actions)
print(b)

695951


In [64]:
# Show how many array-express documents were submitted.
b

695951