## Importing all samples into an Elasticsearch index

In [1]:
import sys
sys.path.append('..')

In [2]:
from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch_dsl import Mapping, String, Integer
from elasticsearch.helpers import bulk
import pymongo
from time import sleep
from lib.utils import iter_bucket

In [16]:
# Elasticsearch client created without arguments, so the client library's
# default connection settings apply.
es = Elasticsearch()
# Index-level API (exists / create / delete / close / open / settings) bound
# to the client above.
ies = IndicesClient(es)
# Handle to the `scraper_test_dev` database on a default MongoClient.
db = pymongo.MongoClient()['scraper_test_dev']

In [18]:
# Number of documents in the `samples` collection (no filter is applied).
db.samples.count()

2438491

In [50]:
# Name of the Elasticsearch index; the import cells below also reuse it as
# the document `_type` of every indexed sample.
index_name = 'samples'

## Dropping and creating index

In [59]:
# Start from a clean slate: drop the index when it already exists, then
# create it again.
if ies.exists(index=index_name):
    ies.delete(index=index_name)
ies.create(index=index_name)
# Brief pause before the following cell works on the new index
# (presumably to let index creation settle -- TODO confirm it is needed).
sleep(1)

## Creating mapping and settings

In [60]:
# The index is closed first and reopened at the very end, once both the
# mapping and the analyzer settings have been written.
ies.close(index=index_name)

# (field name, field type) pairs, registered in this order.
# NOTE: the structured 'characteristics' field is deliberately not mapped;
# only its flattened 'characteristics_raw' form is.
field_specs = [
    # Identifier-like fields: stored as-is, not analyzed.
    ('accession', String(index='not_analyzed')),
    ('id', Integer()),
    ('organism', String(index='not_analyzed')),
    ('platform', String(index='not_analyzed')),
    ('series', String(index='not_analyzed')),
    # Free-text fields: no explicit index option is given.
    ('title', String()),
    ('description', String()),
    ('source_name', String()),
    ('characteristics_raw', String()),
]

m = Mapping(index_name)
for field_name, field_type in field_specs:
    m.field(field_name, field_type)
m.save(index_name, using=es)

# Default analyzer: standard tokenizer followed by the standard, lowercase,
# stop and kstem token filters.
analysis_settings = {
    "analysis": {
        "analyzer": {
            "default": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["standard", "lowercase", "stop", "kstem"],
            },
        },
    },
}
ies.put_settings(index=index_name, body=analysis_settings)
sleep(1)
ies.open(index=index_name)

{'acknowledged': True}

In [61]:
def fields(fields_list):
    """Return a dict mapping every name in *fields_list* to ``1``.

    The result is used in this notebook as a MongoDB projection (the second
    argument of ``find``).
    """
    return dict.fromkeys(fields_list, 1)

def del_id(item):
    """Remove the ``'_id'`` key from *item* in place, if present.

    Returns the same (mutated) *item* object.
    """
    item.pop('_id', None)
    return item
    

def make_raw_chars(item):
    """Flatten ``item['characteristics']`` into ``item['characteristics_raw']``.

    *item* is modified in place and also returned.

    * If *item* already has a ``'characteristics_raw'`` key, or its
      ``'characteristics'`` value is missing or empty, it is returned
      unchanged.
    * Otherwise every ``key -> value`` pair of ``item['characteristics']``
      becomes one ``'key: value'`` string.  List values are joined with
      ``','``; surrounding spaces are stripped; pairs whose value is
      ``None`` or blank are dropped.  The resulting list is stored under
      ``'characteristics_raw'`` and the ``'characteristics'`` key is removed.

    Non-string values (e.g. numbers) are converted with ``str`` instead of
    raising ``AttributeError`` / ``TypeError``.
    """
    if 'characteristics_raw' in item:
        return item
    if not item.get('characteristics'):
        return item

    ch_raw = []
    for k, v in item['characteristics'].items():
        if v is None:
            # Nothing to index for this key.
            continue
        if isinstance(v, list):
            # Coerce elements so that mixed / numeric lists can be joined.
            v = ','.join(str(x) for x in v if x is not None)
        elif not isinstance(v, str):
            v = str(v)
        v = v.strip(" ")
        if not v:
            continue

        ch_raw.append('{}: {}'.format(k, v))

    item['characteristics_raw'] = ch_raw

    del item['characteristics']
    return item


## Inserting data

### Using only fields in mapping

In [None]:
# Running total of documents submitted for indexing.
b = 0
# Build the MongoDB projection from the property names declared in the
# mapping, so only mapped fields are fetched.
mapped_properties = m.to_dict()[index_name]['properties']
fields_list = list(mapped_properties)
fs = fields(fields_list)
# Explicitly exclude Mongo's own `_id` from the fetched documents.
fs['_id'] = 0

### Importing from geo data source

In [65]:
# Fetch every sample whose data_source is 'geo', limited to the `fs`
# projection, and index the documents bucket by bucket.
# NOTE(review): iter_bucket comes from lib.utils; it is assumed to yield
# sized batches of documents -- confirm against its implementation.
geo_cursor = db.samples.find({'data_source': 'geo'}, fs)
for bucket in iter_bucket(geo_cursor):
    # One bulk action per sample; the index name doubles as the doc type.
    actions = [
        {'_index': index_name, '_type': index_name, '_source': sample}
        for sample in bucket
    ]
    b += len(actions)
    bulk(es, actions)
print(b)

1742540


In [67]:
b

1742540

### Importing from array-express data source

In [63]:
# Reset the running total for this import.
b = 0
# Array-express samples additionally need the structured 'characteristics'
# field so that make_raw_chars can flatten it into 'characteristics_raw'.
# Use a separate projection instead of adding the key to the shared `fs`
# dict: mutating `fs` would make any later (re-)run of the geo import cell
# fetch and index the unmapped 'characteristics' field as well.
ae_fs = dict(fs, characteristics=1)
for bucket in iter_bucket(db.samples.find({'data_source': 'array-express'}, ae_fs)):
    # One bulk action per sample; the index name doubles as the doc type.
    actions = [dict(
                _index=index_name,
                _type=index_name,
                _source=make_raw_chars(s)
                ) for s in bucket]
    b += len(actions)
    bulk(es, actions)
print(b)

695951


In [64]:
b

695951