In [None]:
# !pip install elasticsearch
# pip install csv2es
from elasticsearch import helpers, Elasticsearch
import csv
import csv2es

In [None]:
# Create the Elasticsearch client used by every search/index cell below.
# The imported class is `Elasticsearch` (capital E); the lowercase name is not
# bound by the import cell and raised NameError. Called without arguments, as in
# the original, so the client library's default connection settings apply.
es = Elasticsearch()

In [15]:
# Read the source CSV, treating the first row as column names.
# `spark` is not created in the visible cells — presumably an existing
# SparkSession; it must be defined before this cell runs.
df = spark.read.csv('DNPBA2017.csv', header=True)

In [29]:
# Keep four columns, take 20 rows, and convert the result to pandas.
# NOTE(review): this rebinds `df` from the frame loaded above to the pandas
# result, so the cell is not idempotent — re-run the load cell before
# re-running this one.
df = df.select(['NUMERODN', 'CODESTAB', 'IDADEMAE', 'ESCMAEAGR1']).limit(20).toPandas()

In [32]:
# Write the sampled rows to a new CSV (without the index column); this file is
# what gets indexed into Elasticsearch below.
df.to_csv('DNPBA2017_es.csv', index=False)

### indexing

In [None]:
# Bulk-index the exported CSV: the rows produced by DictReader (dicts keyed by
# the header, with string values) are handed to helpers.bulk for index
# 'my-index' and type 'my-type'.
with open('DNPBA2017_es.csv') as f:
    reader = csv.DictReader(f)
    helpers.bulk(es, reader, index='my-index', doc_type='my-type')
    
# To list the indices on Linux, run: curl 'localhost:9200/_cat/indices?v'

### buscando termo

In [66]:
# Term query: select documents whose CODESTAB term equals "2786095".
content = dict(query=dict(term=dict(CODESTAB="2786095")))

In [67]:
# Run the term query defined in `content` against 'my-index'.
res = es.search(index="my-index", body=content)

In [68]:
# Display the raw response of the term query.
res

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'hits': {u'hits': [{u'_id': u'H8atyW8BbundlQk4fUZA',
    u'_index': u'my-index',
    u'_score': 2.6390574,
    u'_source': {u'CODESTAB': u'2786095',
     u'ESCMAEAGR1': u'01',
     u'IDADEMAE': u'25',
     u'NUMERODN': u'72390242'},
    u'_type': u'my-type'}],
  u'max_score': 2.6390574,
  u'total': {u'relation': u'eq', u'value': 1}},
 u'timed_out': False,
 u'took': 1}

### buscando um intervalo

In [71]:
# Range query: IDADEMAE greater than or equal to 20 and less than 30.
# NOTE(review): the indexed documents carry IDADEMAE as a string; if the field
# is mapped as text/keyword the bounds are compared lexicographically rather
# than numerically — confirm the index mapping.
content = dict(query=dict(range=dict(IDADEMAE=dict(gte=20, lt=30))))

In [72]:
# Run the range query defined in `content` against 'my-index'.
res = es.search(index="my-index", body=content)

In [73]:
# Display the raw response of the range query.
res

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'hits': {u'hits': [{u'_id': u'H8atyW8BbundlQk4fUZA',
    u'_index': u'my-index',
    u'_score': 1.0,
    u'_source': {u'CODESTAB': u'2786095',
     u'ESCMAEAGR1': u'01',
     u'IDADEMAE': u'25',
     u'NUMERODN': u'72390242'},
    u'_type': u'my-type'},
   {u'_id': u'IMatyW8BbundlQk4fUZB',
    u'_index': u'my-index',
    u'_score': 1.0,
    u'_source': {u'CODESTAB': u'NA',
     u'ESCMAEAGR1': u'02',
     u'IDADEMAE': u'20',
     u'NUMERODN': u'72396320'},
    u'_type': u'my-type'},
   {u'_id': u'IcatyW8BbundlQk4fUZB',
    u'_index': u'my-index',
    u'_score': 1.0,
    u'_source': {u'CODESTAB': u'NA',
     u'ESCMAEAGR1': u'04',
     u'IDADEMAE': u'20',
     u'NUMERODN': u'72374657'},
    u'_type': u'my-type'},
   {u'_id': u'IsatyW8BbundlQk4fUZB',
    u'_index': u'my-index',
    u'_score': 1.0,
    u'_source': {u'CODESTAB': u'2755157',
     u'ESCMAEAGR1': u'06',
     u'IDADEMAE': u'28',
     u'NUMERODN': u'7237

### busca lógica

In [75]:
# Boolean query: both term clauses under "must" have to match
# (IDADEMAE == 21 AND ESCMAEAGR1 == "06").
content = {
    "query": {"bool": {"must": [
        {"term": {"IDADEMAE": 21}},
        {"term": {"ESCMAEAGR1": "06"}},
    ]}}
}

In [76]:
# Run the boolean query defined in `content` against 'my-index'.
res = es.search(index="my-index", body=content)

In [77]:
# Display the raw response of the boolean query.
res

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'hits': {u'hits': [{u'_id': u'LMatyW8BbundlQk4fUZB',
    u'_index': u'my-index',
    u'_score': 3.6686769,
    u'_source': {u'CODESTAB': u'2777770',
     u'ESCMAEAGR1': u'06',
     u'IDADEMAE': u'21',
     u'NUMERODN': u'69726563'},
    u'_type': u'my-type'}],
  u'max_score': 3.6686769,
  u'total': {u'relation': u'eq', u'value': 1}},
 u'timed_out': False,
 u'took': 5}

### searching methods

In [46]:
def buscaExata(numerodn, codestab, idademae, escmaeagr1, startId=0, max_attempts=3):
    """Exact search: return the hits for which all four fields match.

    Builds a ``bool``/``must`` query (``match_phrase`` on NUMERODN and
    CODESTAB, ``match`` on IDADEMAE and ESCMAEAGR1), limited to 30 results,
    and runs it against ``my-index`` with the module-level ``es`` client.

    Args:
        numerodn: NUMERODN value; must be a string, because it is
            concatenated with literal double quotes.
        codestab: CODESTAB value; must be a string for the same reason.
        idademae: Value matched against IDADEMAE.
        escmaeagr1: Value matched against ESCMAEAGR1.
        startId: Unused; kept so existing callers keep working.
        max_attempts: How many times the search is attempted before the last
            error is re-raised (must be >= 1).

    Returns:
        The list found under ``['hits']['hits']`` in the search response.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: Whatever ``es.search`` raised on the final attempt.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be >= 1")

    content = {
        'size': 30,
        'query': {
            'bool': {
                'must': [
                    {'match_phrase': {'NUMERODN': '"' + numerodn + '"'}},
                    {'match_phrase': {'CODESTAB': '"' + codestab + '"'}},
                    {'match': {'IDADEMAE': idademae}},
                    {'match': {'ESCMAEAGR1': escmaeagr1}}
                ]
            }
        }
    }

    # Bounded retry. The previous `while True` + bare `except: pass` never
    # terminated on a persistent failure and also swallowed KeyboardInterrupt,
    # so the cell could not be stopped. `except Exception` lets interrupts
    # through, and the last error is surfaced instead of being hidden.
    for attempt in range(1, max_attempts + 1):
        try:
            res = es.search(index="my-index", body=content)
        except Exception:
            if attempt == max_attempts:
                raise
        else:
            break
    return res['hits']['hits']

In [52]:
def buscaAproximada(numerodn, codestab, idademae, escmaeagr1, startId=0, max_attempts=3):
    """Approximate search: return candidate hits ranked by fuzzy field matches.

    Builds a ``bool``/``should`` query with one fuzzy ``match`` clause per
    field (``fuzziness`` AUTO, operator ``or``), limited to 100 results, and
    runs it against ``my-index`` with the module-level ``es`` client.
    NUMERODN and CODESTAB carry boost '2', IDADEMAE carries boost '0.5', and
    ESCMAEAGR1 has no explicit boost.

    Args:
        numerodn: Value matched against NUMERODN.
        codestab: Value matched against CODESTAB.
        idademae: Value matched against IDADEMAE.
        escmaeagr1: Value matched against ESCMAEAGR1.
        startId: Unused; kept so existing callers keep working.
        max_attempts: How many times the search is attempted before the last
            error is re-raised (must be >= 1).

    Returns:
        The list found under ``['hits']['hits']`` in the search response.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        Exception: Whatever ``es.search`` raised on the final attempt.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be >= 1")

    content = {
        'size': 100,
        'query': {
            'bool': {
                'should': [
                    {'match': {'NUMERODN': {'query': numerodn, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2'}}},
                    {'match': {'CODESTAB': {'query': codestab, 'fuzziness':'AUTO', 'operator':'or', 'boost':'2'}}},
                    {'match': {'IDADEMAE': {'query': idademae, 'fuzziness':'AUTO', 'operator':'or', 'boost':'0.5'}}},
                    {'match': {'ESCMAEAGR1': {'query': escmaeagr1, 'fuzziness':'AUTO', 'operator':'or'}}}
                ]
            }
        }
    }

    # Bounded retry. The previous `while True` + bare `except: pass` never
    # terminated on a persistent failure and also swallowed KeyboardInterrupt,
    # so the cell could not be stopped. `except Exception` lets interrupts
    # through, and the last error is surfaced instead of being hidden.
    for attempt in range(1, max_attempts + 1):
        try:
            res = es.search(index="my-index", body=content)
        except Exception:
            if attempt == max_attempts:
                raise
        else:
            break
    return res['hits']['hits']

In [53]:
# Exact search using the field values of a record that was indexed above.
buscaExata("72390242", "2786095", "25", "01")

[{u'_id': u'H8atyW8BbundlQk4fUZA',
  u'_index': u'my-index',
  u'_score': 10.55623,
  u'_source': {u'CODESTAB': u'2786095',
   u'ESCMAEAGR1': u'01',
   u'IDADEMAE': u'25',
   u'NUMERODN': u'72390242'},
  u'_type': u'my-type'}]

In [63]:
# One field modified (note that the result is empty)
buscaExata("72390242", "2786095", "33", "01")

[]

In [57]:
# Approximate search using the field values of a record that was indexed above.
buscaAproximada("72390242", "2786095", "25", "01")

[{u'_id': u'H8atyW8BbundlQk4fUZA',
  u'_index': u'my-index',
  u'_score': 9.236701,
  u'_source': {u'CODESTAB': u'2786095',
   u'ESCMAEAGR1': u'01',
   u'IDADEMAE': u'25',
   u'NUMERODN': u'72390242'},
  u'_type': u'my-type'},
 {u'_id': u'I8atyW8BbundlQk4fUZB',
  u'_index': u'my-index',
  u'_score': 5.278115,
  u'_source': {u'CODESTAB': u'7373120',
   u'ESCMAEAGR1': u'12',
   u'IDADEMAE': u'34',
   u'NUMERODN': u'72392109'},
  u'_type': u'my-type'}]

In [65]:
# One field modified (more than one candidate is returned)
buscaAproximada("72390242", "2786095", "33", "01")

[{u'_id': u'H8atyW8BbundlQk4fUZA',
  u'_index': u'my-index',
  u'_score': 13.195287,
  u'_source': {u'CODESTAB': u'2786095',
   u'ESCMAEAGR1': u'01',
   u'IDADEMAE': u'25',
   u'NUMERODN': u'72390242'},
  u'_type': u'my-type'},
 {u'_id': u'KMatyW8BbundlQk4fUZB',
  u'_index': u'my-index',
  u'_score': 1.0641159,
  u'_source': {u'CODESTAB': u'2777770',
   u'ESCMAEAGR1': u'08',
   u'IDADEMAE': u'33',
   u'NUMERODN': u'69726416'},
  u'_type': u'my-type'},
 {u'_id': u'KcatyW8BbundlQk4fUZB',
  u'_index': u'my-index',
  u'_score': 1.0641159,
  u'_source': {u'CODESTAB': u'2777770',
   u'ESCMAEAGR1': u'06',
   u'IDADEMAE': u'33',
   u'NUMERODN': u'69726448'},
  u'_type': u'my-type'}]

In [62]:
# Number of candidates returned by the approximate search with one field modified.
# Parenthesised form: prints the same value on Python 2 and is valid on Python 3,
# where the bare `print x` statement is a SyntaxError.
print(len(buscaAproximada("72390242", "2786095", "33", "01")))

3
