In [1]:
import urllib.request 
import re
import pandas as pd
from elasticsearch import Elasticsearch

In [2]:
import requests
# Start Elasticsearch first: run bin/elasticsearch from the Elasticsearch directory in a terminal.
# A timeout is passed because requests waits indefinitely by default, which would
# hang this cell if the node accepts the connection but never answers.
res = requests.get('http://localhost:9200', timeout=10)

In [3]:
# Connect the Python client to the node at localhost:9200.
es = Elasticsearch(hosts=[dict(host='localhost', port=9200)])

In [4]:
def test_ES(es):
    return es.ping()  # got True

In [5]:
# Report in one line whether the ping check succeeded.
print('ES instance working' if test_ES(es) else 'ES instance not working')

ES instance working




In [6]:
def index_info(index_name, client=None):
    """Print document, shard and deleted-document counts for one index.

    Parameters
    ----------
    index_name : str
        Name of the index to describe; forwarded to the cat-indices API.
    client : optional
        Client object exposing ``cat.indices``. When omitted, the notebook-level
        ``es`` client is used, which is what the one-argument call has always done.

    Returns
    -------
    None
        The summary is printed, not returned.

    Raises
    ------
    ValueError
        If the response does not hold exactly three whitespace-separated values
        (for example when ``index_name`` matches more than one index).
    """
    if client is None:
        client = es
    raw = client.cat.indices(index=index_name, h=['docs.count', 'docs.deleted', 'pri'])
    # split() with no argument ignores the trailing newline and any column padding;
    # slicing off one character and splitting on a single space breaks on either.
    count, deleted, shards = raw.split()
    print(
      """
      #### INDEX INFO #####
      index_name = {}
      doc_count = {}
      shard_count = {}
      deleted_doc_count = {}
      """.format(index_name, count, shards, deleted)
    )

In [51]:
# Load the Steam games data; the path is relative, so it resolves against the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files you produced yourself.
games = pd.read_pickle(filepath_or_buffer='final_data.pickle')

In [82]:
# Uncomment to drop the BM25 index before rebuilding it (this cannot be undone):
#es.indices.delete(index='steam_index_bm25')

{'acknowledged': True}

In [11]:
# Name of the index that the following cells create and fill.
index_name = "steam_index_bm25"

In [12]:
# BM25 (Default index)
# No custom similarity is declared in this body. Free-text fields are plain `text`;
# the list-like fields additionally get a `raw` keyword sub-field.
request_body_bm25 = dict(
    settings=dict(number_of_shards=1, number_of_replicas=1),
    mappings=dict(
        properties={
            'app_id': dict(type='integer'),
            **{
                name: dict(type='text')
                for name in ('title', 'short_description', 'about_the_game')
            },
            **{
                name: dict(type='text', fields=dict(raw=dict(type='keyword')))
                for name in ('developer', 'publisher', 'categories', 'genres')
            },
            'release_date': dict(type='date'),
        }
    ),
)

# Create the index only when it is missing. An explicit existence check replaces the
# former bare `except:`, which treated every failure (connection errors, even Ctrl-C)
# as "index not found" and then tried to create the index anyway.
if es.indices.exists(index=index_name):
    print('index {} already exists'.format(index_name))
else:
    print('creating index {}'.format(index_name))
    es.indices.create(index=index_name, body=request_body_bm25)

index steam_index_bm25 already exists




In [91]:
# Index every row of `games` as one document, using the row position as the document id.
# The slice drops the first and last columns and the remaining nine are unpacked by
# position, so this loop depends on the column order of the loaded data.
for i, (title, app_id, developer, publisher, categories, genres, body, abstract, release_date) in enumerate(games.values[:,1:-1]):
    doc_body = {
              'app_id': app_id,
              'title': title,
              'short_description': abstract,
              'about_the_game': body,
              'developer': developer,
              'publisher': publisher,
              'categories': categories,
              'genres': genres,
              'release_date': release_date
              }
    # Keyword arguments, matching the other client calls in this notebook; newer
    # elasticsearch-py releases no longer accept positional API arguments.
    es.index(index=index_name, body=doc_body, id=i)

In [86]:
# cat.count returns text that ends in a newline; strip it so the sentence stays on one line.
print('we have made and index called {} with {} documents'.format(index_name, es.cat.count(index=index_name, h=['count']).strip()))
index_info(index_name)

we have made and index called steam_index_bm25 with 49317
 documents

      #### INDEX INFO #####
      index_name = steam_index_bm25
      doc_count = 49317
      shard_count = 1
      deleted_doc_count = 0
      


In [101]:
# From here on the notebook works against the DFR index.
index_name = "steam_index_dfr"

In [102]:
#DFR index
# Declares a custom similarity named `dfr_similarity` in the index settings and
# attaches it to every text field; `app_id` and `release_date` are left without it.
request_body_dfr = dict(
    settings=dict(
        number_of_shards=1,
        number_of_replicas=1,
        index=dict(
            similarity=dict(
                dfr_similarity={
                    'type': 'DFR',
                    'basic_model': 'g',
                    'after_effect': 'l',
                    'normalization': 'h2',
                    'normalization.h2.c': '3.0',
                }
            )
        ),
    ),
    mappings=dict(
        properties={
            'app_id': dict(type='integer'),
            **{
                name: dict(type='text', similarity='dfr_similarity')
                for name in ('title', 'short_description', 'about_the_game')
            },
            **{
                name: dict(
                    type='text',
                    similarity='dfr_similarity',
                    fields=dict(raw=dict(type='keyword')),
                )
                for name in ('developer', 'publisher', 'categories', 'genres')
            },
            'release_date': dict(type='date'),
        }
    ),
)

# Create the index only when it is missing. An explicit existence check replaces the
# former bare `except:`, which treated every failure (connection errors, even Ctrl-C)
# as "index not found" and then tried to create the index anyway.
if es.indices.exists(index=index_name):
    print('index {} already exists'.format(index_name))
else:
    print('creating index {}'.format(index_name))
    es.indices.create(index=index_name, body=request_body_dfr)

index steam_index_dfr already exists


In [None]:
# Index every row of `games` as one document, using the row position as the document id.
# The slice drops the first and last columns and the remaining nine are unpacked by
# position, so this loop depends on the column order of the loaded data.
for i, (title, app_id, developer, publisher, categories, genres, body, abstract, release_date) in enumerate(games.values[:,1:-1]):
    doc_body = {
              'app_id': app_id,
              'title': title,
              'short_description': abstract,
              'about_the_game': body,
              'developer': developer,
              'publisher': publisher,
              'categories': categories,
              'genres': genres,
              'release_date': release_date
              }
    # Keyword arguments, matching the other client calls in this notebook; newer
    # elasticsearch-py releases no longer accept positional API arguments.
    es.index(index=index_name, body=doc_body, id=i)

In [103]:
# cat.count returns text that ends in a newline; strip it so the sentence stays on one line.
print('we have made and index called {} with {} documents'.format(index_name, es.cat.count(index=index_name, h=['count']).strip()))
index_info(index_name)

we have made and index called steam_index_dfr with 49317
 documents

      #### INDEX INFO #####
      index_name = steam_index_dfr
      doc_count = 49317
      shard_count = 1
      deleted_doc_count = 0
      


In [104]:
# Spot-check: fetch the document stored under id 51 from the current index.
es.get(id=51, index=index_name)

{'_index': 'steam_index_dfr',
 '_type': '_doc',
 '_id': '51',
 '_version': 1,
 '_seq_no': 51,
 '_primary_term': 1,
 'found': True,
 '_source': {'app_id': 2300,
  'title': 'DOOM II',
  'short_description': 'let obsession begin time entire force netherworld overrun earth save must descend stygian depth hell battle mightier nastier deadlier demon monster use powerful weapon',
  'about_the_game': 'let obsession begin time entire force netherworld overrun earth save must descend stygian depth hell battle mightier nastier deadlier demon monster use powerful weapon survive mind blowing explosion bloodiest fiercest awesome blastfest ever play doom ii solo two people modem four player lan supporting ipx protocol matter way choose get ready adrenaline pumping action packed excitement thats sure give heart real workout',
  'developer': ['id Software'],
  'publisher': ['id Software'],
  'categories': ['Single-player',
   'Multi-player',
   'PvP',
   'Shared/Split Screen PvP',
   'Co-op',
   'Share

In [20]:
# Free-text search over the descriptive fields of the index named by `index_name`.
# NOTE(review): the query text contains what look like typos ("mutliplayer", "adn")
# and the query sets no fuzziness option - confirm whether that is intentional.
query_body = {
  "query": {
    "multi_match" : {
      "query": "bethesda medieval fantasy massively mutliplayer swords adn dragons", 
      # 'categories' replaces the misspelled 'getegories', which is not a mapped field.
      "fields": [ "title", "short_description", 'about_the_game', 'developer', 
                 'publisher', 'categories', 'genres' ] 
    }
  }
}
print('### RESULTS ####')
explain=False
results = es.search(index=index_name, body=query_body, explain=explain)['hits']['hits']
for hit in results:
    print('title: {} - score: {}'.format(hit['_source']['title'], hit['_score']))
# Show the explanation for the last hit only, as before. Indexing `results` instead of
# reusing the loop variable avoids a NameError (or a stale `hit`) when nothing matched.
if explain and results:
    print('some info on results')
    print(results[-1]['_explanation'])

### RESULTS ####


ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7fe07186d5b0>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7fe07186d5b0>: Failed to establish a new connection: [Errno 111] Connection refused)