In [45]:
import urllib.request 
import re
import pandas as pd
from elasticsearch import Elasticsearch

In [46]:
import requests
# run bin/elasticsearch in elasticsearch directory in terminal before running this cell
res = requests.get('http://localhost:9200')

In [47]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

In [48]:
def test_ES(es):
    return es.ping()  # got True

In [49]:
if test_ES(es):
    print('ES instance working')
else:
    print('ES instance not working')

ES instance working




In [50]:
def index_info(index_name):
    count, deleted, shards, =  es.cat.indices(index=index_name, h=['docs.count', 'docs.deleted', 'pri'])[:-1].split(' ')
    print(
      """
      #### INDEX INFO #####
      index_name = {}
      doc_count = {}
      shard_count = {}
      deleted_doc_count = {}
      """.format(index_name, count, shards, deleted)
  )

In [51]:
# read data from steam data directory 
games = pd.read_pickle('final_data.pickle')

In [82]:
#es.indices.delete(index='steam_index_bm25')

{'acknowledged': True}

In [98]:
# set index name
index_name = 'steam_index_bm25'

In [83]:
# BM25 (Default index)
request_body_bm25 = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        
    },
    'mappings': {
          'properties': {
              'app_id':{'type': 'integer'},
              'title': {'type': 'text'},
              'short_description': {'type': 'text'},
              'about_the_game': {'type': 'text'},
              'developer': {
                    "type": "text",
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'publisher': {
                    "type": "text",
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'categories': {
                    "type": "text",
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'genres': {
                    "type": "text",
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'release_date': {'type': 'date'},
          }
    }
}

try:
    es.indices.get(index_name)
    print('index {} already exists'.format(index_name))
except:
    print('creating index {}'.format(index_name))
    es.indices.create(index_name, body=request_body_bm25)

creating index steam_index_bm25


In [91]:
for i, (title, app_id, developer, publisher, categories, genres, body, abstract, release_date) in enumerate(games.values[:,1:-1]):
    doc_body = {
              'app_id': app_id,
              'title': title,
              'short_description': abstract,
              'about_the_game': body,
              'developer': developer,
              'publisher': publisher,
              'categories': categories,
              'genres': genres,
              'release_date': release_date
              }
    es.index(index_name, doc_body, id=i)

In [86]:
print('we have made and index called {} with {} documents'.format(index_name, es.cat.count(index=index_name,h=['count'])))
index_info(index_name)

we have made and index called steam_index_bm25 with 49317
 documents

      #### INDEX INFO #####
      index_name = steam_index_bm25
      doc_count = 49317
      shard_count = 1
      deleted_doc_count = 0
      


In [101]:
index_name = 'steam_index_dfr'

In [102]:
#DFR index
request_body_dfr = {
        'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'dfr_similarity': {
                    'type': 'DFR',
                    'basic_model': 'g',
                    'after_effect': 'l',
                    'normalization': 'h2',
                    'normalization.h2.c':'3.0'

                }
            }
        }
        
    },
    'mappings': {
          'properties': {
              'app_id':{'type': 'integer'},
              'title': {'type': 'text', 'similarity': 'dfr_similarity'},
              'short_description': {'type': 'text', 'similarity': 'dfr_similarity'},
              'about_the_game': {'type': 'text', 'similarity': 'dfr_similarity'},
              'developer': {
                    "type": "text", 'similarity': 'dfr_similarity',
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'publisher': {
                    "type": "text", 'similarity': 'dfr_similarity',
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'categories': {
                    "type": "text", 'similarity': 'dfr_similarity',
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'genres': {
                    "type": "text", 'similarity': 'dfr_similarity',
                    "fields": {
                    "raw": { 
                       "type": "keyword"
                            }
                    }
              },
              'release_date': {'type': 'date'},
          }
    }
}

try:
    es.indices.get(index_name)
    print('index {} already exists'.format(index_name))
except:
    print('creating index {}'.format(index_name))
    es.indices.create(index_name, body=request_body_dfr)

index steam_index_dfr already exists


In [None]:
for i, (title, app_id, developer, publisher, categories, genres, body, abstract, release_date) in enumerate(games.values[:,1:-1]):
    doc_body = {
              'app_id': app_id,
              'title': title,
              'short_description': abstract,
              'about_the_game': body,
              'developer': developer,
              'publisher': publisher,
              'categories': categories,
              'genres': genres,
              'release_date': release_date
              }
    es.index(index_name, doc_body, id=i)

In [103]:
print('we have made and index called {} with {} documents'.format(index_name, es.cat.count(index=index_name,h=['count'])))
index_info(index_name)

we have made and index called steam_index_dfr with 49317
 documents

      #### INDEX INFO #####
      index_name = steam_index_dfr
      doc_count = 49317
      shard_count = 1
      deleted_doc_count = 0
      


In [104]:
es.get(index=index_name, id=51)

{'_index': 'steam_index_dfr',
 '_type': '_doc',
 '_id': '51',
 '_version': 1,
 '_seq_no': 51,
 '_primary_term': 1,
 'found': True,
 '_source': {'app_id': 2300,
  'title': 'DOOM II',
  'short_description': 'let obsession begin time entire force netherworld overrun earth save must descend stygian depth hell battle mightier nastier deadlier demon monster use powerful weapon',
  'about_the_game': 'let obsession begin time entire force netherworld overrun earth save must descend stygian depth hell battle mightier nastier deadlier demon monster use powerful weapon survive mind blowing explosion bloodiest fiercest awesome blastfest ever play doom ii solo two people modem four player lan supporting ipx protocol matter way choose get ready adrenaline pumping action packed excitement thats sure give heart real workout',
  'developer': ['id Software'],
  'publisher': ['id Software'],
  'categories': ['Single-player',
   'Multi-player',
   'PvP',
   'Shared/Split Screen PvP',
   'Co-op',
   'Share

In [105]:
query_body = {
    'query':{
        'term': {
            'about_the_game': 'doom'
        }
    }
}
print('### RESULTS ####')
explain=True
results = es.search(index=index_name, body=query_body, explain=explain)['hits']['hits']
for hit in results:
    print('title: {} - score: {}'.format(hit['_source']['title'], hit['_score']))
if explain:
    print('some info on results')
    print(hit['_explanation'])

### RESULTS ####
title: Doom 3: BFG Edition - score: 6.753497
title: Retro Classix: Gate of Doom - score: 6.3668017
title: Doom & Destiny Worlds - score: 6.3424344
title: Scoot Kaboom and the Tomb of Doom - score: 6.341451
title: The Forest of Doom (Standalone) - score: 6.316736
title: DOOM VFR - score: 6.2229695
title: Ultimate Doom - score: 6.144916
title: Road Doom - score: 6.144916
title: DOOM - score: 6.0524974
title: Rise of the Triad: Dark War - score: 5.997403
some info on results
{'value': 5.997403, 'description': 'weight(about_the_game:doom in 5460) [PerFieldSimilarity], result of:', 'details': [{'value': 5.997403, 'description': 'score(DFRSimilarity, freq=3.0), computed as boost * basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:', 'details': [{'value': 5.884641, 'description': 'NormalizationH2, computed as tf * log2(1 + c * avgfl / fl) from:', 'details': [{'value': 3.0, 'description': 'tf, number of occurrences of term in the document', 'details': []}, {'va