In [1]:
import urllib.request 
import re
import pandas as pd
from elasticsearch import Elasticsearch

In [2]:
import requests
# Run bin/elasticsearch (from the Elasticsearch directory) in a terminal before running this cell.
# requests has no default timeout, so set one explicitly: without it the cell can
# hang forever if the port accepts the connection but never answers.
res = requests.get('http://localhost:9200', timeout=10)

In [3]:
# Client used by every cell below; points at the node on localhost:9200.
es = Elasticsearch(hosts=[dict(host='localhost', port=9200)])

In [4]:
def test_ES(es):
    return es.ping()  # got True

In [5]:
# Report whether the client's ping succeeded.
print('ES instance working' if test_ES(es) else 'ES instance not working')

ES instance working




In [6]:
def index_info(index_name, client=None):
    """Print document, shard and deleted-document counts for an index.

    The numbers are read from the cat-indices API using the columns
    ``docs.count``, ``docs.deleted`` and ``pri``.

    :param index_name: name of the index to describe. It must resolve to
        exactly one row of cat output.
    :param client: object exposing ``cat.indices(index=..., h=...)``.
        Defaults to the notebook-level ``es`` client.
    :raises ValueError: if the cat output does not contain exactly three
        columns (e.g. the name matched several indices).
    """
    if client is None:
        client = es
    row = client.cat.indices(index=index_name, h=['docs.count', 'docs.deleted', 'pri'])
    # split() without an argument ignores the trailing newline and any column
    # padding, unlike slicing off the last character and splitting on ' '.
    columns = str(row).split()
    if len(columns) != 3:
        raise ValueError(
            'expected 3 columns from cat.indices for {!r}, got {!r}'.format(index_name, columns)
        )
    count, deleted, shards = columns
    print(
      """
      #### INDEX INFO #####
      index_name = {}
      doc_count = {}
      shard_count = {}
      deleted_doc_count = {}
      """.format(index_name, count, shards, deleted)
    )

In [7]:
# read data from steam data directory
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load this pickle if it comes from a trusted source.
# The indexing cells below consume games.values[:, 1:] positionally, so the
# column order of this frame matters.
games = pd.read_pickle('final_data_raw_V3.pickle')

In [8]:
#es.indices.delete(index='steam_index_dfr')

In [9]:
# set index name: target index for the BM25 cells that follow.
# index_name is reassigned further down for the DFR index.
index_name = 'steam_index_bm25_final'

In [63]:
# BM25 (Default index)
# No custom similarity is configured here, so text fields use the cluster default.
request_body_bm25_final = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
    },
    'mappings': {
        'properties': {
            'app_id': {'type': 'integer'},
            'title': {'type': 'text'},
            # Free-text descriptions go through the english analyzer.
            'short_description': {
                'type': 'text',
                'analyzer': 'english'
            },
            'about_the_game': {
                'type': 'text',
                'analyzer': 'english'
            },
            # Full-text field plus a 'raw' keyword sub-field holding the unanalyzed value.
            'developers': {
                'type': 'text',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'publishers': {
                'type': 'text',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'categories': {
                'type': 'text',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'genres': {
                'type': 'text',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'is_released': {'type': 'keyword'},
            'release_date': {'type': 'date'},
            'popularity': {'type': 'float'},
            'rating': {'type': 'float'}
        }
    }
}

# Ask the cluster whether the index exists instead of wrapping indices.get()
# in a bare except: that also swallowed connection errors and
# KeyboardInterrupt and then tried to create the index anyway.
if es.indices.exists(index=index_name):
    print('index {} already exists'.format(index_name))
else:
    print('creating index {}'.format(index_name))
    es.indices.create(index=index_name, body=request_body_bm25_final)

index steam_index_bm25_final already exists




In [None]:
# Index every game into the index named by index_name, one request per row.
# The first column of games is skipped; the remaining twelve are unpacked
# positionally, so the frame's column order must match this tuple.
for i, (name, app_id, about_the_game, short_description,
        developers, publishers, categories, genres,
        release_date, is_released, rating, popularity) in enumerate(games.values[:, 1:]):
    doc_body = {
        'app_id': app_id,
        'title': name,
        'short_description': short_description,
        'about_the_game': about_the_game,
        'developers': developers,
        'publishers': publishers,
        'categories': categories,
        'genres': genres,
        'is_released': is_released,
        'release_date': release_date,
        'rating': rating,
        'popularity': popularity
    }
    # Keyword arguments: positional API arguments are deprecated in later
    # 7.x clients and rejected by 8.x. The row position is the document id,
    # so re-running the cell overwrites documents instead of duplicating them.
    es.index(index=index_name, body=doc_body, id=i)

In [None]:
# Refresh first so the documents just indexed are visible to the counts below;
# without it the numbers can lag until the next automatic refresh.
es.indices.refresh(index=index_name)
# es.count returns the number directly; cat.count returned text with a
# trailing newline that ended up inside the message.
print('we have made an index called {} with {} documents'.format(
    index_name, es.count(index=index_name)['count']))
index_info(index_name)

In [None]:
# Switch the target index: the cells below create and fill the DFR index under this name.
index_name = 'steam_index_dfr_final'

In [None]:
# DFR index
# Defines a custom similarity named 'dfr_similarity' and attaches it to every
# text field of the mapping.
request_body_dfr = {
    'settings': {
        'number_of_shards': 1,
        'number_of_replicas': 1,
        'index': {
            'similarity': {
                'dfr_similarity': {
                    'type': 'DFR',
                    'basic_model': 'g',
                    'after_effect': 'l',
                    'normalization': 'h2',
                    'normalization.h2.c': '3.0'
                }
            }
        }
    },
    'mappings': {
        'properties': {
            'app_id': {'type': 'integer'},
            'title': {
                'type': 'text',
                'similarity': 'dfr_similarity'
            },
            'short_description': {
                'type': 'text',
                'analyzer': 'english',
                'similarity': 'dfr_similarity'
            },
            'about_the_game': {
                'type': 'text',
                'analyzer': 'english',
                'similarity': 'dfr_similarity'
            },
            # Field names must match the keys of the indexed documents
            # ('developers' / 'publishers'). The singular names used before
            # never matched, so those fields were left to dynamic mapping and
            # got neither the DFR similarity nor the 'raw' sub-field.
            'developers': {
                'type': 'text',
                'similarity': 'dfr_similarity',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'publishers': {
                'type': 'text',
                'similarity': 'dfr_similarity',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'categories': {
                'type': 'text',
                'similarity': 'dfr_similarity',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'genres': {
                'type': 'text',
                'similarity': 'dfr_similarity',
                'fields': {'raw': {'type': 'keyword'}}
            },
            'is_released': {'type': 'keyword'},
            'release_date': {'type': 'date'},
            'popularity': {'type': 'float'},
            'rating': {'type': 'float'}
        }
    }
}

# Ask the cluster whether the index exists instead of wrapping indices.get()
# in a bare except: that also swallowed connection errors and
# KeyboardInterrupt and then tried to create the index anyway.
# NOTE: the request body is only applied in the create branch; an index that
# already exists keeps whatever mapping it was created with.
if es.indices.exists(index=index_name):
    print('index {} already exists'.format(index_name))
else:
    print('creating index {}'.format(index_name))
    es.indices.create(index=index_name, body=request_body_dfr)

In [None]:
# indexing using DFR
# Index every game into the index named by index_name, one request per row.
# The first column of games is skipped; the remaining twelve are unpacked
# positionally, so the frame's column order must match this tuple.
for i, (name, app_id, about_the_game, short_description,
        developers, publishers, categories, genres,
        release_date, is_released, rating, popularity) in enumerate(games.values[:, 1:]):
    doc_body = {
        'app_id': app_id,
        'title': name,
        'short_description': short_description,
        'about_the_game': about_the_game,
        'developers': developers,
        'publishers': publishers,
        'categories': categories,
        'genres': genres,
        'is_released': is_released,
        'release_date': release_date,
        'rating': rating,
        'popularity': popularity
    }
    # Keyword arguments: positional API arguments are deprecated in later
    # 7.x clients and rejected by 8.x. The row position is the document id,
    # so re-running the cell overwrites documents instead of duplicating them.
    es.index(index=index_name, body=doc_body, id=i)

In [None]:
# Refresh first so the documents just indexed are visible to the counts below;
# without it the numbers can lag until the next automatic refresh.
es.indices.refresh(index=index_name)
# es.count returns the number directly; cat.count returned text with a
# trailing newline that ended up inside the message.
print('we have made an index called {} with {} documents'.format(
    index_name, es.count(index=index_name)['count']))
index_info(index_name)