# Elasticsearch Demo

In [39]:
import pandas as pd

hotels = pd.read_pickle('../Storage/Data/hotels.pkl')
reviews = pd.read_pickle('../Storage/Data/preprocessed_reviews.pkl')
sentiments = pd.read_pickle('../Storage/Data/reviews_agg_sentiment.pkl')

hotels.shape, reviews.shape, sentiments.shape

((21420, 19), (21227, 5), (275, 2))

In [40]:
grouped_hotels = hotels[['reviews.text']].groupby('name').count()
grouped_hotels.index

KeyError: 'name'

In [41]:
sentiments.index

Index(['A Bed & Breakfast In Cambridge', 'Acorn Motor Inn',
       'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City', 'Americas Best Value Inn',
       'Americas Best Value Inn - Medical Center/airport',
       'Americas Best Value Inn and Suites', 'Americinn Coralville',
       'Americinn Lodge & Suites Appleton', 'Amisos Hotel',
       ...
       'Travelodge Ruther Glen', 'Tryp Dusseldorf Krefeld Hotel',
       'Tulip Inn Turin West', 'Una Hotel Forte Dei Marmi', 'Villa Carlotta',
       'Vista Hotel On Lake Tarpon', 'Warwick Denver',
       'Western Inn-glacier Park', 'Wine Valley Lodge',
       'Wingate By Wyndham Pueblo'],
      dtype='object', name='name', length=275)

In [44]:
no_sent_hotel = grouped_hotels.index.difference(sentiments.index, sort=False)
no_sent_hotel

Index(['Abbeville Inn', 'Adria Motor Inn', 'Bailey Hotel', 'Chippewa Hotel',
       'Close For You', 'Club Quarters, Rockefeller Center',
       'Clubhouse At River Country Est', 'Concord Apartments By Boq Lodging',
       'Days Inn Newton', 'Days Inn Tallulah',
       'Holiday Inn Express Hotel & Suites Hillview', 'Hyatt Dulles',
       'La Playa', 'Marriott Springhill Marina', 'Old Wheeler Hotel',
       'Porto Vista Hotel', 'Relax Inn', 'Ritz Hotel', 'Spring Fountain Motel',
       'Springhill Suites By Marriott South Bend/mishawaka', 'Stratford Inn',
       'Studio 6', 'The Dec', 'Worldwide Hospitality Supply Company'],
      dtype='object', name='name')

In [50]:
# look up `no_sent_hotel` in original `hotels` df before preprocessing
no_sent_hotel.isin(hotels.name).sum()

24

In [51]:
# look up `no_sent_hotel` in original `reviews` df after preprocessing
no_sent_hotel.isin(reviews.name).sum()

0

We will notice that there are some hotels dropped out during preprocessing (from 299 it'll be 275, almost 24 hotels), so we won't index them in ElasticSearch beacuse of missing reviews & missing sentiments

In [55]:
grouped_reviews = reviews.groupby('name')
grouped_reviews.get_group('American Star Inn and Suites Atlantic City')

Unnamed: 0,name,lemmatized,classification,p_pos,p_neg
8200,American Star Inn and Suites Atlantic City,disgusting place stay place horrible bed scum ...,pos,0.646265,0.353735
8201,American Star Inn and Suites Atlantic City,bad motel stay room dirty food ice cream get m...,neg,0.356840,0.643160
8202,American Star Inn and Suites Atlantic City,clean affordable friendly manager desk nice he...,pos,0.734980,0.265020
8203,American Star Inn and Suites Atlantic City,quality room motel low end motel high cost fee...,neg,0.408191,0.591809
8204,American Star Inn and Suites Atlantic City,good value simple hotel fairly clean pleasant ...,pos,0.876419,0.123581
...,...,...,...,...,...
8261,American Star Inn and Suites Atlantic City,descent get room night price good pay,pos,0.747158,0.252842
8262,American Star Inn and Suites Atlantic City,descent staff rude ignorant,pos,0.951923,0.048077
8263,American Star Inn and Suites Atlantic City,descent bathroom bad,neg,0.483656,0.516344
8264,American Star Inn and Suites Atlantic City,bad manner unprofessional motel meet bad manne...,pos,0.866642,0.133358


In [57]:
grouped_reviews.get_group('Arion').head()

Unnamed: 0,name,lemmatized,classification,p_pos,p_neg
14164,Arion,de d mat mat,pos,0.75,0.25
14165,Arion,great hotel excellent service expect star hote...,pos,0.966563,0.033437
14166,Arion,autumn beauty autumn people summer resort like...,pos,0.886964,0.113036
14167,Arion,romantic weekend nice hotel spend couple day d...,pos,0.837684,0.162316
14168,Arion,romantic weekend nice hotel spend couple day d...,pos,0.837684,0.162316


In [64]:
filtered_hotels = sentiments.index.values
filtered_hotels[:5]

array(['A Bed & Breakfast In Cambridge', 'Acorn Motor Inn',
       'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City',
       'Americas Best Value Inn'], dtype=object)

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers


es_client = Elasticsearch(hosts=["http://localhost:9200"])
results =  es_client.indices
print(dir(results))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'add_block', 'analyze', 'clear_cache', 'client', 'clone', 'close', 'create', 'create_data_stream', 'data_streams_stats', 'delete', 'delete_alias', 'delete_data_stream', 'delete_index_template', 'delete_template', 'disk_usage', 'exists', 'exists_alias', 'exists_index_template', 'exists_template', 'exists_type', 'field_usage_stats', 'flush', 'flush_synced', 'forcemerge', 'freeze', 'get', 'get_alias', 'get_data_stream', 'get_field_mapping', 'get_index_template', 'get_mapping', 'get_settings', 'get_template', 'get_upgrade', 'migrate_to_data_stream', 'modify_data_stream', 'open', 'promote_data_stream', 'put_alias', 'put_index_template', 'put_mapping', 

In [2]:
def gendata():
    mywords = ['foo', 'bar', 'baz']
    for word in mywords:
        yield {
            "_index": "mywords",
            "word": word,
        }

helpers.bulk(es_client, gendata())

ConnectionTimeout: ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10))

In [7]:
helpers.scan(es_client,
    query={"query": {"match": {"title": "python"}}},
    index="myword",
    doc_type="word"
).hit()

AttributeError: 'generator' object has no attribute 'hit'

In [None]:
def prepare_doc(hotel_name:str) -> dict:
    """
        prepare hotel document for ES index
        
        Keyword arguments:
        argument -- hotel_name
        Return: python dict contains all info related to that hotel
    """
    hotel_dict = {
        
    }


    

In [82]:
import uuid

def doc_generator(df):
    df_iter = df.iterrows()
    for index, document in df_iter:
        yield {
                "_index": 'es_demo',
                "_doc_type": "_doc",
                "_id" : uuid.uuid4(),
                "_source": document,
            }
            

helpers.bulk(es_client, doc_generator(sentiments))

RuntimeError: generator raised StopIteration

In [92]:
es_client.index(index='es_demo', document={'name':'Fares', 'age':15}, id=2)

ConnectionTimeout: Connection timed out

In [99]:
response = es_client.index(
    index = 'employees',
    #  = 'person',
    id = uuid.uuid4(),
    document = {
        "name": "George Peterson",
        "sex": "male",
        "age": 34,
        "years": 10
    },
    request_timeout=45
)
print(response)


  response = es_client.index(


ConnectionTimeout: Connection timed out

In [98]:
from elasticsearch_dsl import Search

search = Search(using=es_client, index="employees")
for hit in search.scan():
    print(hit)

ApiError: ApiError(503, 'search_phase_execution_exception', None)

In [100]:
hits = es_client.count(index='es_demo', query={"match_all": {}})
print(hits)

ApiError: ApiError(503, 'search_phase_execution_exception', None)