# Elasticsearch Demo

In [1]:
import pandas as pd

# Load the three pickled DataFrames used throughout the notebook: the raw
# hotel listing, the preprocessed reviews and the aggregated sentiments.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
hotels, reviews, sentiments = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../Storage/Data/hotels.pkl',
        '../Storage/Data/preprocessed_reviews.pkl',
        '../Storage/Data/reviews_agg_sentiment.pkl',
    )
)

# Display the (rows, columns) shape of each frame, in load order.
hotels.shape, reviews.shape, sentiments.shape

((21420, 19), (21227, 5), (275, 2))

In [2]:
# Count the non-null review texts per hotel name; the resulting index holds
# one entry per distinct hotel name found in the raw `hotels` frame.
grouped_hotels = hotels.groupby('name')[['reviews.text']].count()
grouped_hotels.index

Index(['A Bed & Breakfast In Cambridge', 'Abbeville Inn', 'Acorn Motor Inn',
       'Adria Motor Inn', 'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City', 'Americas Best Value Inn',
       'Americas Best Value Inn - Medical Center/airport',
       'Americas Best Value Inn and Suites', 'Americinn Coralville',
       ...
       'Tryp Dusseldorf Krefeld Hotel', 'Tulip Inn Turin West',
       'Una Hotel Forte Dei Marmi', 'Villa Carlotta',
       'Vista Hotel On Lake Tarpon', 'Warwick Denver',
       'Western Inn-glacier Park', 'Wine Valley Lodge',
       'Wingate By Wyndham Pueblo', 'Worldwide Hospitality Supply Company'],
      dtype='object', name='name', length=299)

In [3]:
# Hotel names that have an aggregated sentiment row (the frame is indexed by `name`).
sentiments.index

Index(['A Bed & Breakfast In Cambridge', 'Acorn Motor Inn',
       'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City', 'Americas Best Value Inn',
       'Americas Best Value Inn - Medical Center/airport',
       'Americas Best Value Inn and Suites', 'Americinn Coralville',
       'Americinn Lodge & Suites Appleton', 'Amisos Hotel',
       ...
       'Travelodge Ruther Glen', 'Tryp Dusseldorf Krefeld Hotel',
       'Tulip Inn Turin West', 'Una Hotel Forte Dei Marmi', 'Villa Carlotta',
       'Vista Hotel On Lake Tarpon', 'Warwick Denver',
       'Western Inn-glacier Park', 'Wine Valley Lodge',
       'Wingate By Wyndham Pueblo'],
      dtype='object', name='name', length=275)

In [4]:
# Hotel names that appear in the raw hotel data but have no row in
# `sentiments`; sort=False leaves the result unsorted.
no_sent_hotel = grouped_hotels.index.difference(sentiments.index, sort=False)
no_sent_hotel

Index(['Abbeville Inn', 'Adria Motor Inn', 'Bailey Hotel', 'Chippewa Hotel',
       'Close For You', 'Club Quarters, Rockefeller Center',
       'Clubhouse At River Country Est', 'Concord Apartments By Boq Lodging',
       'Days Inn Newton', 'Days Inn Tallulah',
       'Holiday Inn Express Hotel & Suites Hillview', 'Hyatt Dulles',
       'La Playa', 'Marriott Springhill Marina', 'Old Wheeler Hotel',
       'Porto Vista Hotel', 'Relax Inn', 'Ritz Hotel', 'Spring Fountain Motel',
       'Springhill Suites By Marriott South Bend/mishawaka', 'Stratford Inn',
       'Studio 6', 'The Dec', 'Worldwide Hospitality Supply Company'],
      dtype='object', name='name')

In [5]:
# Count how many of the `no_sent_hotel` names exist in the raw `hotels` df
# (i.e. before preprocessing).
no_sent_hotel.isin(hotels.name).sum()

24

In [6]:
# Count how many of the `no_sent_hotel` names still exist in the preprocessed
# `reviews` df (i.e. after preprocessing).
no_sent_hotel.isin(reviews.name).sum()

0

Note that 24 hotels were dropped during preprocessing (299 hotels before, 275 after), so we won't index them in Elasticsearch because they have no reviews and no sentiment scores.

In [7]:
# Group the preprocessed reviews by hotel name so a single hotel's rows can
# be pulled out with `get_group`, then show every review row for one hotel.
grouped_reviews = reviews.groupby('name')
grouped_reviews.get_group('American Star Inn and Suites Atlantic City')

Unnamed: 0,name,lemmatized,classification,p_pos,p_neg
8200,American Star Inn and Suites Atlantic City,disgusting place stay place horrible bed scum ...,pos,0.646265,0.353735
8201,American Star Inn and Suites Atlantic City,bad motel stay room dirty food ice cream get m...,neg,0.356840,0.643160
8202,American Star Inn and Suites Atlantic City,clean affordable friendly manager desk nice he...,pos,0.734980,0.265020
8203,American Star Inn and Suites Atlantic City,quality room motel low end motel high cost fee...,neg,0.408191,0.591809
8204,American Star Inn and Suites Atlantic City,good value simple hotel fairly clean pleasant ...,pos,0.876419,0.123581
...,...,...,...,...,...
8261,American Star Inn and Suites Atlantic City,descent get room night price good pay,pos,0.747158,0.252842
8262,American Star Inn and Suites Atlantic City,descent staff rude ignorant,pos,0.951923,0.048077
8263,American Star Inn and Suites Atlantic City,descent bathroom bad,neg,0.483656,0.516344
8264,American Star Inn and Suites Atlantic City,bad manner unprofessional motel meet bad manne...,pos,0.866642,0.133358


In [8]:
# Preview the first review rows of another hotel group.
grouped_reviews.get_group('Arion').head()

Unnamed: 0,name,lemmatized,classification,p_pos,p_neg
14164,Arion,de d mat mat,pos,0.75,0.25
14165,Arion,great hotel excellent service expect star hote...,pos,0.966563,0.033437
14166,Arion,autumn beauty autumn people summer resort like...,pos,0.886964,0.113036
14167,Arion,romantic weekend nice hotel spend couple day d...,pos,0.837684,0.162316
14168,Arion,romantic weekend nice hotel spend couple day d...,pos,0.837684,0.162316


In [9]:
# Names of the hotels that have sentiment data, as a NumPy array; peek at
# the first five entries.
filtered_hotels = sentiments.index.to_numpy()
filtered_hotels[:5]

array(['A Bed & Breakfast In Cambridge', 'Acorn Motor Inn',
       'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City',
       'Americas Best Value Inn'], dtype=object)

In [44]:
from elasticsearch import Elasticsearch
from ssl import create_default_context
import configparser

# Read the shared INI file; every connection setting used below lives in its
# [ELASTIC] section.
config = configparser.ConfigParser()
config.read('../configs.ini')
elastic_cfg = config['ELASTIC']

# TLS context that trusts the CA bundle referenced by the config.
context = create_default_context(cafile=elastic_cfg['ssl_cert_path'])

# Deployment identifier, found on the 'Manage Deployment' page.
CLOUD_ID = elastic_cfg['cloud_id']

# Build the client instance. Credentials come from the config file so they
# never appear in the notebook itself.
# NOTE(review): configparser returns strings, so `port` is passed as a str --
# confirm the installed client version accepts that.
es_client = Elasticsearch(
    cloud_id=CLOUD_ID,
    http_auth=(elastic_cfg['username'], elastic_cfg['password']),
    scheme=elastic_cfg['scheme'],
    port=elastic_cfg['port'],
    ssl_context=context,
)

# Connectivity check: a successful call returns the node name, cluster name
# and version details.
es_client.info()

{'name': 'instance-0000000001',
 'cluster_name': 'd64f20da8ed348069549b3b387c7eb58',
 'cluster_uuid': 'BGMBBfU9Rtquo6kqDNtG3A',
 'version': {'number': '8.2.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '9905bfb62a3f0b044948376b4f607f70a8a151b4',
  'build_date': '2022-06-08T22:21:36.455508792Z',
  'build_snapshot': False,
  'lucene_version': '9.1.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [48]:
# Get Index API: fetch the definition of every index matching the "*" wildcard.
# NOTE(review): the response is large -- it carries the full aliases/mappings
# of each index, including system data streams -- consider displaying only
# the index names.
es_client.indices.get(index="*")

{'.ds-logs-enterprise_search.api-default-2022.06.17-000001': {'aliases': {},
  'mappings': {'_meta': {'version': '1.7.0'},
   '_data_stream_timestamp': {'enabled': True},
   'dynamic_templates': [{'match_ip': {'match': 'ip',
      'match_mapping_type': 'string',
      'mapping': {'type': 'ip'}}},
    {'match_message': {'match': 'message',
      'match_mapping_type': 'string',
      'mapping': {'type': 'match_only_text'}}},
    {'strings_as_keyword': {'match_mapping_type': 'string',
      'mapping': {'ignore_above': 1024, 'type': 'keyword'}}}],
   'date_detection': False,
   'properties': {'@timestamp': {'type': 'date'},
    'agent': {'properties': {'build': {'properties': {'original': {'type': 'keyword',
         'ignore_above': 1024}}},
      'ephemeral_id': {'type': 'keyword', 'ignore_above': 1024},
      'hostname': {'type': 'keyword', 'ignore_above': 1024},
      'id': {'type': 'keyword', 'ignore_above': 1024},
      'name': {'type': 'keyword', 'ignore_above': 1024},
      'type': 

In [49]:
# Index a first sample document; the 'lord-of-the-rings' index is the target
# and the response reports the generated `_id` and the write result.
es_client.index(
    index='lord-of-the-rings',
    document=dict(character='Aragon', quote='It is not this day.'),
)


{'_index': 'lord-of-the-rings',
 '_id': 'HR4ydIEB6_mhCbAmtCZD',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [50]:
# Index a second sample document into the same 'lord-of-the-rings' index.
es_client.index(
    index='lord-of-the-rings',
    document=dict(
        character='Gandalf',
        quote='A wizard is never late, nor is he early.',
    ),
)


{'_index': 'lord-of-the-rings',
 '_id': 'Hh4zdIEB6_mhCbAmiibJ',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

In [51]:
# Index a third sample document into the same 'lord-of-the-rings' index.
es_client.index(
    index='lord-of-the-rings',
    document=dict(character='Frodo Baggins', quote='You are late'),
)


{'_index': 'lord-of-the-rings',
 '_id': 'Hx4zdIEB6_mhCbAm4Ca1',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 2,
 '_primary_term': 1}

In [53]:
# Refresh the index before querying it (per the Elasticsearch refresh API,
# this makes recently indexed documents visible to search).
es_client.indices.refresh(index='lord-of-the-rings')

{'_shards': {'total': 2, 'successful': 2, 'failed': 0}}

In [59]:
# Full-text search: find documents whose `quote` field matches the term 'late'.
# `es_client` is the client created in the connection cell; it is the only
# Elasticsearch client object defined in this notebook.
result = es_client.search(
    index='lord-of-the-rings',
    query={
        'match': {'quote': 'late'}
    },
)

# Show only the matching documents (each hit carries `_id`, `_score` and `_source`).
result['hits']['hits']

[{'_index': 'lord-of-the-rings',
  '_id': 'Hx4zdIEB6_mhCbAm4Ca1',
  '_score': 0.5820575,
  '_source': {'character': 'Frodo Baggins', 'quote': 'You are late'}},
 {'_index': 'lord-of-the-rings',
  '_id': 'Hh4zdIEB6_mhCbAmiibJ',
  '_score': 0.37883914,
  '_source': {'character': 'Gandalf',
   'quote': 'A wizard is never late, nor is he early.'}}]

In [60]:
# Retrieve the documents of the index with a `match_all` query (no filtering,
# so every hit gets the same score).
match_everything = {'match_all': {}}
result = es_client.search(index='lord-of-the-rings', query=match_everything)

# Show only the returned documents.
result['hits']['hits']

[{'_index': 'lord-of-the-rings',
  '_id': 'HR4ydIEB6_mhCbAmtCZD',
  '_score': 1.0,
  '_source': {'character': 'Aragon', 'quote': 'It is not this day.'}},
 {'_index': 'lord-of-the-rings',
  '_id': 'Hh4zdIEB6_mhCbAmiibJ',
  '_score': 1.0,
  '_source': {'character': 'Gandalf',
   'quote': 'A wizard is never late, nor is he early.'}},
 {'_index': 'lord-of-the-rings',
  '_id': 'Hx4zdIEB6_mhCbAm4Ca1',
  '_score': 1.0,
  '_source': {'character': 'Frodo Baggins', 'quote': 'You are late'}}]

In [46]:
# TODO: unfinished draft -- `prepare_doc` is meant to build the per-hotel
# document for the ES index, but its body is still empty. Implement it or
# remove this cell before sharing the notebook.
# def prepare_doc(hotel_name:str) -> dict:
#     """
#         prepare hotel document for ES index
        
#         Keyword arguments:
#         argument -- hotel_name
#         Return: python dict contains all info related to that hotel
#     """
#     hotel_dict = {
        
#     }


    

In [47]:
# TODO(review): draft bulk-indexing sketch, not runnable as written:
#   - `helpers` is never imported in this notebook (presumably
#     `from elasticsearch import helpers` is intended -- confirm).
#   - `_doc_type` does not look like a bulk action key, `uuid.uuid4()` is a
#     UUID object rather than a string, and `document` is a row yielded by
#     `iterrows()` rather than a plain dict -- verify against the bulk helper docs.
# import uuid

# def doc_generator(df):
#     df_iter = df.iterrows()
#     for index, document in df_iter:
#         yield {
#                 "_index": 'es_demo',
#                 "_doc_type": "_doc",
#                 "_id" : uuid.uuid4(),
#                 "_source": document,
#             }
            

# helpers.bulk(es_client, doc_generator(sentiments))