# Elasticsearch Demo

In [1]:
import pandas as pd

# Load the hotel table, the preprocessed reviews and the aggregated
# per-hotel sentiments from their pickle files.
hotels, reviews, sentiments = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../Storage/Data/hotels.pkl',
        '../Storage/Data/preprocessed_reviews.pkl',
        '../Storage/Data/reviews_agg_sentiment.pkl',
    )
)

# Quick size check: one (rows, columns) tuple per frame.
tuple(frame.shape for frame in (hotels, reviews, sentiments))

((21420, 19), (21227, 5), (275, 2))

In [2]:
# Count the non-null review texts per hotel name; only the resulting index
# (the distinct hotel names) is inspected here.
grouped_hotels = hotels.groupby('name')[['reviews.text']].count()
grouped_hotels.index

Index(['A Bed & Breakfast In Cambridge', 'Abbeville Inn', 'Acorn Motor Inn',
       'Adria Motor Inn', 'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City', 'Americas Best Value Inn',
       'Americas Best Value Inn - Medical Center/airport',
       'Americas Best Value Inn and Suites', 'Americinn Coralville',
       ...
       'Tryp Dusseldorf Krefeld Hotel', 'Tulip Inn Turin West',
       'Una Hotel Forte Dei Marmi', 'Villa Carlotta',
       'Vista Hotel On Lake Tarpon', 'Warwick Denver',
       'Western Inn-glacier Park', 'Wine Valley Lodge',
       'Wingate By Wyndham Pueblo', 'Worldwide Hospitality Supply Company'],
      dtype='object', name='name', length=299)

In [3]:
# Hotel names that have an aggregated sentiment row (the frame is indexed by name).
sentiments.index

Index(['A Bed & Breakfast In Cambridge', 'Acorn Motor Inn',
       'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City', 'Americas Best Value Inn',
       'Americas Best Value Inn - Medical Center/airport',
       'Americas Best Value Inn and Suites', 'Americinn Coralville',
       'Americinn Lodge & Suites Appleton', 'Amisos Hotel',
       ...
       'Travelodge Ruther Glen', 'Tryp Dusseldorf Krefeld Hotel',
       'Tulip Inn Turin West', 'Una Hotel Forte Dei Marmi', 'Villa Carlotta',
       'Vista Hotel On Lake Tarpon', 'Warwick Denver',
       'Western Inn-glacier Park', 'Wine Valley Lodge',
       'Wingate By Wyndham Pueblo'],
      dtype='object', name='name', length=275)

In [4]:
# Hotel names found in the raw hotel table but missing from the sentiment
# table; sort=False asks pandas not to sort the result.
all_hotel_names = grouped_hotels.index
no_sent_hotel = all_hotel_names.difference(sentiments.index, sort=False)
no_sent_hotel

Index(['Abbeville Inn', 'Adria Motor Inn', 'Bailey Hotel', 'Chippewa Hotel',
       'Close For You', 'Club Quarters, Rockefeller Center',
       'Clubhouse At River Country Est', 'Concord Apartments By Boq Lodging',
       'Days Inn Newton', 'Days Inn Tallulah',
       'Holiday Inn Express Hotel & Suites Hillview', 'Hyatt Dulles',
       'La Playa', 'Marriott Springhill Marina', 'Old Wheeler Hotel',
       'Porto Vista Hotel', 'Relax Inn', 'Ritz Hotel', 'Spring Fountain Motel',
       'Springhill Suites By Marriott South Bend/mishawaka', 'Stratford Inn',
       'Studio 6', 'The Dec', 'Worldwide Hospitality Supply Company'],
      dtype='object', name='name')

In [5]:
# How many of the hotels without sentiments appear in the original `hotels`
# frame, i.e. before preprocessing?
no_sent_hotel.isin(hotels['name']).sum()

24

In [6]:
# How many of the hotels without sentiments appear in the `reviews` frame,
# i.e. after preprocessing?
no_sent_hotel.isin(reviews['name']).sum()

0

Notice that some hotels were dropped during preprocessing (299 → 275, i.e. exactly 24 hotels), so we won't index them in Elasticsearch because they have no reviews and no sentiments.

In [7]:
# Group the preprocessed reviews per hotel name, then inspect one hotel's rows.
grouped_reviews = reviews.groupby('name')
sample_hotel = 'American Star Inn and Suites Atlantic City'
grouped_reviews.get_group(sample_hotel)

Unnamed: 0,name,lemmatized,classification,p_pos,p_neg
8200,American Star Inn and Suites Atlantic City,disgusting place stay place horrible bed scum ...,pos,0.646265,0.353735
8201,American Star Inn and Suites Atlantic City,bad motel stay room dirty food ice cream get m...,neg,0.356840,0.643160
8202,American Star Inn and Suites Atlantic City,clean affordable friendly manager desk nice he...,pos,0.734980,0.265020
8203,American Star Inn and Suites Atlantic City,quality room motel low end motel high cost fee...,neg,0.408191,0.591809
8204,American Star Inn and Suites Atlantic City,good value simple hotel fairly clean pleasant ...,pos,0.876419,0.123581
...,...,...,...,...,...
8261,American Star Inn and Suites Atlantic City,descent get room night price good pay,pos,0.747158,0.252842
8262,American Star Inn and Suites Atlantic City,descent staff rude ignorant,pos,0.951923,0.048077
8263,American Star Inn and Suites Atlantic City,descent bathroom bad,neg,0.483656,0.516344
8264,American Star Inn and Suites Atlantic City,bad manner unprofessional motel meet bad manne...,pos,0.866642,0.133358


In [8]:
# First preprocessed reviews stored under the name 'Americas Best Value Inn'.
grouped_reviews.get_group('Americas Best Value Inn').head()

Unnamed: 0,name,lemmatized,classification,p_pos,p_neg
8890,Americas Best Value Inn,poor adequate chair 1night stand bed good show...,pos,0.61522,0.38478
8891,Americas Best Value Inn,d expect d expect expensive night s stay clean...,pos,0.844569,0.155431
8892,Americas Best Value Inn,surprising good value clean comfortable helpfu...,pos,0.709604,0.290396
8893,Americas Best Value Inn,hotel pretty bad clean lady naked use work cra...,neg,0.00056,0.99944
8894,Americas Best Value Inn,nice turning motel awful,neg,0.128,0.872


In [9]:
# Number of raw rows (and columns) stored under this hotel name.
is_abvi = hotels['name'].eq('Americas Best Value Inn')
hotels.loc[is_abvi].shape

(401, 19)

In [10]:
# Group the rows of one hotel name by every hotel-level column and show the
# first non-null numeric value of each group. More than one resulting row
# means the name alone does not identify a single hotel.
# `numeric_only=True` is spelled out on purpose: the first positional
# parameter of GroupBy.first() is `numeric_only`, not a row count, so the
# previous `.first(5)` was silently treated as a truthy flag.
hotels[hotels['name'] == 'Americas Best Value Inn'].groupby([
    'address', 'categories', 'city', 'country', 'latitude', 'longitude',
    'name', 'postalCode', 'province'
]).first(numeric_only=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,reviews.doRecommend,reviews.id,reviews.rating
address,categories,city,country,latitude,longitude,name,postalCode,province,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
125 Sloane Garden Rd,Hotels,Boiling Springs,US,35.011314,-81.954798,Americas Best Value Inn,29316,SC,,,2.0
1320 Harrisburg Pike,Hotels,Lancaster,US,40.05696,-76.33129,Americas Best Value Inn,17603,Manor Ridge,,,1.0
2512 W Lincolnway,Hotels,Cheyenne,US,41.11915,-104.84949,Americas Best Value Inn,82001,WY,,,1.0
2731 S Carson St,Hotels,Carson City,US,39.142715,-119.767824,Americas Best Value Inn,89701,Carson Colony,,,5.0
3080 E Colby St,Hotels,Whitehall,US,43.40897,-86.31742,Americas Best Value Inn,49461,MI,,,1.0
740 Broadway St,Hotels,Chico,US,39.72602,-121.83713,Americas Best Value Inn,95928,Chapmantown,,,1.0


Notice that a hotel name does not always map to a single combination of these columns: `['address', 'categories', 'city', 'country', 'latitude', 'longitude', 'name', 'postalCode', 'province']` — several distinct hotels can share the same name.

In [11]:
# Names of the hotels that have aggregated sentiments, as a NumPy array.
filtered_hotels = sentiments.index.to_numpy()
filtered_hotels[:5]

array(['A Bed & Breakfast In Cambridge', 'Acorn Motor Inn',
       'Ambassadors Inn and Suites',
       'American Star Inn and Suites Atlantic City',
       'Americas Best Value Inn'], dtype=object)

## ES Cloud (AWS) Demo

In [12]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from ssl import create_default_context
import configparser

# Read the connection settings from the shared ini file.
config = configparser.ConfigParser()
config.read('../configs.ini')
elastic_cfg = config['ELASTIC']

# SSL context built from the certificate file referenced in the config.
context = create_default_context(cafile=elastic_cfg['ssl_cert_path'])

# Deployment identifier, found in the 'Manage Deployment' page.
CLOUD_ID = elastic_cfg['cloud_id']

# Create the client instance.
connection_kwargs = dict(
    cloud_id=CLOUD_ID,
    http_auth=(elastic_cfg['username'], elastic_cfg['password']),
    scheme=elastic_cfg['scheme'],
    port=elastic_cfg['port'],
    ssl_context=context,
    timeout=1000,
)
es_client = Elasticsearch(**connection_kwargs)

# Round-trip to the cluster to confirm the connection works; a successful
# response looks like {'name': 'instance-0000000000', 'cluster_name': ...}
es_client.info()

{'name': 'instance-0000000000',
 'cluster_name': 'd64f20da8ed348069549b3b387c7eb58',
 'cluster_uuid': 'BGMBBfU9Rtquo6kqDNtG3A',
 'version': {'number': '8.2.3',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '9905bfb62a3f0b044948376b4f607f70a8a151b4',
  'build_date': '2022-06-08T22:21:36.455508792Z',
  'build_snapshot': False,
  'lucene_version': '9.1.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [13]:
# es_client.index(
#     index='lord-of-the-rings',
#     document={
#         'character': 'Aragon',
#         'quote': 'It is not this day.'
#     }
# )

OUTPUT:

    {'_index': 'lord-of-the-rings',
    '_id': 'HR4ydIEB6_mhCbAmtCZD',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1}


In [14]:
# es_client.index(
#     index='lord-of-the-rings',
#     document={
#         'character': 'Gandalf',
#         'quote': 'A wizard is never late, nor is he early.'
#     }
# )

OUTPUT:

    {'_index': 'lord-of-the-rings',
    '_id': 'Hh4zdIEB6_mhCbAmiibJ',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 2, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1} 

In [15]:
# es_client.index(
#     index='lord-of-the-rings',
#     document={
#         'character': 'Frodo Baggins',
#         'quote': 'You are late'
#     }
# )


OUTPUT:

    {'_index': 'lord-of-the-rings',
    '_id': 'Hx4zdIEB6_mhCbAm4Ca1',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 2, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1}

In [16]:
# Refresh the demo index before running the searches below.
# NOTE(review): the three indexing cells above are commented out, so this
# relies on 'lord-of-the-rings' already existing in the cluster.
es_client.indices.refresh(index='lord-of-the-rings')

{'_shards': {'total': 2, 'successful': 2, 'failed': 0}}

In [17]:
# Full-text match on the `quote` field for the term 'late'.
late_quotes_query = {'match': {'quote': 'late'}}
result = es_client.search(index='lord-of-the-rings', query=late_quotes_query)

result['hits']['hits']


[{'_index': 'lord-of-the-rings',
  '_id': 'Hx4zdIEB6_mhCbAm4Ca1',
  '_score': 0.5820575,
  '_source': {'character': 'Frodo Baggins', 'quote': 'You are late'}},
 {'_index': 'lord-of-the-rings',
  '_id': 'Hh4zdIEB6_mhCbAmiibJ',
  '_score': 0.37883914,
  '_source': {'character': 'Gandalf',
   'quote': 'A wizard is never late, nor is he early.'}}]

In [18]:
# `match_all` applies no filter, so every document in the index matches.
match_all_query = {'match_all': {}}
result = es_client.search(index='lord-of-the-rings', query=match_all_query)

result['hits']['hits']


[{'_index': 'lord-of-the-rings',
  '_id': 'HR4ydIEB6_mhCbAmtCZD',
  '_score': 1.0,
  '_source': {'character': 'Aragon', 'quote': 'It is not this day.'}},
 {'_index': 'lord-of-the-rings',
  '_id': 'Hh4zdIEB6_mhCbAmiibJ',
  '_score': 1.0,
  '_source': {'character': 'Gandalf',
   'quote': 'A wizard is never late, nor is he early.'}},
 {'_index': 'lord-of-the-rings',
  '_id': 'Hx4zdIEB6_mhCbAm4Ca1',
  '_score': 1.0,
  '_source': {'character': 'Frodo Baggins', 'quote': 'You are late'}}]

## Convert DataFrame to Python Dictionary

In [19]:
# Example hotel name; not referenced by the other cells shown in this notebook.
hotel_name = "Arion"

# Hotel-level columns.
hotel_cols = [
    'address',
    'categories',
    'city',
    'country',
    'latitude',
    'longitude',
    'postalCode',
    'province',
]

# Review-level columns taken from the raw hotel table.
review_cols = [
    'reviews.date',
    'reviews.dateAdded',
    'reviews.doRecommend',
    'reviews.id',
    'reviews.rating',
    'reviews.text',
    'reviews.title',
    'reviews.userCity',
    'reviews.username',
    'reviews.userProvince',
]

# Per-review sentiment columns taken from the preprocessed reviews.
review_sentiment_cols = [
    'lemmatized',
    'classification',
    'p_pos',
    'p_neg',
]

# Per-hotel aggregated sentiment columns.
agg_sentiment_cols = [
    'p_pos_mean',
    'p_neg_mean',
]


Join them all together after loading the preprocessed versions.

In [20]:
# Step 1: index-aligned inner join of the raw hotel rows with the per-review
#         sentiment columns (the duplicate `name` column is dropped first).
# Step 2: keep only the columns of interest and re-index by hotel name.
# Step 3: inner join with the per-hotel aggregated sentiments on that name.
selected_cols = ['name'] + hotel_cols + review_cols + review_sentiment_cols
reviews_without_name = reviews.drop(columns=['name'])

final_df = (
    hotels
    .join(reviews_without_name, how='inner')[selected_cols]
    .set_index('name')
    .join(sentiments, how='inner')
)

final_df.head(3)


Unnamed: 0_level_0,address,categories,city,country,latitude,longitude,postalCode,province,reviews.date,reviews.dateAdded,...,reviews.title,reviews.userCity,reviews.username,reviews.userProvince,lemmatized,classification,p_pos,p_neg,p_pos_mean,p_neg_mean
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Bed & Breakfast In Cambridge,1657 Cambridge St,Hotels,Cambridge,US,42.374972,-71.110408,2138,MA,2014-09-01T00:00:00Z,2016-11-03T22:07:49Z,...,The host is great,,David,,host great breakfast good good thing stay host...,neg,0.007476,0.992524,0.638919,0.361081
A Bed & Breakfast In Cambridge,1657 Cambridge St,Hotels,Cambridge,US,42.374972,-71.110408,2138,MA,2015-07-24T00:00:00Z,2016-11-03T22:07:49Z,...,Bed & Breakfast Stay,,A Traveler,,bed breakfast stay nice close square little ge...,pos,0.832994,0.167006,0.638919,0.361081
A Bed & Breakfast In Cambridge,1657 Cambridge St,Hotels,Cambridge,US,42.374972,-71.110408,2138,MA,2015-10-10T00:00:00Z,2016-11-03T22:07:49Z,...,We will never stay here again.,,A Traveler,,stay poor excuse,neg,0.184476,0.815524,0.638919,0.361081


In [60]:
# Parse both review date columns into pandas datetimes (overwrites the columns
# of `final_df` in place).
for date_col in ('reviews.date', 'reviews.dateAdded'):
    final_df[date_col] = pd.to_datetime(final_df[date_col])


In [69]:
# One row per (hotel columns + aggregated sentiments) group; that group's
# reviews are serialised to a JSON string, and the whole table is then written
# to disk as JSON.
# NOTE(review): the inner to_json() yields a string, so the reviews end up as
# JSON text embedded inside the outer JSON file - confirm this is intended.
json_group_keys = ['name'] + hotel_cols + agg_sentiment_cols
json_review_fields = review_cols + review_sentiment_cols

docs_table = (
    final_df.reset_index()
    .groupby(json_group_keys)[json_review_fields]
    .apply(lambda group: group.T.to_json())
    .reset_index()
)
docs_table.T.to_json('../Storage/Data/docs.json')


In [70]:
doc_group_keys = ['name'] + hotel_cols + agg_sentiment_cols
doc_review_fields = review_cols + review_sentiment_cols


def _reviews_as_dict(group):
    """Return one group's reviews as {row label: {field: value}}, with NaN replaced by ''."""
    return group.fillna("").T.to_dict()


# One dictionary per group, keyed by its position in the grouped table.
# The nested reviews sit under key 0, the default label of the unnamed
# apply() result.
all_docs = (
    final_df.reset_index()
    .groupby(doc_group_keys)[doc_review_fields]
    .apply(_reviews_as_dict)
    .reset_index()
    .T.to_dict()
)

# Reviews of the first document.
all_docs[0][0]

{0: {'reviews.date': Timestamp('2014-09-01 00:00:00+0000', tz='UTC'),
  'reviews.dateAdded': Timestamp('2016-11-03 22:07:49+0000', tz='UTC'),
  'reviews.doRecommend': '',
  'reviews.id': '',
  'reviews.rating': 4.0,
  'reviews.text': 'Breakfast was the best, and the best thing about the stay was the one and only host, great conversations at breakfast and advise on the city. Very nice neighborhood. Only reason that it did not get 5 stars is that the room has room for improvement, not any fault of the host. This is bed and breakfast, but not your honeymoon style BB, I would stay here again. Room comfort could improve with a new air conditioning unit, other then that five stars. Sharing the bathroom, never a problem, and coming in and out with you wanted never a problem. No need to rent a car, easy with public transportation.',
  'reviews.title': 'The host is great',
  'reviews.userCity': '',
  'reviews.username': 'David',
  'reviews.userProvince': '',
  'lemmatized': 'host great breakfas

### Indexing Preprocessed Data In ElasticSearch 

In [None]:
import uuid


HOTEL_INDEX_NAME = 'demo-hotels-1'

# Start from a fresh, empty index: drop any previous copy, then recreate it.
if es_client.indices.exists(index=HOTEL_INDEX_NAME):
    es_client.indices.delete(index=HOTEL_INDEX_NAME)
es_client.indices.create(index=HOTEL_INDEX_NAME)

# Raise the mapping field limit to avoid the total_fields.limit exception.
# NOTE(review): presumably needed because each review is stored under its own
# key inside the document, so every review adds new fields - confirm.
es_client.indices.put_settings(
    index=HOTEL_INDEX_NAME,
    body={
        "index.mapping.total_fields.limit": 100000,
    },
    request_timeout=10000,
    timeout='10000s'
)

# Index every document (hotel columns, aggregated sentiments and the nested
# reviews) through the bulk helper instead of one HTTP request per document.
# Each action names its target index and carries the document as `_source`;
# Elasticsearch generates the `_id`.
bulk_actions = (
    {"_index": HOTEL_INDEX_NAME, "_source": doc}
    for doc in all_docs.values()
)
# helpers.bulk raises on failed actions by default and returns
# (number of successful actions, list of errors).
indexed_count, bulk_errors = helpers.bulk(es_client, bulk_actions)

# Make the new documents visible to the count/search cells that follow.
es_client.indices.refresh(index=HOTEL_INDEX_NAME)

indexed_count



In [72]:
# Number of documents in the hotel index (no query, so nothing is filtered).
result = es_client.count(index=HOTEL_INDEX_NAME)

result['count']


52

In [None]:
# Query-string search for documents whose `province` is not MA.
not_ma_query = {
    "query_string": {
        "query": "NOT (province: MA)"
    }
}
result = es_client.search(index=HOTEL_INDEX_NAME, query=not_ma_query)

result['hits']['hits']


---


### DataFrame MultiColumn Indexing
For a more logical representation

In [None]:
# Pair every column with its top-level group ('hotel', 'review' or
# 'sentiment'), in the same order as the columns of `final_df`.
col_indexes = (
    [('hotel', col) for col in hotel_cols]
    + [('review', col) for col in review_cols + review_sentiment_cols]
    + [('sentiment', col) for col in agg_sentiment_cols]
)

pd.MultiIndex.from_tuples(col_indexes)

In [None]:
# Replace the flat column labels of `final_df` with the two-level
# (group, column) labels built above. This modifies `final_df` in place.
multi_columns = pd.MultiIndex.from_tuples(col_indexes)
final_df.columns = multi_columns
final_df.head()

In [None]:
# import uuid

# def doc_generator(df):
#     df_iter = df.iterrows()
#     for index, document in df_iter:
#         yield {
#                 "_index": 'es_demo',
#                 "_doc_type": "_doc",
#                 "_id" : uuid.uuid4(),
#                 "_source": document,
#             }
            

# helpers.bulk(es_client, doc_generator(sentiments))