In [3]:
import os

import pandas as pd
from elasticsearch import Elasticsearch
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Connection settings.
# Both values can be overridden through environment variables so the notebook
# runs on other machines without editing code. The fallbacks reproduce the
# original hard-coded values.
# NOTE(review): the fallback URL still embeds a username/password and the CA
# path is specific to one workstation -- prefer exporting ES_HOSTS and
# ES_CA_CERTS and removing these defaults before sharing the notebook.
ES_HOSTS = os.environ.get(
    "ES_HOSTS", "https://elastic:datascientest@localhost:9200"
)
ES_CA_CERTS = os.environ.get(
    "ES_CA_CERTS",
    "/Users/metka/Desktop/DST/SupplyChain/elasticsearch/ca/ca.crt",
)

# Connection to the cluster
es = Elasticsearch(hosts=ES_HOSTS, ca_certs=ES_CA_CERTS)

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define the search query to retrieve all documents.
# "size" is the number of documents requested per page; the paginated cells
# combine it with a `from_` offset to walk the whole index.
search_query = {
    "query": {
        "match_all": {}
    },
    "size": 100  # Number of documents per page
}


  response = es.search(index="reviews", body=search_query, from_=(page - 1) * 100)


In [6]:
#OUTPUT WITHOUT IDs

# Take the page size from the query definition so the pagination offset can
# never drift from the number of hits actually requested per page.
page_size = search_query["size"]

# One record (dict) per review that has non-empty text
data_list = []

# Paginated search: request successive pages until one comes back empty.
# NOTE(review): from/size paging is used without an explicit sort, and
# Elasticsearch rejects requests where from + size exceeds
# index.max_result_window (10,000 by default) -- consider search_after or a
# scroll/scan helper if the index grows beyond that.
page = 1
while True:
    # Expand the query definition into keyword arguments (query=..., size=...)
    # rather than passing it through `body=`, which is deprecated in recent
    # elasticsearch-py releases.
    response = es.search(
        index="reviews",
        from_=(page - 1) * page_size,
        **search_query,
    )

    hits = response["hits"]["hits"]
    if not hits:
        break

    for hit in hits:
        source = hit.get("_source", {})
        review_text = source.get("review_text", "")

        # Reviews without text are skipped: there is nothing to score.
        if review_text:
            sentiment_scores = analyzer.polarity_scores(review_text)

            # Bucket the compound score into a label using the +/-0.05 cut-offs.
            compound_score = sentiment_scores["compound"]
            if compound_score >= 0.05:
                sentiment_label = "Positive"
            elif compound_score <= -0.05:
                sentiment_label = "Negative"
            else:
                sentiment_label = "Neutral"

            data_list.append({
                "Company Name": source.get("company_name", ""),
                "Review Text": review_text,
                "Sentiment Scores": compound_score,
                "Sentiment Label": sentiment_label
            })

    page += 1

# Create DataFrame
df = pd.DataFrame(data_list)

# Output DataFrame to a CSV file
#df.to_csv("reviews_sentiments.csv", index=False)

print(df)

  response = es.search(index="reviews", body=search_query, from_=(page - 1) * 100)


                              Company Name  \
0                                 PNC Bank   
1                                 PNC Bank   
2                                 PNC Bank   
3                                 PNC Bank   
4                                 PNC Bank   
...                                    ...   
1476  Heritage Valley Federal Credit Union   
1477  Heritage Valley Federal Credit Union   
1478  Heritage Valley Federal Credit Union   
1479                        The Payment HQ   
1480                        The Payment HQ   

                                            Review Text  Sentiment Scores  \
0     After opening an account my experience has bee...           -0.3400   
1     If I could give them no stars I WOULD !!! DO N...            0.9552   
2     High Yield Savings application could not be ap...           -0.8313   
3     I loved my local branch, all the employees  we...            0.8057   
4     I went to get a cashier's check for my parents...       

In [5]:
#OUTPUT WITH IDs

# Take the page size from the query definition so the pagination offset can
# never drift from the number of hits actually requested per page.
page_size = search_query["size"]

# Parallel lists, one entry per review that has non-empty text
review_ids = []
company_ids = []
company_names = []
review_texts = []
compound_scores = []
sentiment_labels = []

# Paginated search: request successive pages until one comes back empty.
# NOTE(review): from/size paging is used without an explicit sort, and
# Elasticsearch rejects requests where from + size exceeds
# index.max_result_window (10,000 by default) -- consider search_after or a
# scroll/scan helper if the index grows beyond that.
page = 1
while True:
    # Expand the query definition into keyword arguments (query=..., size=...)
    # rather than passing it through `body=`, which is deprecated in recent
    # elasticsearch-py releases.
    response = es.search(
        index="reviews",
        from_=(page - 1) * page_size,
        **search_query,
    )

    hits = response["hits"]["hits"]
    if not hits:
        break

    for hit in hits:
        source = hit.get("_source", {})
        review_id = source.get("review_id", "")
        company_id = source.get("company_id", "")
        company_name = source.get("company_name", "")
        review_text = source.get("review_text", "")

        # Reviews without text are skipped: there is nothing to score.
        if review_text:
            sentiment_scores = analyzer.polarity_scores(review_text)

            # Bucket the compound score into a label using the +/-0.05 cut-offs.
            compound_score = sentiment_scores["compound"]
            if compound_score >= 0.05:
                sentiment_label = "Positive"
            elif compound_score <= -0.05:
                sentiment_label = "Negative"
            else:
                sentiment_label = "Neutral"

            # Append to every list together so the columns stay aligned.
            review_ids.append(review_id)
            company_ids.append(company_id)
            company_names.append(company_name)
            review_texts.append(review_text)
            compound_scores.append(compound_score)
            sentiment_labels.append(sentiment_label)

    page += 1

# Create data_all dictionary (column name -> list of values)
data_all = {
    "Review ID": review_ids,
    "Company ID": company_ids,
    "Company Name": company_names,
    "Review Text": review_texts,
    "Sentiment Scores": compound_scores,
    "Sentiment Label": sentiment_labels
}

# Create DataFrame
df_all = pd.DataFrame(data_all)

# Output DataFrame to a CSV file
#df_all.to_csv("all_reviews_sentiments.csv", index=False)

# Print DataFrame
print(df_all)

  response = es.search(index="reviews", body=search_query, from_=(page - 1) * 100)


      Review ID  Company ID                          Company Name  \
0             0        1118                              PNC Bank   
1             1        1118                              PNC Bank   
2             2        1118                              PNC Bank   
3             3        1118                              PNC Bank   
4             4        1118                              PNC Bank   
...         ...         ...                                   ...   
1476       1648         114  Heritage Valley Federal Credit Union   
1477       1649         114  Heritage Valley Federal Credit Union   
1478       1650         114  Heritage Valley Federal Credit Union   
1479       1651         115                        The Payment HQ   
1480       1652         115                        The Payment HQ   

                                            Review Text  Sentiment Scores  \
0     After opening an account my experience has bee...           -0.3400   
1     If I could 