In [9]:
from elasticsearch import Elasticsearch
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

# Connection to the cluster
es = Elasticsearch(hosts="https://elastic:datascientest@localhost:9200",
                   ca_certs="/Users/metka/Desktop/DST/SupplyChain/elasticsearch/ca/ca.crt")

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Define the search query to retrieve all documents
search_query = {
    "query": {
        "match_all": {}
    },
    "size": 100  # Number of documents per page
}


In [12]:
#OUTPUT ALL REVIEWS WITH SENTIMENT

# Lists to store data
review_ids = []
company_ids = []
company_names = []
review_texts = []
compound_scores = []
sentiment_labels = []

# Paginated search
page = 1
while True:
    response = es.search(index="atm_reviews", body=search_query, from_=(page - 1) * 100)
    
    if not response["hits"]["hits"]:
        break
    
    for hit in response["hits"]["hits"]:
        source = hit.get("_source", {})
        review_id = source.get("review_id", "")
        company_id = source.get("company_id", "")
        company_name = source.get("company_name", "")
        review_text = source.get("review_text", "")
        
        if review_text:
            sentiment_scores = analyzer.polarity_scores(review_text)
        
            compound_score = sentiment_scores["compound"]
            if compound_score >= 0.05:
                sentiment_label = "Positive"
            elif compound_score <= -0.05:
                sentiment_label = "Negative"
            else:
                sentiment_label = "Neutral"
            
            review_ids.append(review_id)
            company_ids.append(company_id)
            company_names.append(company_name)
            review_texts.append(review_text)
            compound_scores.append(compound_score)
            sentiment_labels.append(sentiment_label)

    page += 1

# Create data_all dictionary
data_all = {
    "Review ID": review_ids,
    "Company ID": company_ids,
    "Company Name": company_names,
    "Review Text": review_texts,
    "Sentiment Scores": compound_scores,
    "Sentiment Label": sentiment_labels
}

# Create DataFrame
df_all = pd.DataFrame(data_all)

# Output DataFrame to a CSV file
#df_all.to_csv("all_reviews_sentiments.csv", index=False)

# Print DataFrame
print(df_all)

  response = es.search(index="atm_reviews", body=search_query, from_=(page - 1) * 100)


     Review ID Company ID                          Company Name  \
0            0       1118                              PNC Bank   
1            1       1118                              PNC Bank   
2            2       1118                              PNC Bank   
3            3       1118                              PNC Bank   
4            4       1118                              PNC Bank   
...        ...        ...                                   ...   
6911      1648        114  Heritage Valley Federal Credit Union   
6912      1649        114  Heritage Valley Federal Credit Union   
6913      1650        114  Heritage Valley Federal Credit Union   
6914      1651        115                        The Payment HQ   
6915      1652        115                        The Payment HQ   

                                            Review Text  Sentiment Scores  \
0     After opening an account my experience has bee...           -0.3400   
1     If I could give them no stars I WOU

In [13]:
# COUNT MOST USED WORDS 

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter


# Tokenize and process reviews to find most common words
nltk.download('punkt')
nltk.download('stopwords')

# Combine all review texts into one string
all_reviews_text = ' '.join(df_all['Review Text'])

# Tokenize words
words = word_tokenize(all_reviews_text)

# Remove stopwords
stop_words = set(stopwords.words("english"))
filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 2]

# Count the frequency of words
word_counts = Counter(filtered_words)

# Get the most common words (change the number as needed)
most_common_words = word_counts.most_common(20)

# Print the most common words
print("Most common words in reviews:")
for word, count in most_common_words:
    print(f"{word}: {count}")

[nltk_data] Downloading package punkt to /Users/metka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/metka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Most common words in reviews:
bank: 5324
account: 5030
PNC: 5002
money: 2792
n't: 2754
customer: 2602
service: 2518
get: 2514
would: 1858
card: 1844
time: 1702
business: 1554
check: 1392
one: 1374
back: 1284
never: 1278
call: 1258
credit: 1212
told: 1200
could: 1184


In [18]:
#CREATE WORD BUCKETS AND OUTPUT WORD ANALYSIS

# Define the word buckets
word_buckets = {
    "bank account": ["bank", "account"],
    "customer service": ["customer", "service"],
    "credits": ["loan", "credit"]
}

# Initialize dictionary to store values for the new DataFrame
new_data = {
    "Sentiment Scores": [],
    "Sentiment Label": [],
    "Company Name": [],
    "Company ID": []
}
for bucket in word_buckets.keys():
    new_data[bucket] = []

# Iterate over each row in the DataFrame
for index, row in df_all.iterrows():
    new_data["Sentiment Scores"].append(row["Sentiment Scores"])
    new_data["Sentiment Label"].append(row["Sentiment Label"])
    new_data["Company Name"].append(row["Company Name"])
    new_data["Company ID"].append(row["Company ID"])
    review_text = row["Review Text"].lower()
    word_counts = Counter(review_text.split())  # Count words in the review text
    
    for bucket, words in word_buckets.items():
        bucket_count = sum(word_counts.get(word, 0) for word in words)
        new_data[bucket].append(bucket_count)

# Create the new DataFrame
df_new = pd.DataFrame(new_data)

# Display the new DataFrame
print(df_new)

# Output DataFrame to a CSV file
df_new.to_csv("word_analysis.csv", index=False)

      Sentiment Scores Sentiment Label                          Company Name  \
0              -0.3400        Negative                              PNC Bank   
1               0.9552        Positive                              PNC Bank   
2              -0.8313        Negative                              PNC Bank   
3               0.8057        Positive                              PNC Bank   
4              -0.8465        Negative                              PNC Bank   
...                ...             ...                                   ...   
6911            0.7856        Positive  Heritage Valley Federal Credit Union   
6912            0.9516        Positive  Heritage Valley Federal Credit Union   
6913            0.8684        Positive  Heritage Valley Federal Credit Union   
6914            0.9824        Positive                        The Payment HQ   
6915            0.9199        Positive                        The Payment HQ   

     Company ID  bank account  customer