In [None]:
# nootbooks/analysis/sentiment_analysis.ipynb

import os
import re

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Download the VADER lexicon used by SentimentIntensityAnalyzer. Keep this
# ahead of the analyzer construction below.
nltk.download('vader_lexicon')

# Initialize
# tqdm.pandas() registers `progress_apply` on pandas objects; it is used for
# the sentiment scoring step further down.
tqdm.pandas()
# Module-level analyzer instance read by get_sentiment_label().
vader = SentimentIntensityAnalyzer()

# Load preprocessed data
# The path is relative to the current working directory. The rest of this
# script reads the 'review' and 'bank' columns of this frame.
df = pd.read_csv("../../data/cleaned_reviews.csv")  

# Sentiment analysis
def get_sentiment_label(text):
    """Classify ``text`` by its VADER compound polarity score.

    The input is converted with ``str()`` before scoring, so non-string
    values are scored on their string representation.

    Returns:
        A ``(label, compound)`` tuple. ``label`` is ``"positive"`` when the
        compound score is >= 0.05, ``"negative"`` when it is <= -0.05, and
        ``"neutral"`` otherwise; ``compound`` is the raw score.
    """
    compound = vader.polarity_scores(str(text))['compound']

    if compound >= 0.05:
        label = "positive"
    elif compound <= -0.05:
        label = "negative"
    else:
        label = "neutral"

    return label, compound

# Score every review, showing a tqdm progress bar (progress_apply is made
# available by the tqdm.pandas() call above). Wrapping the (label, score)
# tuple in a pd.Series makes the apply result expand into two columns, which
# are assigned to 'sentiment_label' and 'sentiment_score' in that order.
df[['sentiment_label', 'sentiment_score']] = df['review'].progress_apply(
    lambda x: pd.Series(get_sentiment_label(x))
)

# Theme assignment
# Ordered (theme, keywords) rules: the first theme with a matching keyword wins.
_THEME_RULES = (
    ("Account Access Issues", ("login", "access", "password", "otp")),
    ("Transaction Performance", ("slow", "transfer", "fail", "delay")),
    ("User Interface & Experience", ("design", "interface", "navigation", "ui")),
    ("Customer Support", ("support", "help", "call", "agent")),
    ("Feature Requests", ("feature", "add", "request", "option")),
)

# One precompiled pattern per theme. The leading ``\b`` anchors every keyword
# at the start of a word, so inflected forms ("failed", "slowly", "options")
# still match while accidental mid-word hits ("ui" in "quick", "call" in
# "automatically") do not.
_THEME_PATTERNS = tuple(
    (theme, re.compile(r"\b(?:" + "|".join(map(re.escape, words)) + r")"))
    for theme, words in _THEME_RULES
)


def assign_theme(text):
    """Return the first keyword theme that matches ``text``.

    ``text`` is converted with ``str()`` and lower-cased, then checked against
    the themes in ``_THEME_RULES`` order. A keyword matches when it appears at
    the start of a word (prefix match), not anywhere inside one; this avoids
    mislabelling reviews such as "quick and reliable" as a UI complaint.

    Args:
        text: Review text; any value is accepted and stringified.

    Returns:
        The matching theme name, or ``"Other"`` when no keyword matches.
    """
    lowered = str(text).lower()
    for theme, pattern in _THEME_PATTERNS:
        if pattern.search(lowered):
            return theme
    return "Other"

# Label each review with the theme returned by assign_theme() ("Other" when
# no keyword matches) and store it in a new 'theme' column.
df['theme'] = df['review'].apply(assign_theme)  

# TF-IDF for keywords
def extract_keywords(corpus, top_n=20):
    """Return the ``top_n`` terms with the highest summed TF-IDF weight.

    Args:
        corpus: pandas Series of documents; values are cast to ``str``
            before vectorizing.
        top_n: Maximum number of terms to return.

    Returns:
        A list of terms ordered from highest to lowest total TF-IDF score
        across all documents in ``corpus``.
    """
    tfidf = TfidfVectorizer(max_df=0.85, stop_words='english')
    doc_term = tfidf.fit_transform(corpus.astype(str))

    terms = tfidf.get_feature_names_out()
    # Column sums give one aggregate weight per term; .A1 flattens to 1-D.
    totals = doc_term.sum(axis=0).A1

    # Rank term positions by weight; sorted() is stable, so ties keep the
    # vectorizer's feature order.
    ranked = sorted(range(len(terms)), key=lambda idx: totals[idx], reverse=True)
    return [terms[idx] for idx in ranked[:top_n]]

def extract_themes_per_bank(df):
    """Map each distinct value of ``df['bank']`` to its top review keywords.

    Args:
        df: DataFrame with a 'bank' column and a 'review' column.

    Returns:
        A dict keyed by bank (in order of first appearance) whose values are
        the keyword lists produced by ``extract_keywords`` for that bank's
        reviews.
    """
    bank_column = df['bank']
    return {
        bank: extract_keywords(df.loc[bank_column == bank, 'review'])
        for bank in bank_column.unique()
    }

# Compute and print the top TF-IDF keywords for every bank.
themes = extract_themes_per_bank(df)
print("\nTop Keywords by Bank:")
for bank, keywords in themes.items():
    print(f"{bank}: {keywords}")

# output
# Turn the current row index into an explicit 'review_id' column.
# NOTE(review): this relies on reset_index() producing a column named
# "index", i.e. the frame's index is unnamed and no "index" column already
# exists -- confirm against the input CSV.
df.reset_index(inplace=True)
df.rename(columns={"index": "review_id"}, inplace=True)
# index=False: the row index was just materialized as 'review_id', so it is
# not written a second time.
df.to_csv("../../data/bank_reviews_analysis.csv", index=False)

print("\n✅ Sentiment + Theme pipeline completed. Output: 'data/bank_reviews_analysis.csv'")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
100%|██████████| 262/262 [00:00<00:00, 1144.19it/s]


Top Keywords by Bank:
CBE: ['app', 'good', 'nice', 'best', 'like', 'network', 'bank', 'awesome', 'fast', 'love', 'cbe', 'using', 'problem', 'make', 'fantastic', 'money', 'banking', 'application', 'better', 'easy']
BOA: ['app', 'good', 'boa', 'best', 'bank', 'work', 'working', 'really', 'wow', 'thank', 'mobile', 'doesn', 'excellent', 'like', 'application', 'great', 'use', 'worst', 'update', 'amazing']
Dashen: ['app', 'good', 'best', 'nice', 'banking', 'wow', 'use', 'amole', 'mobile', 'bank', 'dashin', 'fast', 'update', 'used', 'slow', 'dashen', 'needs', 'love', 'like', 'just']

✅ Sentiment + Theme pipeline completed. Output: 'data/bank_reviews_analysis.csv'



