In [1]:

import pandas as pd
import spacy
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from tqdm import tqdm 

In [2]:
# Load data
# The path is relative, so it resolves against the kernel's working directory.
reviews_csv = '../data/bank_reviews.csv'
df = pd.read_csv(reviews_csv)

In [3]:
# Sentiment pipeline; the checkpoint is named once so it is easy to swap.
sentiment_model = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier = pipeline('sentiment-analysis', model=sentiment_model)




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [4]:
# Sentiment Analysis

def get_sentiment(text):
    """Classify one review with the module-level `classifier` pipeline.

    Args:
        text: Review text. Missing values (None, pandas NaN or any other
            non-string) and empty / whitespace-only strings are treated as
            "no review".

    Returns:
        A ``(label, score)`` tuple. ``label`` is the pipeline's label
        upper-cased and ``score`` is the pipeline's score for that label.
        Inputs with no usable text yield ``('NEUTRAL', 0.5)``.
    """
    # pandas hands missing cells over as float NaN, which is truthy and has
    # no .strip(); checking the type first keeps one bad row from aborting
    # the whole apply().
    if not isinstance(text, str) or not text.strip():
        return 'NEUTRAL', 0.5
    # Truncate at the tokenizer level. Slicing the string to 512 characters
    # discards most of the model's 512-token window for ordinary text and can
    # still overflow it when characters tokenize one-per-token.
    result = classifier(text, truncation=True, max_length=512)[0]
    return result['label'].upper(), result['score']

# Score every review, then split the (label, score) pairs into two columns.
sentiment_pairs = df['review'].apply(get_sentiment)
labels, scores = zip(*sentiment_pairs)
df['sentiment_label'] = labels
df['sentiment_score'] = scores

In [5]:
# Thematic Analysis
# spaCy pipeline shared by the preprocessing and theme-assignment cells.
spacy_model = 'en_core_web_sm'
nlp = spacy.load(spacy_model)

In [6]:
# Preprocessing for thematic analysis
def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased and run through the module-level spaCy `nlp`
    pipeline; stop words and non-alphabetic tokens are dropped and the
    remaining tokens are replaced by their lemmas.

    Args:
        text: Review text. Non-string values (None, pandas NaN, ...) are
            treated as an empty review.

    Returns:
        The lemmas joined by single spaces, or ``''`` when there is no text.
    """
    # Missing cells arrive as float NaN / None, which have no .lower();
    # mirror get_sentiment's tolerance instead of aborting the apply().
    if not isinstance(text, str):
        return ''
    doc = nlp(text.lower())
    return ' '.join(
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop
    )

# Lemmatised form of each review, used as the TF-IDF corpus.
review_texts = df['review']
df['processed_text'] = review_texts.apply(preprocess_text)

In [7]:
# Extract keywords using TF-IDF
# Unigrams and bigrams, with the vocabulary capped at 100 features.
tfidf_settings = {'ngram_range': (1, 2), 'max_features': 100}
vectorizer = TfidfVectorizer(**tfidf_settings)
processed_corpus = df['processed_text']
tfidf_matrix = vectorizer.fit_transform(processed_corpus)
feature_names = vectorizer.get_feature_names_out()

In [8]:
# Manual theme clustering
# Hand-curated keyword phrases: bank -> theme -> phrases that signal the theme.
_cbe_themes = {
    'Account Access Issues': [
        'login error',
        'authentication fail',
        'pin issue',
    ],
    'Transaction Performance': [
        'slow transfer',
        'transfer fail',
        'loading delay',
    ],
    'User Interface': [
        'intuitive ui',
        'poor summary',
        'ui change',
    ],
    'Customer Support': [
        'support unresponsive',
        'contact issue',
    ],
    'Feature Requests': [
        'fingerprint login',
        'qr code',
        'budget tool',
    ],
}

_boa_themes = {
    'Account Access Issues': [
        'login fail',
        'zero balance',
        'authentication bug',
    ],
    'Transaction Performance': [
        'slow load',
        'transfer error',
        'et-switch fail',
    ],
    'User Interface': [
        'poor design',
        'logo issue',
        'unresponsive ui',
    ],
    'Customer Support': [
        'support delay',
        'unresolved issue',
    ],
    'Feature Requests': [
        'fingerprint login',
        'faster transfer',
    ],
}

_dashen_themes = {
    'Account Access Issues': [
        'login issue',
        'account access',
    ],
    'Transaction Performance': [
        'transfer speed',
        'payment delay',
    ],
    'User Interface': [
        'clean ui',
        'navigation ease',
    ],
    'Customer Support': [
        'support response',
        'helpdesk',
    ],
    'Feature Requests': [
        'biometric login',
        'transaction limit',
    ],
}

themes = {
    'CBE': _cbe_themes,
    'BOA': _boa_themes,
    'Dashen': _dashen_themes,
}

In [9]:
# Assign themes to reviews
def assign_themes(text, bank):
    """Tag a review with every theme whose keyword phrases it contains.

    Matching is a case-insensitive substring test of each phrase in
    ``themes[bank]`` against the raw review text.

    Args:
        text: Review text. Non-string values (None, pandas NaN, ...) are
            treated as having no text.
        bank: Key into the module-level `themes` mapping.

    Returns:
        The matching theme names joined by ``', '`` in the order they are
        declared for the bank, or ``'Other'`` when nothing matches.

    Raises:
        KeyError: If `bank` has no entry in `themes`.
    """
    # Missing review text cannot match any phrase.
    if not isinstance(text, str):
        return 'Other'
    # Lower-case once. No spaCy parse is needed here because matching is
    # plain substring search on the raw text.
    lowered = text.lower()
    review_themes = [
        theme
        for theme, keywords in themes[bank].items()
        if any(keyword in lowered for keyword in keywords)
    ]
    return ', '.join(review_themes) if review_themes else 'Other'

def _theme_for_row(row):
    """Look up the themes for one DataFrame row from its review text and bank."""
    return assign_themes(row['review'], row['bank'])

df['theme'] = df.apply(_theme_for_row, axis=1)

In [11]:
# Save results in the 'data' folder
# Only the identifying column, the raw review and the derived columns are exported.
output_columns = ['review_id', 'review', 'sentiment_label', 'sentiment_score', 'theme']
output_path = '../data/sentiment_themes.csv'
output_df = df[output_columns]
output_df.to_csv(output_path, index=False)
print("Saved sentiment and theme analysis to data/sentiment_themes.csv")


Saved sentiment and theme analysis to data/sentiment_themes.csv


In [12]:
# Aggregate sentiment by bank and rating
# Mean classifier confidence, kept as originally reported. Caveat: `sentiment_score`
# is the score of whichever label was predicted, so a confidently NEGATIVE 1-star
# review scores as high as a confidently POSITIVE 5-star one. This mean therefore
# says nothing about polarity.
print(df.groupby(['bank', 'rating'])['sentiment_score'].mean())

# Signed view: POSITIVE counts as +score, NEGATIVE as -score, and anything else
# (e.g. the NEUTRAL placeholder for empty reviews) as 0, so the mean reflects
# polarity and can be compared against the star rating.
label_sign = df['sentiment_label'].map({'POSITIVE': 1.0, 'NEGATIVE': -1.0}).fillna(0.0)
signed_sentiment = (label_sign * df['sentiment_score']).rename('signed_sentiment')
print(signed_sentiment.groupby([df['bank'], df['rating']]).mean())

bank    rating
BOA     1         0.977844
        2         0.935626
        3         0.961892
        4         0.958368
        5         0.959053
CBE     1         0.963464
        2         0.980620
        3         0.976234
        4         0.952189
        5         0.976127
Dashen  1         0.995137
        2         0.957596
        3         0.997640
        4         0.974137
        5         0.984752
Name: sentiment_score, dtype: float64
