In [21]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from transformers import pipeline
import uuid

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")






In [4]:
df = pd.read_csv("scraped data.csv") 

In [5]:
reviews = df["review"].tolist()
results = sentiment_pipeline(reviews, truncation=True)

In [6]:
def to_score(label, score):
    if label == "POSITIVE":
        return score
    elif label == "NEGATIVE":
        return -score
    else:
        return 0.0

In [7]:
sentiment_scores = []
for result in results:
    score = to_score(result["label"], result["score"])
    sentiment_scores.append(score)

In [8]:
df["sentiment_score"] = sentiment_scores

In [9]:
agg_df = df.groupby(["bank", "rating"])["sentiment_score"].mean().reset_index()

In [10]:
# agg_df.to_csv("aggregated_sentiment.csv", index=False)
print(agg_df)

   bank  rating  sentiment_score
0   BOA       1        -0.779004
1   BOA       2        -0.718548
2   BOA       3        -0.191240
3   BOA       4         0.023577
4   BOA       5         0.517518
5   CBE       1        -0.603236
6   CBE       2        -0.423547
7   CBE       3        -0.494413
8   CBE       4        -0.066724
9   CBE       5         0.712700
10   DB       1        -0.884815
11   DB       2        -0.726466
12   DB       3        -0.091441
13   DB       4         0.021889
14   DB       5         0.789882


In [11]:
df["sentiment_label"] = df["sentiment_score"].apply(
    lambda x: "Positive" if x > 0 else "Negative" if x < 0 else "Neutral"
)

In [12]:
if 'review_id' not in df.columns:
    df['review_id'] = [str(uuid.uuid4()) for _ in range(len(df))]

In [19]:
nlp = spacy.load("en_core_web_sm")

In [13]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize and remove stopwords
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return ' '.join(tokens)

In [14]:
def extract_keywords_spacy(text):
    doc = nlp(text)
    keywords = []
    # Extract nouns, adjectives, and n-grams
    for token in doc:
        if token.pos_ in ['NOUN', 'ADJ'] and token.text not in stop_words:
            keywords.append(token.lemma_)
    # Extract noun chunks as n-grams
    for chunk in doc.noun_chunks:
        if len(chunk.text.split()) > 1:
            keywords.append(chunk.text.lower())
    return keywords

In [None]:
def extract_keywords_tfidf(reviews):
    vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(reviews)
    feature_names = vectorizer.get_feature_names_out()
    return feature_names.tolist()

In [16]:
def cluster_themes(keywords, bank):
    theme_dict = {
        'Account Access Issues': ['login', 'crash', 'error', 'access', 'authentication', 'sign in', 'log in'],
        'Transaction Performance': ['transfer', 'slow', 'fast', 'payment', 'transaction', 'deposit', 'withdrawal'],
        'User Interface & Experience': ['ui', 'interface', 'design', 'navigation', 'experience', 'layout', 'user friendly'],
        'Customer Support': ['support', 'help', 'service', 'response', 'customer', 'assistance'],
        'Feature Requests': ['feature', 'tool', 'budget', 'option', 'functionality', 'add', 'new']
    }
    themes = []
    for keyword in keywords:
        for theme, theme_keywords in theme_dict.items():
            if any(kw in keyword.lower() for kw in theme_keywords):
                themes.append(theme)
                break
    return list(set(themes)) if themes else ['Other']

In [None]:
def thematic_analysis_pipeline(df):
    # Preprocess reviews
    df['processed_review'] = df['review'].apply(preprocess_text)
    
    # Extract keywords
    df['keywords_spacy'] = df['processed_review'].apply(extract_keywords_spacy)
    
    # TF-IDF keywords per bank
    for bank in df['bank'].unique():
        bank_reviews = df[df['bank'] == bank]['processed_review']
        tfidf_keywords = extract_keywords_tfidf(bank_reviews)
        print(f"TF-IDF Keywords for {bank}: {tfidf_keywords[:10]}")  # Print top 10 for reference
    
    # Cluster into themes
    df['themes'] = df.apply(lambda row: cluster_themes(row['keywords_spacy'], row['bank']), axis=1)
    
    # Prepare output
    output_df = df[['review_id', 'review', 'sentiment_label', 'sentiment_score', 'themes']]
    
    # Save to CSV
    output_df.to_csv('thematic_analysis_output.csv', index=False)
    print("Results saved to 'thematic_analysis_output.csv'")
    
    
    return output_df

In [22]:
result_df = thematic_analysis_pipeline(df)

TF-IDF Keywords for CBE: ['access', 'account', 'add', 'amazing', 'app', 'application', 'bad', 'bank', 'banking', 'banking app']
TF-IDF Keywords for BOA: ['app', 'app work', 'application', 'ask', 'bad', 'bad app', 'bank', 'banking', 'banking app', 'boa']
TF-IDF Keywords for DB: ['account', 'ahead', 'amazing', 'app', 'application', 'bank', 'bank super', 'banking', 'convenient', 'customer']
Results saved to 'thematic_analysis_output.csv'


In [23]:
print("\nSample Output:")
print(result_df[['review', 'sentiment_label', 'themes']].head())


Sample Output:
                                              review sentiment_label   themes
0       the app is proactive and a good connections.        Positive  [Other]
1    I cannot send to cbebirr app. through this app.        Negative  [Other]
2                                               good        Positive  [Other]
3                                     not functional        Negative  [Other]
4  everytime you uninstall the app you have to re...        Negative  [Other]
