In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Load every processed CSV and stack them into one DataFrame
# (adjust the path pattern, or use absolute paths, if the files live elsewhere).
# Sorting the matches keeps the row order independent of filesystem listing order.
csv_files = sorted(glob("../data/processed/*.csv"))
if not csv_files:
    # Without this check pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError("No CSV files found matching '../data/processed/*.csv'")
df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_files], ignore_index=True)

In [None]:
def extract_top_keywords(bank_name, sentiment_label, max_features=10):
    """Print the TF-IDF vocabulary for one bank's reviews of a given sentiment.

    Reads the notebook-level ``df`` and uses its 'bank', 'sentiment' and
    'review' columns.

    Args:
        bank_name: Value matched against the 'bank' column.
        sentiment_label: Value matched against the 'sentiment' column.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        None. The keywords are printed in the order returned by
        ``get_feature_names_out()``.
    """
    # Keep only the rows for this bank / sentiment combination.
    selected = df[(df['bank'] == bank_name) & (df['sentiment'] == sentiment_label)]
    reviews = selected['review'].fillna('')

    # Same guard the plotting helpers use: nothing to vectorize.
    if reviews.empty:
        print(f"No data for {bank_name} [{sentiment_label}]")
        return

    # Only the fitted vocabulary is needed here, so the TF-IDF matrix is not built.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        vectorizer.fit(reviews)
    except ValueError as exc:
        # scikit-learn reports an "empty vocabulary" when no terms survive
        # tokenization (e.g. blank or stop-word-only reviews); any other
        # ValueError is a real problem and is re-raised.
        if "empty vocabulary" not in str(exc):
            raise
        print(f"No usable keywords for {bank_name} [{sentiment_label}]")
        return

    print(f" Top keywords for {bank_name} [{sentiment_label}]:")
    print(vectorizer.get_feature_names_out())
    print("-" * 50)

# Print keyword lists for every bank, once per sentiment polarity.
banks = df['bank'].unique()
for bank in banks:
    extract_top_keywords(bank_name=bank, sentiment_label='positive')
    extract_top_keywords(bank_name=bank, sentiment_label='negative')

In [None]:
def plot_top_keywords(bank_name, sentiment_label, max_features=10):
    """Draw a horizontal bar chart of summed TF-IDF scores per keyword.

    Reads the notebook-level ``df`` and uses its 'bank', 'sentiment' and
    'review' columns. Bars are ordered by score, highest at the top.

    Args:
        bank_name: Value matched against the 'bank' column.
        sentiment_label: Value matched against the 'sentiment' column.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        None. The figure is shown with ``plt.show()``; a message is printed
        instead when there is nothing to plot.
    """
    # Keep only the rows for this bank / sentiment combination.
    selected = df[(df['bank'] == bank_name) & (df['sentiment'] == sentiment_label)]
    reviews = selected['review'].fillna('')

    if reviews.empty:
        print(f"No data for {bank_name} [{sentiment_label}]")
        return

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        tfidf = vectorizer.fit_transform(reviews)
    except ValueError as exc:
        # scikit-learn reports an "empty vocabulary" when no terms survive
        # tokenization (e.g. blank or stop-word-only reviews); any other
        # ValueError is a real problem and is re-raised.
        if "empty vocabulary" not in str(exc):
            raise
        print(f"No usable keywords for {bank_name} [{sentiment_label}]")
        return

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf.sum(axis=0).A1  # column sums flattened to a 1-D array

    # Rank by score so the chart really shows the *top* keywords first;
    # the sort is stable, so ties keep their vocabulary order.
    ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)
    keywords = [word for word, _ in ranked]
    values = [score for _, score in ranked]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.barh(keywords, values, color='skyblue')
    ax.set_title(f"Top Keywords for {bank_name} ({sentiment_label})")
    ax.set_xlabel("TF-IDF Score")
    ax.invert_yaxis()  # first (highest-scoring) keyword at the top
    fig.tight_layout()
    plt.show()

# Draw the keyword bar chart for every bank, once per sentiment polarity.
banks = df['bank'].unique()
for bank in banks:
    plot_top_keywords(bank_name=bank, sentiment_label='positive')
    plot_top_keywords(bank_name=bank, sentiment_label='negative')

In [None]:
def plot_wordcloud(df, bank_name, sentiment_label, max_features=50):
    """Render a word cloud weighted by summed TF-IDF scores.

    Args:
        df: DataFrame providing the 'bank', 'sentiment' and 'review' columns.
        bank_name: Value matched against the 'bank' column.
        sentiment_label: Value matched against the 'sentiment' column.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.

    Returns:
        None. The figure is shown with ``plt.show()``; a message is printed
        instead when there is nothing to plot.
    """
    # Keep only the rows for this bank / sentiment combination.
    selected = df[(df['bank'] == bank_name) & (df['sentiment'] == sentiment_label)]
    reviews = selected['review'].fillna('')

    if reviews.empty:
        print(f"No data for {bank_name} [{sentiment_label}]")
        return

    # TF-IDF supplies the per-word weights for the cloud.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        tfidf = vectorizer.fit_transform(reviews)
    except ValueError as exc:
        # scikit-learn reports an "empty vocabulary" when no terms survive
        # tokenization (e.g. blank or stop-word-only reviews); any other
        # ValueError is a real problem and is re-raised.
        if "empty vocabulary" not in str(exc):
            raise
        print(f"No usable keywords for {bank_name} [{sentiment_label}]")
        return

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf.sum(axis=0).A1  # column sums flattened to a 1-D array
    word_scores = dict(zip(feature_names, scores))

    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(word_scores)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f"Word Cloud for {bank_name} ({sentiment_label})")
    ax.axis('off')
    fig.tight_layout()
    plt.show()

# Render a word cloud for every bank, once per sentiment polarity.
banks = df['bank'].unique()
for bank in banks:
    plot_wordcloud(df, bank_name=bank, sentiment_label='positive')
    plot_wordcloud(df, bank_name=bank, sentiment_label='negative')

Comparison between banks

In [15]:
# Share of each sentiment class within every bank (heading line, then the Series).
print(
    "🔍 Sentiment Distribution by Bank (proportion):",
    df.groupby('bank')['sentiment'].value_counts(normalize=True),
    sep="\n",
)
# Mean star rating for every bank (heading line, then the Series).
print(
    "⭐ Average Rating by Bank:",
    df.groupby('bank')['rating'].mean(),
    sep="\n",
)



🔍 Sentiment Distribution by Bank (proportion):
bank                                sentiment
Bank of Abyssinia Mobile            positive     0.471178
                                    neutral      0.325815
                                    negative     0.203008
Commercial Bank of Ethiopia Mobile  positive     0.685422
                                    neutral      0.248082
                                    negative     0.066496
Dashen Bank SuperApp                positive     0.759398
                                    neutral      0.200501
                                    negative     0.040100
Name: proportion, dtype: float64
⭐ Average Rating by Bank:
bank
Bank of Abyssinia Mobile              3.085213
Commercial Bank of Ethiopia Mobile    4.189258
Dashen Bank SuperApp                  4.431078
Name: rating, dtype: float64


In [None]:
# Sentiment distribution per bank as proportions (0-1), one column per sentiment.
# fill_value=0 makes a bank with no reviews of some sentiment show 0 instead of NaN.
print("🔍 Sentiment Distribution by Bank (proportion):")
print(
    df.groupby('bank')['sentiment']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .round(2)
)
print("\n")

# Average rating per bank, rounded to two decimals.
print("⭐ Average Rating by Bank:")
print(df.groupby('bank')['rating'].mean().round(2))

Bank of Abyssinia Mobile (BoA)
🔻 Lowest average rating: 3.09

- High neutral/negative proportion: ~52.9% combined

🔍 Top Negative Keywords:
['app', 'bad', 'boa', 'fix', 'make', 'mobile', 'time', 'work', 'worst']

✅ Suggestions:
App reliability: Keywords like "fix", "work", "worst" indicate performance issues — consider improving app stability and responsiveness.

User Experience: Complaints about "time" may point to long loading times — optimize startup time and reduce lag.

Feedback loop: Add in-app options for reporting issues easily and reward feedback with updates.

Update frequency: Make improvements visible through regular updates with changelogs to rebuild trust.

 Commercial Bank of Ethiopia Mobile (CBE)
⭐ Average rating: 4.19

 Sentiment mostly positive: ~68.5%

🔍 Top Negative Keywords:
['account', 'app', 'application', 'bad', 'banking', 'seen', 'service']

✅ Suggestions:
Account & service management: The words "account", "service" suggest issues with account features or poor customer support. Improve error handling and service consistency.

Visibility of changes: The word "seen" could imply user actions not being reflected — improve transaction visibility and feedback mechanisms.

Support: Enhance user support — maybe integrate live chat or smarter FAQs within the app.

Dashen Bank SuperApp
🌟 Highest average rating: 4.43

-  Very positive sentiment: ~76%

🔍 Top Negative Keywords:
['app', 'bad', 'disappointing', 'fix', 'money', 'opt', 'transaction']

✅ Suggestions:
Financial operations: "money", "transaction", "disappointing" suggest pain points with core banking actions. Ensure transaction confirmation, timing, and error handling are flawless.

"opt" keyword: Probably a misspelling of "OTP" (one-time password), which would point to login/authentication problems — test multi-device login and SMS delivery reliability.

Maintain the lead: Dashen is performing well — focus on retaining users and pushing minor usability refinements.