In [10]:
# !pip install google-play-scraper

In [25]:
import pandas as pd
from google_play_scraper import reviews, Sort
from datetime import datetime

# Configuration
# Each bank name maps to its Google Play package id ('app_id') and the
# maximum number of reviews to request for it ('count').
BANK_APPS = {
    'Commercial Bank of Ethiopia': dict(app_id='com.combanketh.mobilebanking', count=1_000),
    'Bank of Abyssinia': dict(app_id='com.boa.boaMobileBanking', count=1_500),
    'Dashen Bank': dict(app_id='com.dashen.dashensuperapp', count=7_000),
}
# A bank's cleaned reviews are kept only when there are at least this many rows.
MIN_REQUIRED = 400
# Destination CSV for the combined, cleaned reviews of all banks.
OUTPUT_FILE = 'all_banks_reviews_cleaned.csv'

In [26]:
# Scrape Function
def scrape_reviews(app_id, bank_name, count):
    """Download reviews for one app and return them as a labelled DataFrame.

    Parameters
    ----------
    app_id : str
        Package id passed as the first argument to ``reviews``.
    bank_name : str
        Written to the ``bank`` column and used in the progress messages.
    count : int
        Passed to ``reviews`` as ``count``.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by ``reviews``, plus a ``bank`` column
        (``bank_name``) and a ``source`` column (``'Google Play'``).
    """
    print(f" Scraping up to {count} reviews for {bank_name}...")

    request_options = {
        'lang': 'en',
        'country': 'us',
        'sort': Sort.NEWEST,
        'count': count,
        'filter_score_with': None,
    }
    # reviews() hands back a pair; only the first element (the records) is used.
    fetched, _unused = reviews(app_id, **request_options)
    print(f"{bank_name}: Scraped {len(fetched)} reviews from Google Play\n")

    # Tag every row with where it came from so frames can be concatenated later.
    return pd.DataFrame(fetched).assign(bank=bank_name, source='Google Play')
    

In [None]:
# Clean Function 
def clean_reviews(df, bank_name):
    """Normalise a raw review frame into the five-column cleaned layout.

    The input frame is never modified; every step works on a derived frame.

    Steps:
      1. Rename ``content`` -> ``review`` and ``score`` -> ``rating``.
      2. If an ``at`` column is present, parse it into ``date``
         (values that cannot be parsed become NaT).
      3. Drop rows missing ``review``, ``rating`` or ``date``.
      4. Drop rows repeating an earlier (``review``, ``date``) pair.
      5. Format ``date`` as ``YYYY-MM-DD`` text.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews, expected to carry ``bank`` and ``source`` columns.
    bank_name : str
        Used only in the progress messages.

    Returns
    -------
    pandas.DataFrame
        Columns ``review``, ``rating``, ``date``, ``bank``, ``source``.
        An empty input yields an empty frame with exactly these columns.

    Raises
    ------
    KeyError
        If a non-empty input lacks one of the required columns.
    """
    print(f" Cleaning: {bank_name}")
    output_columns = ['review', 'rating', 'date', 'bank', 'source']

    # rename() without inplace returns a new frame, so the caller's frame
    # keeps its original column names.
    cleaned = df.rename(columns={'content': 'review', 'score': 'rating'})

    # A scrape that returned nothing has no review columns at all; return an
    # empty frame in the expected layout instead of failing on missing columns.
    if cleaned.empty:
        return cleaned.reindex(columns=output_columns)

    if 'at' in cleaned.columns:
        cleaned = cleaned.assign(date=pd.to_datetime(cleaned['at'], errors='coerce'))

    # Drop rows with missing essential data
    before_drop = len(cleaned)
    cleaned = cleaned.dropna(subset=['review', 'rating', 'date'])
    print(f"➖ Dropped {before_drop - len(cleaned)} rows with missing values")

    # Drop duplicates (compared on the full timestamp, before it is
    # reduced to a calendar day below)
    before_dup = len(cleaned)
    cleaned = cleaned.drop_duplicates(subset=['review', 'date'])
    print(f"➖ Dropped {before_dup - len(cleaned)} duplicate reviews")

    # Format date; assign() builds a new frame rather than writing into the
    # result of dropna/drop_duplicates.
    cleaned = cleaned.assign(date=cleaned['date'].dt.strftime('%Y-%m-%d'))

    return cleaned[output_columns]


# Run the scrape -> clean pipeline once per configured bank and collect the
# cleaned frames that reach the MIN_REQUIRED row count.
all_cleaned = []
for bank, info in BANK_APPS.items():
    raw_df = scrape_reviews(info['app_id'], bank, info['count'])
    clean_df = clean_reviews(raw_df, bank)
    print(f"{bank}: {len(clean_df)} cleaned reviews\n")

    # Guard clause: banks below the threshold are reported and left out.
    if len(clean_df) < MIN_REQUIRED:
        print(f"{bank}: Skipped — only {len(clean_df)} valid reviews.\n")
        continue
    all_cleaned.append(clean_df)

# Save final result, but only when no bank was skipped above.
if len(all_cleaned) != len(BANK_APPS):
    print("\n Not all banks met the minimum review count. File not saved.")
else:
    final_df = pd.concat(all_cleaned, ignore_index=True)
    final_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')
    print(f"\n All banks processed. Combined file saved as: {OUTPUT_FILE}")

 Scraping up to 1000 reviews for Commercial Bank of Ethiopia...
Commercial Bank of Ethiopia: Scraped 1000 reviews from Google Play

 Cleaning: Commercial Bank of Ethiopia
➖ Dropped 0 rows with missing values
➖ Dropped 0 duplicate reviews
Commercial Bank of Ethiopia: 1000 cleaned reviews

 Scraping up to 1500 reviews for Bank of Abyssinia...
Bank of Abyssinia: Scraped 1044 reviews from Google Play

 Cleaning: Bank of Abyssinia
➖ Dropped 0 rows with missing values
➖ Dropped 0 duplicate reviews
Bank of Abyssinia: 1044 cleaned reviews

 Scraping up to 7000 reviews for Dashen Bank...
Dashen Bank: Scraped 449 reviews from Google Play

 Cleaning: Dashen Bank
➖ Dropped 0 rows with missing values
➖ Dropped 0 duplicate reviews
Dashen Bank: 449 cleaned reviews


 All banks processed. Combined file saved as: all_banks_reviews_cleaned.csv
