In [2]:
import logging
import pandas as pd
from google_play_scraper import reviews, app
from urllib.error import HTTPError
from datetime import datetime, timedelta
import time
import os
from concurrent.futures import ThreadPoolExecutor

# Configure logging at INFO level and create the module-level logger that
# every function below reports progress and errors through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Apps to scrape. Each key is the bank name stored in the 'bank' field of
# every output row; each value is the app ID handed to google_play_scraper.
app_ids = {
    'Abyssinia Bank': 'com.boa.boaMobileBanking',
    'Commercial Bank of Ethiopia': 'com.combanketh.mobilebanking',
}

def fetch_reviews(app_id, num_reviews=200):
    """Fetch up to ``num_reviews`` reviews for a given app.

    Repeatedly calls ``google_play_scraper.reviews`` (``lang='en'``,
    ``country='us'``), feeding each call the continuation token returned by
    the previous one. Paging stops as soon as one of the following happens:

    * at least ``num_reviews`` reviews have been collected,
    * a page comes back empty,
    * no continuation token is returned,
    * an ``HTTPError`` is raised (it is logged, and whatever was collected
      so far is returned).

    Args:
        app_id: App ID passed straight through to the scraper.
        num_reviews: Maximum number of reviews to return.

    Returns:
        A list with at most ``num_reviews`` review records, in the order the
        scraper returned them. May be empty.
    """
    all_reviews = []
    continuation_token = None

    while len(all_reviews) < num_reviews:
        try:
            result, continuation_token = reviews(
                app_id,
                lang='en',  # defaults to 'en'
                country='us',  # defaults to 'us'
                continuation_token=continuation_token,
            )
        except HTTPError as e:
            # NOTE(review): google_play_scraper may surface HTTP failures as
            # its own exception types rather than urllib's HTTPError --
            # confirm against the installed library version.
            logger.error("HTTPError while fetching reviews for app ID %s: %s", app_id, e)
            break

        # An empty page means there is nothing left to fetch. Checking the
        # token alone is not sufficient: if the scraper hands back a truthy
        # token object together with an empty page (understood to happen
        # once the reviews are exhausted -- confirm), the loop would
        # otherwise spin forever without making progress.
        if not result:
            break

        all_reviews.extend(result)
        if not continuation_token:
            break

    # The last page may overshoot the requested amount; trim the surplus.
    return all_reviews[:num_reviews]

def fetch_app_details(app_id):
    """Look up the store metadata for ``app_id``.

    Returns whatever ``google_play_scraper.app`` yields for the app. When the
    lookup raises ``HTTPError`` the error is logged and an empty dict is
    returned instead.
    """
    try:
        return app(app_id)
    except HTTPError as err:
        logger.error("HTTPError while fetching details for app ID %s: %s", app_id, err)
    return {}

def fetch_data_for_bank(bank, app_id, duration_days=7, num_reviews=200):
    """Collect the recent reviews of one bank's app as flat row dicts.

    Fetches up to ``num_reviews`` reviews via ``fetch_reviews``, keeps those
    whose ``'at'`` timestamp falls within the last ``duration_days`` days
    (measured from the moment of the call), and converts each survivor into
    a dict tagged with ``bank`` plus empty placeholder columns for later
    analysis.

    Args:
        bank: Bank name stored in the ``'bank'`` field of every row.
        app_id: App ID forwarded to ``fetch_reviews``.
        duration_days: Size of the look-back window in days.
        num_reviews: Maximum number of reviews to fetch before filtering.

    Returns:
        A list of row dicts; empty when nothing was fetched or nothing fell
        inside the window.
    """
    now = datetime.now()
    cutoff = now - timedelta(days=duration_days)

    logger.info("Fetching data for %s", bank)

    fetched = fetch_reviews(app_id, num_reviews)
    if not fetched:
        logger.info("No reviews found for %s", bank)
        return []

    logger.info("Fetched %d reviews for %s", len(fetched), bank)

    # Keep only the reviews written inside the look-back window.
    recent = [
        entry
        for entry in fetched
        if datetime.fromtimestamp(entry['at'].timestamp()) >= cutoff
    ]
    logger.info("Filtered down to %d reviews for %s", len(recent), bank)

    # Key order here determines the column order of the resulting table.
    return [
        {
            'bank': bank,
            'reviewId': entry['reviewId'],
            'userName': entry['userName'],
            'userImage': entry['userImage'],
            '👍': entry['thumbsUpCount'],
            'reviewCreatedVersion': entry.get('reviewCreatedVersion'),
            'at': entry['at'],
            'replyContent': entry.get('replyContent', ''),
            'repliedAt': entry.get('repliedAt', ''),
            'appVersion': entry.get('appVersion', ''),
            'score': entry['score'],
            'Comments': entry['content'],
            'Keywords': '',  # Placeholder for keywords
            'LDA_Category': '',  # Placeholder for LDA category
            'Sentiment': '',  # Placeholder for sentiment
            'Insight': '',  # Placeholder for insight
        }
        for entry in recent
    ]

def track_reviews_and_downloads(app_ids, duration_days=7, num_reviews=200):
    """Gather recent reviews for every app and save them to a CSV file.

    Runs ``fetch_data_for_bank`` for each entry of ``app_ids`` on its own
    worker thread, concatenates the rows in the iteration order of
    ``app_ids``, prints the head of the resulting DataFrame and writes the
    whole table to ``../data/google_play_reviews.csv`` (relative to the
    current working directory).

    NOTE(review): despite the name, only reviews are collected here; no
    download counts are fetched by this function.

    Args:
        app_ids: Mapping of bank name to app ID.
        duration_days: Look-back window in days, forwarded to
            ``fetch_data_for_bank``.
        num_reviews: Per-app fetch limit, forwarded to
            ``fetch_data_for_bank``.

    Returns:
        None.
    """
    all_reviews = []

    # ThreadPoolExecutor rejects max_workers=0 with ValueError, so keep at
    # least one worker even when no apps were supplied.
    worker_count = max(1, len(app_ids))
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        futures = [
            executor.submit(fetch_data_for_bank, bank, app_id, duration_days, num_reviews)
            for bank, app_id in app_ids.items()
        ]

        # Collect in submission order so the output ordering is stable.
        for future in futures:
            all_reviews.extend(future.result())

    # Create a DataFrame from the collected reviews
    df_reviews = pd.DataFrame(all_reviews)

    # Print the head of the DataFrame
    print(df_reviews.head())

    csv_file_path = '../data/google_play_reviews.csv'

    # Ensure the directory the CSV is actually written to exists. Creating
    # './data' does not help when the target lives under '../data'.
    os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

    # Save the DataFrame to a CSV file
    df_reviews.to_csv(csv_file_path, index=False)

    logger.info("Saved reviews data to %s", csv_file_path)

# Script entry point: scrape every configured app, using the default
# look-back window and fetching at most 200 reviews per app.
if __name__ == "__main__":
    track_reviews_and_downloads(app_ids, num_reviews=200)

INFO:__main__:Fetching data for Abyssinia Bank
INFO:__main__:Fetching data for Commercial Bank of Ethiopia
