In [3]:
import logging
import pandas as pd
from google_play_scraper import reviews, app
from urllib.error import HTTPError
from datetime import datetime, timedelta
import time
import os
from concurrent.futures import ThreadPoolExecutor

# Configure root logging at INFO level and create the logger used by the
# functions defined in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Bank display name -> Google Play package ID of that bank's app.
# Passed to track_reviews_and_downloads at the bottom of this cell.
app_ids = {
    'Abyssinia Bank': 'com.boa.boaMobileBanking',
    'Commercial Bank of Ethiopia': 'com.combanketh.mobilebanking',
}

def fetch_reviews(app_id, num_reviews=100):
    """Fetch up to ``num_reviews`` reviews for a Google Play app.

    Pages through ``reviews`` (English language, US store), feeding each
    call's continuation token into the next call, until enough reviews have
    been collected or the listing is exhausted.

    Args:
        app_id: Google Play package ID of the app.
        num_reviews: Maximum number of reviews to return.

    Returns:
        A list of at most ``num_reviews`` review records, in the order the
        scraper returned them. If an ``HTTPError`` occurs, the reviews
        gathered before the error are returned.
    """
    all_reviews = []
    continuation_token = None

    while len(all_reviews) < num_reviews:
        try:
            result, continuation_token = reviews(
                app_id,
                lang='en',  # defaults to 'en'
                country='us',  # defaults to 'us'
                continuation_token=continuation_token
            )
        except HTTPError as e:
            logger.error("HTTPError while fetching reviews for app ID %s: %s", app_id, e)
            break

        all_reviews.extend(result)
        # Stop on an empty page as well as on a missing token: a page with no
        # reviews cannot make progress, and if the returned token is still
        # truthy the loop would otherwise never terminate for apps that have
        # fewer than num_reviews reviews.
        if not result or not continuation_token:
            break

    return all_reviews[:num_reviews]

def fetch_app_details(app_id):
    """Return the scraper's metadata for ``app_id``.

    The result of ``app(app_id)`` is passed through untouched. If the lookup
    raises ``HTTPError`` the error is logged and an empty dict is returned
    instead, so callers can use ``.get`` on the result either way.
    """
    try:
        return app(app_id)
    except HTTPError as err:
        logger.error("HTTPError while fetching details for app ID %s: %s", app_id, err)
    return {}

def fetch_data_for_bank(bank, app_id, duration_days=7, num_reviews=100):
    """Collect recent reviews and a download-count snapshot for one bank.

    Args:
        bank: Display name of the bank; copied into every output record.
        app_id: Google Play package ID of the bank's app.
        duration_days: Only reviews whose ``'at'`` time falls within this
            many days before now are kept.
        num_reviews: Maximum number of reviews to request before filtering.

    Returns:
        A ``(reviews, download_data)`` tuple. ``reviews`` is a list of flat
        dicts, one per kept review, including empty placeholder columns for
        later analysis. ``download_data`` is always a single dict with the
        keys ``bank``, ``appId``, ``timestamp`` and ``downloads`` -- also when
        no reviews were found, so the caller can append it unconditionally.
    """
    end_time = datetime.now()
    start_time = end_time - timedelta(days=duration_days)

    logger.info("Fetching data for %s", bank)

    # Fetch reviews and keep only those inside the time window.
    reviews_data = fetch_reviews(app_id, num_reviews)
    if reviews_data:
        logger.info("Fetched %d reviews for %s", len(reviews_data), bank)
        filtered_reviews = [
            review for review in reviews_data
            if datetime.fromtimestamp(review['at'].timestamp()) >= start_time
        ]
        logger.info("Filtered down to %d reviews for %s", len(filtered_reviews), bank)
    else:
        logger.info("No reviews found for %s", bank)
        filtered_reviews = []

    # Fetch app details; the install count does not depend on the reviews,
    # so it is recorded even when the review list is empty.
    details = fetch_app_details(app_id)
    download_count = details.get('installs', 'N/A')

    all_reviews = [
        {
            'bank': bank,
            'appId': app_id,
            'reviewId': review['reviewId'],
            'userName': review['userName'],
            'userImage': review['userImage'],
            'thumbsUpCount': review['thumbsUpCount'],
            'reviewCreatedVersion': review.get('reviewCreatedVersion'),
            'at': review['at'],
            'replyContent': review.get('replyContent', ''),
            'repliedAt': review.get('repliedAt', ''),
            'appVersion': review.get('appVersion', ''),
            'score': review['score'],
            'content': review['content'],
            'keywords': '',  # Placeholder for keywords
            'LDA_Category': '',  # Placeholder for LDA category
            'Sentiment': '',  # Placeholder for sentiment
            'Insight': ''  # Placeholder for insight
        }
        for review in filtered_reviews
    ]

    # Download count entry stamped with the time this collection started.
    download_data = {
        'bank': bank,
        'appId': app_id,
        'timestamp': end_time,
        'downloads': download_count
    }

    return all_reviews, download_data

def track_reviews_and_downloads(app_ids, duration_days=7, num_reviews=100):
    """Collect reviews and download counts for every bank and save them as CSV.

    One ``fetch_data_for_bank`` call per entry of ``app_ids`` is run on a
    thread pool. The combined results are printed (first rows only) and
    written to ``../data/google_play_reviews.csv`` and
    ``../data/google_play_downloads.csv``, relative to the working directory.

    Args:
        app_ids: Mapping of bank display name to Google Play package ID.
        duration_days: Review time window, forwarded to ``fetch_data_for_bank``.
        num_reviews: Per-app review limit, forwarded to ``fetch_data_for_bank``.
    """
    all_reviews = []
    all_downloads = []

    # ThreadPoolExecutor rejects max_workers=0, so keep at least one worker
    # even when app_ids is empty.
    with ThreadPoolExecutor(max_workers=max(1, len(app_ids))) as executor:
        futures = [
            executor.submit(fetch_data_for_bank, bank, app_id, duration_days, num_reviews)
            for bank, app_id in app_ids.items()
        ]

        # Results are consumed in submission order, so output order follows
        # the order of app_ids.
        for future in futures:
            bank_reviews, bank_downloads = future.result()
            all_reviews.extend(bank_reviews)
            # Skip empty placeholders so only real records reach the frame.
            if bank_downloads:
                all_downloads.append(bank_downloads)

    # Create DataFrames from the collected reviews and download counts
    df_reviews = pd.DataFrame(all_reviews)
    df_downloads = pd.DataFrame(all_downloads)

    # Print the heads of the DataFrames
    print("Reviews DataFrame head:")
    print(df_reviews.head())
    print("Downloads DataFrame head:")
    print(df_downloads.head())

    # Save the DataFrames to separate CSV files
    reviews_csv_file_path = '../data/google_play_reviews.csv'
    downloads_csv_file_path = '../data/google_play_downloads.csv'

    # Create the directory the files are actually written to ('../data'),
    # not a 'data' folder inside the current working directory.
    for csv_path in (reviews_csv_file_path, downloads_csv_file_path):
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    df_reviews.to_csv(reviews_csv_file_path, index=False)
    df_downloads.to_csv(downloads_csv_file_path, index=False)

    logger.info("Saved reviews data to %s", reviews_csv_file_path)
    logger.info("Saved downloads data to %s", downloads_csv_file_path)

# Entry point: collect data for the banks configured in app_ids, using the
# default 7-day window and at most 100 reviews per app.
if __name__ == "__main__":
    track_reviews_and_downloads(app_ids, num_reviews=100)


INFO:__main__:Fetching data for Abyssinia Bank
INFO:__main__:Fetching data for Commercial Bank of Ethiopia
INFO:__main__:Fetched 100 reviews for Commercial Bank of Ethiopia
INFO:__main__:Filtered down to 33 reviews for Commercial Bank of Ethiopia
INFO:__main__:Fetched 100 reviews for Abyssinia Bank
INFO:__main__:Filtered down to 23 reviews for Abyssinia Bank
INFO:__main__:Saved reviews data to ../data/google_play_reviews.csv
INFO:__main__:Saved downloads data to ../data/google_play_downloads.csv


Reviews DataFrame head:
             bank                     appId  \
0  Abyssinia Bank  com.boa.boaMobileBanking   
1  Abyssinia Bank  com.boa.boaMobileBanking   
2  Abyssinia Bank  com.boa.boaMobileBanking   
3  Abyssinia Bank  com.boa.boaMobileBanking   
4  Abyssinia Bank  com.boa.boaMobileBanking   

                               reviewId        userName  \
0  f77c9ee3-07b7-4203-9aaa-f019d35abaa6  Chernet Bekele   
1  622762e8-14b9-42eb-b807-8960278d8b34        Hermon Z   
2  9756a397-2464-40d6-8eab-d269c119aa9e    Davinci Tube   
3  fe54afea-6f7b-4fee-bc8a-4af38050cc54  Mohammed Kasim   
4  9340824c-f839-4afe-81af-cd398088f470   Ahadu tesfaye   

                                           userImage  thumbsUpCount  \
0  https://play-lh.googleusercontent.com/a/ACg8oc...              1   
1  https://play-lh.googleusercontent.com/a-/ALV-U...              0   
2  https://play-lh.googleusercontent.com/a-/ALV-U...              0   
3  https://play-lh.googleusercontent.com/a-/ALV-U...  

In [7]:
import logging
import pandas as pd
from google_play_scraper import reviews, app
from urllib.error import HTTPError, URLError
from datetime import datetime, timedelta
import time
import os
from concurrent.futures import ThreadPoolExecutor
import random

# Configure root logging at INFO level and create the logger used by the
# functions defined in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Bank display name -> Google Play package ID of that bank's app.
# Passed to track_reviews_and_downloads at the bottom of this cell.
app_ids = {
    'Abyssinia Bank': 'com.boa.boaMobileBanking',
    'Commercial Bank of Ethiopia': 'com.combanketh.mobilebanking',
}

def fetch_reviews(app_id, num_reviews=100, max_retries=5):
    """Fetch up to ``num_reviews`` reviews for an app, retrying on errors.

    Pages through ``reviews`` (English language, US store) using the
    continuation token of each call. ``HTTPError``/``URLError`` failures are
    retried with exponential backoff plus jitter; the retry budget is shared
    across the whole call, not reset per page.

    Args:
        app_id: Google Play package ID of the app.
        num_reviews: Maximum number of reviews to return.
        max_retries: Total number of failed requests tolerated before
            giving up.

    Returns:
        A list of at most ``num_reviews`` review records. If the retry
        budget runs out, the reviews gathered so far are returned.
    """
    all_reviews = []
    continuation_token = None
    retries = 0

    while len(all_reviews) < num_reviews and retries < max_retries:
        try:
            result, continuation_token = reviews(
                app_id,
                lang='en',  # defaults to 'en'
                country='us',  # defaults to 'us'
                continuation_token=continuation_token
            )
        except (HTTPError, URLError) as e:
            logger.error("Error while fetching reviews for app ID %s: %s", app_id, e)
            retries += 1
            # No point backing off when there is no attempt left to wait for.
            if retries >= max_retries:
                break
            sleep_time = (2 ** retries) + random.uniform(0, 1)
            logger.info("Retrying in %f seconds...", sleep_time)
            time.sleep(sleep_time)
            continue

        all_reviews.extend(result)
        # Stop on an empty page as well as on a missing token: a page with no
        # reviews cannot make progress, and if the returned token is still
        # truthy the loop would otherwise never terminate for apps that have
        # fewer than num_reviews reviews.
        if not result or not continuation_token:
            break

    return all_reviews[:num_reviews]

def fetch_app_details(app_id, max_retries=5):
    """Fetch the current install count for an app, retrying on errors.

    Args:
        app_id: Google Play package ID of the app.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        ``{'downloads': <installs value or 'N/A'>, 'date': <today's date>}``
        on success, or an empty dict if every attempt raised
        ``HTTPError``/``URLError``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            details = app(app_id)
            download_count = details.get('installs', 'N/A')
            current_date = datetime.now().date()
            return {'downloads': download_count, 'date': current_date}
        except (HTTPError, URLError) as e:
            logger.error("Error while fetching details for app ID %s: %s", app_id, e)
            # After the final attempt there is nothing left to wait for, so
            # return immediately instead of sleeping through the longest backoff.
            if attempt == max_retries:
                break
            # Exponential backoff with up to one second of random jitter.
            sleep_time = (2 ** attempt) + random.uniform(0, 1)
            logger.info("Retrying in %f seconds...", sleep_time)
            time.sleep(sleep_time)
    return {}

def fetch_data_for_bank(bank, app_id, duration_days=7, num_reviews=100):
    """Collect recent reviews and a download-count snapshot for one bank.

    Args:
        bank: Display name of the bank; copied into every output record.
        app_id: Google Play package ID of the bank's app.
        duration_days: Only reviews whose ``'at'`` time falls within this
            many days before now are kept.
        num_reviews: Maximum number of reviews to request before filtering.

    Returns:
        A ``(reviews, downloads)`` tuple of two lists of dicts. ``reviews``
        holds one flat record per kept review, including empty placeholder
        columns for later analysis. ``downloads`` holds a single record
        (``bank``, ``appId``, ``date``, ``downloads``) for the snapshot taken
        during this call.
    """
    end_time = datetime.now()
    start_time = end_time - timedelta(days=duration_days)

    logger.info("Fetching data for %s", bank)

    # Fetch reviews and keep only those inside the time window.
    reviews_data = fetch_reviews(app_id, num_reviews)
    if reviews_data:
        logger.info("Fetched %d reviews for %s", len(reviews_data), bank)
        filtered_reviews = [
            review for review in reviews_data
            if datetime.fromtimestamp(review['at'].timestamp()) >= start_time
        ]
        logger.info("Filtered down to %d reviews for %s", len(filtered_reviews), bank)
    else:
        logger.info("No reviews found for %s", bank)
        filtered_reviews = []

    # fetch_app_details is called without any date argument, so every call
    # yields the same present-day snapshot. Query it once and record one row
    # rather than repeating the request (and the row) per day of the window.
    details = fetch_app_details(app_id)
    download_data_list = [{
        'bank': bank,
        'appId': app_id,
        # Fall back to today's date (a date, matching the normal value's
        # type) when the details lookup failed and returned {}.
        'date': details.get('date', end_time.date()),
        'downloads': details.get('downloads', 'N/A')
    }]

    all_reviews = [
        {
            'bank': bank,
            'appId': app_id,
            'reviewId': review['reviewId'],
            'userName': review['userName'],
            'userImage': review['userImage'],
            'thumbsUpCount': review['thumbsUpCount'],
            'reviewCreatedVersion': review.get('reviewCreatedVersion'),
            'at': review['at'],
            'replyContent': review.get('replyContent', ''),
            'repliedAt': review.get('repliedAt', ''),
            'appVersion': review.get('appVersion', ''),
            'score': review['score'],
            'content': review['content'],
            'keywords': '',  # Placeholder for keywords
            'LDA_Category': '',  # Placeholder for LDA category
            'Sentiment': '',  # Placeholder for sentiment
            'Insight': ''  # Placeholder for insight
        }
        for review in filtered_reviews
    ]

    return all_reviews, download_data_list

def track_reviews_and_downloads(app_ids, duration_days=7, num_reviews=100):
    """Collect reviews and download counts for every bank and save them as CSV.

    One ``fetch_data_for_bank`` call per entry of ``app_ids`` is run on a
    thread pool. The combined results are printed (first rows only) and
    written to ``../data/google_play_reviews.csv`` and
    ``../data/google_play_downloads.csv``, relative to the working directory.

    Args:
        app_ids: Mapping of bank display name to Google Play package ID.
        duration_days: Review time window, forwarded to ``fetch_data_for_bank``.
        num_reviews: Per-app review limit, forwarded to ``fetch_data_for_bank``.
    """
    all_reviews = []
    all_downloads = []

    # ThreadPoolExecutor rejects max_workers=0, so keep at least one worker
    # even when app_ids is empty.
    with ThreadPoolExecutor(max_workers=max(1, len(app_ids))) as executor:
        futures = [
            executor.submit(fetch_data_for_bank, bank, app_id, duration_days, num_reviews)
            for bank, app_id in app_ids.items()
        ]

        # Results are consumed in submission order, so output order follows
        # the order of app_ids.
        for future in futures:
            bank_reviews, bank_downloads = future.result()
            all_reviews.extend(bank_reviews)
            all_downloads.extend(bank_downloads)

    # Create DataFrames from the collected reviews and download counts
    df_reviews = pd.DataFrame(all_reviews)
    df_downloads = pd.DataFrame(all_downloads)

    # Print the heads of the DataFrames
    print("Reviews DataFrame head:")
    print(df_reviews.head())
    print("Downloads DataFrame head:")
    print(df_downloads.head())

    # Save the DataFrames to separate CSV files
    reviews_csv_file_path = '../data/google_play_reviews.csv'
    downloads_csv_file_path = '../data/google_play_downloads.csv'

    # Create the directory the files are actually written to ('../data'),
    # not a 'data' folder inside the current working directory.
    for csv_path in (reviews_csv_file_path, downloads_csv_file_path):
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    df_reviews.to_csv(reviews_csv_file_path, index=False)
    df_downloads.to_csv(downloads_csv_file_path, index=False)

    logger.info("Saved reviews data to %s", reviews_csv_file_path)
    logger.info("Saved downloads data to %s", downloads_csv_file_path)

# Entry point: collect data for the banks configured in app_ids, using the
# default 7-day window and at most 100 reviews per app.
if __name__ == "__main__":
    track_reviews_and_downloads(app_ids, num_reviews=100)

INFO:__main__:Fetching data for Abyssinia Bank
INFO:__main__:Fetching data for Commercial Bank of Ethiopia
INFO:__main__:Fetched 100 reviews for Commercial Bank of Ethiopia
INFO:__main__:Filtered down to 33 reviews for Commercial Bank of Ethiopia
INFO:__main__:Fetched 100 reviews for Abyssinia Bank
INFO:__main__:Filtered down to 23 reviews for Abyssinia Bank
INFO:__main__:Saved reviews data to ../data/google_play_reviews.csv
INFO:__main__:Saved downloads data to ../data/google_play_downloads.csv


Reviews DataFrame head:
             bank                     appId  \
0  Abyssinia Bank  com.boa.boaMobileBanking   
1  Abyssinia Bank  com.boa.boaMobileBanking   
2  Abyssinia Bank  com.boa.boaMobileBanking   
3  Abyssinia Bank  com.boa.boaMobileBanking   
4  Abyssinia Bank  com.boa.boaMobileBanking   

                               reviewId        userName  \
0  f77c9ee3-07b7-4203-9aaa-f019d35abaa6  Chernet Bekele   
1  622762e8-14b9-42eb-b807-8960278d8b34        Hermon Z   
2  9756a397-2464-40d6-8eab-d269c119aa9e    Davinci Tube   
3  fe54afea-6f7b-4fee-bc8a-4af38050cc54  Mohammed Kasim   
4  9340824c-f839-4afe-81af-cd398088f470   Ahadu tesfaye   

                                           userImage  thumbsUpCount  \
0  https://play-lh.googleusercontent.com/a/ACg8oc...              1   
1  https://play-lh.googleusercontent.com/a-/ALV-U...              0   
2  https://play-lh.googleusercontent.com/a-/ALV-U...              0   
3  https://play-lh.googleusercontent.com/a-/ALV-U...  

In [11]:
import pandas as pd
import os
from google_play_scraper import Sort, reviews

def collect_all_reviews(app_id):
    """Download every review page available for ``app_id``.

    Requests English / US-store reviews with ``Sort.NEWEST`` ordering in
    batches of 100, passing each call's continuation token to the next call.
    Paging stops when a call returns no token or an empty page.

    Returns:
        A list with the records of all pages, in the order received.
    """
    ordering = Sort.NEWEST
    collected = []
    token = None

    while True:
        page, token = reviews(
            app_id,
            lang='en',
            country='us',
            sort=ordering,
            count=100,
            continuation_token=token,
        )
        collected.extend(page)

        exhausted = not token or len(page) == 0
        if exhausted:
            break

    return collected

def save_reviews_to_csv(reviews_list, csv_filename, bank='Abyssinia Bank',
                        app_id='com.boa.boaMobileBanking'):
    """Write scraped reviews to ``<parent of cwd>/data/<csv_filename>``.

    Args:
        reviews_list: Review records as returned by the scraper. Each must
            provide ``reviewId``, ``userName``, ``userImage``,
            ``thumbsUpCount``, ``at``, ``score`` and ``content``; the other
            fields are optional.
        csv_filename: File name (not path) of the CSV to create.
        bank: Value written to the ``bank`` column of every row.
        app_id: Value written to the ``appId`` column of every row. The
            default is the label this function has always written; pass the
            package ID the reviews were actually collected for if it differs.
    """
    # Column names based on the PostgreSQL table schema. Listing them
    # explicitly keeps the header present even when reviews_list is empty;
    # a header-less file cannot be read back with pd.read_csv.
    columns = [
        'bank', 'appId', 'review_id', 'username', 'user_image', 'likes',
        'review_created_version', 'created_at', 'reply_content', 'replied_at',
        'app_version', 'score', 'comments', 'keywords', 'lda_category',
        'sentiment', 'insight',
    ]

    reviews_data = [{
        'bank': bank,  # Adding the bank column first
        'appId': app_id,
        'review_id': review['reviewId'],
        'username': review['userName'],
        'user_image': review['userImage'],
        'likes': review['thumbsUpCount'],
        'review_created_version': review.get('reviewCreatedVersion', None),
        'created_at': review['at'],
        'reply_content': review.get('replyContent', None),
        'replied_at': review.get('repliedAt', None),
        'app_version': review.get('reviewCreatedVersion', None),  # Assuming app version is the same as review created version
        'score': review['score'],
        'comments': review['content'],
        'keywords': None,  # Placeholder for future data
        'lda_category': None,  # Placeholder for future data
        'sentiment': None,  # Placeholder for future data
        'insight': None  # Placeholder for future data
    } for review in reviews_list]

    df = pd.DataFrame(reviews_data, columns=columns)

    # The 'data' directory lives next to (not inside) the working directory.
    data_directory = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(data_directory, exist_ok=True)

    full_csv_path = os.path.join(data_directory, csv_filename)
    df.to_csv(full_csv_path, index=False, encoding='utf-8')

    print(f"Collected {len(reviews_list)} reviews and saved to {full_csv_path}")

def main(app_id, csv_filename):
    """Collect all reviews for ``app_id`` and save them as ``csv_filename``."""
    save_reviews_to_csv(collect_all_reviews(app_id), csv_filename)

# Entry point: collect every review of the 'com.boa.apollo' app and save it
# as boa_app_reviews.csv.
# NOTE(review): save_reviews_to_csv labels every row with appId
# 'com.boa.boaMobileBanking', which differs from the app_id collected here
# -- confirm this is intended.
if __name__ == '__main__':
    app_id = 'com.boa.apollo'
    csv_filename = 'boa_app_reviews.csv'
    main(app_id, csv_filename)

Collected 400 reviews and saved to d:\marketing_analytics\data\boa_app_reviews.csv


In [12]:
import pandas as pd
import os
from google_play_scraper import Sort, reviews

def collect_all_reviews(app_id):
    """Gather the reviews of ``app_id`` across all result pages.

    Each request asks for 100 English / US-store reviews sorted with
    ``Sort.NEWEST`` and resumes from the previous call's continuation token.
    The loop ends once a call yields no token or no reviews.

    Returns:
        All received review records as one list, in arrival order.
    """
    request_options = {
        'lang': 'en',
        'country': 'us',
        'sort': Sort.NEWEST,
        'count': 100,
    }
    gathered = []
    cursor = None

    while True:
        batch, cursor = reviews(app_id, continuation_token=cursor, **request_options)
        gathered.extend(batch)
        if not cursor or len(batch) == 0:
            return gathered

def save_reviews_to_csv(reviews_list, csv_filename, bank='Abyssinia Bank',
                        app_id='com.boa.boaMobileBanking'):
    """Write scraped reviews to ``<parent of cwd>/data/<csv_filename>``.

    Args:
        reviews_list: Review records as returned by the scraper. Each must
            provide ``reviewId``, ``userName``, ``userImage``,
            ``thumbsUpCount``, ``at``, ``score`` and ``content``; the other
            fields are optional.
        csv_filename: File name (not path) of the CSV to create.
        bank: Value written to the ``bank`` column of every row.
        app_id: Value written to the ``appId`` column of every row. The
            default is the label this function has always written; pass the
            package ID the reviews were actually collected for if it differs.
    """
    # Column names based on the PostgreSQL table schema. Listing them
    # explicitly keeps the header present even when reviews_list is empty;
    # a header-less file cannot be read back with pd.read_csv.
    columns = [
        'bank', 'appId', 'review_id', 'username', 'user_image', 'likes',
        'review_created_version', 'created_at', 'reply_content', 'replied_at',
        'app_version', 'score', 'comments', 'keywords', 'lda_category',
        'sentiment', 'insight',
    ]

    reviews_data = [{
        'bank': bank,  # Adding the bank column first
        'appId': app_id,
        'review_id': review['reviewId'],
        'username': review['userName'],
        'user_image': review['userImage'],
        'likes': review['thumbsUpCount'],
        'review_created_version': review.get('reviewCreatedVersion', None),
        'created_at': review['at'],
        'reply_content': review.get('replyContent', None),
        'replied_at': review.get('repliedAt', None),
        'app_version': review.get('reviewCreatedVersion', None),  # Assuming app version is the same as review created version
        'score': review['score'],
        'comments': review['content'],
        'keywords': None,  # Placeholder for future data
        'lda_category': None,  # Placeholder for future data
        'sentiment': None,  # Placeholder for future data
        'insight': None  # Placeholder for future data
    } for review in reviews_list]

    df = pd.DataFrame(reviews_data, columns=columns)

    # The 'data' directory lives next to (not inside) the working directory.
    data_directory = os.path.join(os.path.dirname(os.getcwd()), 'data')
    os.makedirs(data_directory, exist_ok=True)

    full_csv_path = os.path.join(data_directory, csv_filename)
    df.to_csv(full_csv_path, index=False, encoding='utf-8')

    print(f"Collected {len(reviews_list)} reviews and saved to {full_csv_path}")

def merge_csv_files(new_csv_filename, existing_csv_filename, output_csv_filename):
    """Append the rows of one CSV to another and save the combined result.

    All three names are file names inside ``<parent of cwd>/data``. The
    output may be the same file as the existing one; identical rows are
    written only once, so re-running the merge does not grow the file.

    Args:
        new_csv_filename: CSV with the newly collected reviews.
        existing_csv_filename: CSV to append to. If it does not exist yet,
            the output contains just the new rows.
        output_csv_filename: CSV to write the merged rows to.
    """
    # The 'data' directory lives next to (not inside) the working directory.
    data_directory = os.path.join(os.path.dirname(os.getcwd()), 'data')

    new_csv_path = os.path.join(data_directory, new_csv_filename)
    existing_csv_path = os.path.join(data_directory, existing_csv_filename)
    output_csv_path = os.path.join(data_directory, output_csv_filename)

    new_reviews_df = pd.read_csv(new_csv_path)

    if os.path.exists(existing_csv_path):
        existing_reviews_df = pd.read_csv(existing_csv_path)
        # Existing rows first, new rows after them.
        # NOTE(review): concat aligns by column name; if the two files use
        # different column names the result holds the union of both sets
        # with gaps -- confirm both files share one schema.
        merged_df = pd.concat([existing_reviews_df, new_reviews_df], ignore_index=True)
    else:
        merged_df = new_reviews_df

    # Drop exact duplicate rows so repeated merges into the same file are
    # idempotent; the first occurrence is kept.
    merged_df = merged_df.drop_duplicates().reset_index(drop=True)

    merged_df.to_csv(output_csv_path, index=False, encoding='utf-8')

    print(f"Merged CSV saved to {output_csv_path}")

def main(app_id, new_csv_filename, existing_csv_filename, output_csv_filename):
    """Collect reviews for ``app_id``, save them, then merge them into the existing CSV.

    The freshly collected reviews are written to ``new_csv_filename`` first;
    that file is then merged with ``existing_csv_filename`` into
    ``output_csv_filename``.
    """
    save_reviews_to_csv(collect_all_reviews(app_id), new_csv_filename)
    merge_csv_files(
        new_csv_filename,
        existing_csv_filename,
        output_csv_filename,
    )

# Entry point: collect the 'com.boa.apollo' reviews, save them as
# boa_app_reviews.csv and merge them into google_play_reviews.csv.
# The existing and output file names are identical, so the combined file is
# updated in place.
if __name__ == '__main__':
    app_id = 'com.boa.apollo'
    new_csv_filename = 'boa_app_reviews.csv'
    existing_csv_filename = 'google_play_reviews.csv'
    output_csv_filename = 'google_play_reviews.csv'
    main(app_id, new_csv_filename, existing_csv_filename, output_csv_filename)


Collected 100 reviews and saved to d:\marketing_analytics\data\boa_app_reviews.csv
Merged CSV saved to d:\marketing_analytics\data\google_play_reviews.csv


In [13]:
import pandas as pd

# Define the file paths. The merged result is written back over the same
# combined file it was read from.
boa_app_reviews_path = '../data/boa_app_reviews.csv'
google_play_reviews_path = '../data/google_play_reviews.csv'
merged_reviews_path = '../data/google_play_reviews.csv'

# Read the CSV files
boa_app_reviews = pd.read_csv(boa_app_reviews_path)
google_play_reviews = pd.read_csv(google_play_reviews_path)

# Debug: Print the first few rows of each DataFrame
print("BOA App Reviews:")
print(boa_app_reviews.head())
print("\nGoogle Play Reviews:")
print(google_play_reviews.head())

# Merge the DataFrames: existing combined rows first, BOA rows after them.
# Exact duplicate rows are dropped so that re-running this cell (or running
# it after the BOA reviews were already merged into the combined file) does
# not append the same reviews again.
merged_reviews = (
    pd.concat([google_play_reviews, boa_app_reviews], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Debug: Print the first few rows of the merged DataFrame
print("\nMerged Reviews:")
print(merged_reviews.head())

# Save the merged DataFrame to a CSV file
merged_reviews.to_csv(merged_reviews_path, index=False)

# Confirm the merge and save operation
print(f"\nMerged file saved to {merged_reviews_path}")

BOA App Reviews:
             bank                     appId  \
0  Abyssinia Bank  com.boa.boaMobileBanking   
1  Abyssinia Bank  com.boa.boaMobileBanking   
2  Abyssinia Bank  com.boa.boaMobileBanking   
3  Abyssinia Bank  com.boa.boaMobileBanking   
4  Abyssinia Bank  com.boa.boaMobileBanking   

                              review_id        username  \
0  d80d7cd7-3d12-420e-b34e-31226970326e     Batu Yenatu   
1  677af726-36b5-4a32-871c-d2baf1a8c303   Yonas Tadesse   
2  738b6d3a-2d7f-4a82-bfe2-dcf32591f944  NAHOM NIGUSSIE   
3  e20abe49-8fe7-42fe-af3a-91399875b21a   Abbatu Ermias   
4  55c833c8-a942-47bb-aaaa-e022f39e28af    Habtsh Darge   

                                          user_image  likes  \
0  https://play-lh.googleusercontent.com/a/ACg8oc...      0   
1  https://play-lh.googleusercontent.com/a-/ALV-U...      0   
2  https://play-lh.googleusercontent.com/a-/ALV-U...      1   
3  https://play-lh.googleusercontent.com/a/ACg8oc...      0   
4  https://play-lh.googleuserc