In [1]:
import sys
import os

# Add project root (one directory above "notebooks") so that the local
# `scripts` package can be imported.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [5]:
import pandas as pd
from scripts.scrape_reviews import scrape_reviews_for_app

In [6]:


# Bank label -> Google Play package id of that bank's mobile app.
# Store listing for any entry:
#   https://play.google.com/store/apps/details?id=<package id>
APPS = dict(
    CBE="com.combanketh.mobilebanking",
    BOA="com.boa.boaMobileBanking",
    Dashen="com.dashen.dashensuperapp",
)



In [7]:
# Scrape every configured app in turn and pool the results into one list.
all_reviews = []
for bank_label, package_id in APPS.items():
    all_reviews += scrape_reviews_for_app(bank_label, package_id)

ðŸ”¹ Scraping CBE...
âœ… Finished CBE (400 reviews)
ðŸ”¹ Scraping BOA...
âœ… Finished BOA (400 reviews)
ðŸ”¹ Scraping Dashen...
âœ… Finished Dashen (400 reviews)


In [8]:
# Build a DataFrame from the accumulated scraper output and preview the
# first five rows.
df = pd.DataFrame(data=all_reviews)
df.head(n=5)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,bank
0,3d88a334-958c-4717-9f97-c5d46359e054,samson getachew,https://play-lh.googleusercontent.com/a/ACg8oc...,It is nice app,5,0,5.2.1,2025-11-26 12:03:18,,NaT,5.2.1,CBE
1,99d376ea-4824-4af9-a093-27360acc3a5c,Nejbadin Ali,https://play-lh.googleusercontent.com/a-/ALV-U...,best,5,0,5.2.1,2025-11-25 20:27:20,,NaT,5.2.1,CBE
2,f1861daf-a1ed-407a-9e7c-295edbb3877d,Amman Mom,https://play-lh.googleusercontent.com/a/ACg8oc...,good app,5,0,5.2.1,2025-11-25 18:10:35,,NaT,5.2.1,CBE
3,fd178fb7-7026-4d02-98a0-5c86c3bd56f5,Nuredin Adem,https://play-lh.googleusercontent.com/a/ACg8oc...,it suddenly asked me to enter the verification...,1,0,5.2.1,2025-11-25 17:14:22,,NaT,5.2.1,CBE
4,571c66c6-fd18-437b-b8e5-8c443e9db2df,Abdulfeta Mifta,https://play-lh.googleusercontent.com/a/ACg8oc...,nice app...,5,0,,2025-11-25 10:24:19,,NaT,,CBE


In [9]:
# Persist the raw scrape. The target folder is created first so that the
# write does not fail when ../data does not exist yet (e.g. fresh checkout).
raw_csv_path = "../data/raw_bank_reviews.csv"
os.makedirs(os.path.dirname(raw_csv_path), exist_ok=True)
df.to_csv(raw_csv_path, index=False)
print("ðŸ’¾ Saved to data/raw_bank_reviews.csv")


ðŸ’¾ Saved to data/raw_bank_reviews.csv


In [43]:
def remove_duplicates(df):
    """
    Drop repeated reviews, keeping the first occurrence of each.

    A row is treated as a duplicate only when BOTH its ``reviewId`` and its
    ``content`` match an earlier row. The number of dropped rows is printed.

    Parameters:
        df (DataFrame): Raw scraped reviews with ``reviewId`` and
            ``content`` columns.

    Returns:
        DataFrame: Reviews with duplicates removed. Index labels of the
        surviving rows are left as they were.
    """
    deduped = df.drop_duplicates(subset=["reviewId", "content"], keep="first")

    # Report how many rows the de-duplication discarded
    dropped = len(df) - len(deduped)
    print(f"ðŸ”¹ Removed {dropped} duplicate reviews.")
    return deduped


def remove_empty_reviews(df):
    """
    Discard reviews that carry no text.

    Rows are removed when ``content`` is missing (NaN/None) or consists only
    of whitespace. The number of removed rows is printed.

    Parameters:
        df (DataFrame): Reviews with a ``content`` column.

    Returns:
        DataFrame: Reviews with empty text removed.
    """
    # First pass: rows whose text is missing altogether
    non_null = df.dropna(subset=["content"])

    # Second pass: whitespace-only strings become "" once stripped
    has_text = non_null["content"].str.strip() != ""
    kept = non_null[has_text]

    print(f"ðŸ”¹ Removed {len(df) - len(kept)} empty reviews.")
    return kept


def normalize_dates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a ``date`` column holding the calendar day of the ``at`` column.

    ``at`` is parsed with ``pd.to_datetime``; values that cannot be parsed
    become ``NaT``. The time-of-day part is reset to midnight, so the result
    stays a datetime column. The resulting dtype is printed.

    The input frame is not modified: a new DataFrame is returned. This also
    avoids the SettingWithCopyWarning pandas can emit when a column is
    assigned on a frame that was produced by filtering another frame.

    Parameters:
        df (DataFrame): Reviews with an ``at`` timestamp column.

    Returns:
        DataFrame: Copy of ``df`` with the extra (or replaced) ``date`` column.
    """
    day_only = pd.to_datetime(df["at"], errors="coerce").dt.normalize()

    # assign() builds a new frame instead of writing into the caller's object
    result = df.assign(date=day_only)
    print(f"Date normalized â†’ {result['date'].dtype}")
    return result

def standardize_bank_names(df):
    """
    Ensure all bank names follow a consistent uppercase format.

    Surrounding whitespace is stripped from ``bank`` and the value is
    upper-cased. The input frame is not modified: a new DataFrame is
    returned, so the caller's data is never changed behind its back.

    Parameters:
        df (DataFrame): Reviews with a string ``bank`` column.

    Returns:
        DataFrame: Copy of ``df`` with standardized bank names.
    """
    cleaned_names = df["bank"].str.strip().str.upper()

    # assign() returns a new frame rather than writing into the input
    result = df.assign(bank=cleaned_names)
    print("ðŸ”¹ Standardized bank names.")
    return result



def select_required_columns(df):
    """
    Keep only the required columns:
    review, rating, date, bank, source.

    ``review`` is taken from ``content``, ``rating`` from ``score`` and
    ``source`` is the constant "Google Play Store". ``date`` and ``bank``
    must already be present. The input frame is not modified (no helper
    columns are added to it); a new DataFrame is returned.

    Parameters:
        df (DataFrame): Fully preprocessed reviews.

    Returns:
        DataFrame: Final clean dataset with exactly the five columns above,
        in that order.
    """
    final_columns = ["review", "rating", "date", "bank", "source"]

    # Map original columns to final standardized names on a new frame,
    # leaving the caller's DataFrame as it was.
    mapped = df.assign(
        review=df["content"],
        rating=df["score"],
        source="Google Play Store",
    )
    print("ðŸ”¹ Selected required final columns.")
    return mapped[final_columns]


In [44]:
# Run the cleaning steps in order on a working copy, so the raw `df`
# stays available for inspection.
df_clean = (
    df.copy()
    .pipe(remove_duplicates)
    .pipe(remove_empty_reviews)
    .pipe(normalize_dates)
    .pipe(standardize_bank_names)
    .pipe(select_required_columns)
)

# Reviews per bank after cleaning, followed by a preview of the result
print(df_clean['bank'].value_counts())
df_clean.head()


ðŸ”¹ Removed 0 duplicate reviews.
ðŸ”¹ Removed 0 empty reviews.
Date normalized â†’ datetime64[ns]
ðŸ”¹ Standardized bank names.
ðŸ”¹ Selected required final columns.
bank
CBE       400
BOA       400
DASHEN    400
Name: count, dtype: int64


Unnamed: 0,review,rating,date,bank,source
0,It is nice app,5,2025-11-26,CBE,Google Play Store
1,best,5,2025-11-25,CBE,Google Play Store
2,good app,5,2025-11-25,CBE,Google Play Store
3,it suddenly asked me to enter the verification...,1,2025-11-25,CBE,Google Play Store
4,nice app...,5,2025-11-25,CBE,Google Play Store


In [45]:
# Summarize column dtypes and non-null counts of the cleaned frame
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   review  1200 non-null   object        
 1   rating  1200 non-null   int64         
 2   date    1200 non-null   datetime64[ns]
 3   bank    1200 non-null   object        
 4   source  1200 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 47.0+ KB


In [46]:
# Persist the cleaned dataset. The path is held in one variable so that the
# confirmation message always reports the file that was actually written
# (the previous message named ../data/cleaned_reviews.csv, not the real
# ../data/processed/ location). The folder is created first so the write
# does not fail when it does not exist yet.
cleaned_csv_path = "../data/processed/cleaned_reviews.csv"
os.makedirs(os.path.dirname(cleaned_csv_path), exist_ok=True)
df_clean.to_csv(cleaned_csv_path, index=False)
print(f"ðŸ’¾ Saved cleaned dataset to {cleaned_csv_path}")

ðŸ’¾ Saved cleaned dataset to ../data/cleaned_reviews.csv


Validation Checks

In [47]:
# Report how many reviews survived cleaning
print("Total reviews:", df_clean.shape[0])

Total reviews: 1200


In [48]:
# Show the number of cleaned reviews per bank
print(df_clean.loc[:, "bank"].value_counts())

bank
CBE       400
BOA       400
DASHEN    400
Name: count, dtype: int64


In [49]:
# Check missing values (< 5% expected).
# The expectation is enforced with an assert so that a violation stops the
# run instead of relying on someone reading the table.
missing_pct = df_clean.isnull().mean() * 100
assert (missing_pct < 5).all(), (
    f"Columns with 5% or more missing values:\n{missing_pct[missing_pct >= 5]}"
)
missing_pct

review    0.0
rating    0.0
date      0.0
bank      0.0
source    0.0
dtype: float64

In [50]:
# Confirm final columns.
# The schema is the one documented by select_required_columns; the assert
# makes a mismatch fail the run rather than only displaying the columns.
expected_columns = ["review", "rating", "date", "bank", "source"]
assert list(df_clean.columns) == expected_columns, (
    f"Unexpected columns: {list(df_clean.columns)}"
)
df_clean.columns

Index(['review', 'rating', 'date', 'bank', 'source'], dtype='object')

Validation Checks

In [51]:
# Report how many reviews survived cleaning
# NOTE(review): repeats the identical total-count check from the first
# "Validation Checks" section of this notebook; consider keeping only one.
print("Total reviews:", df_clean.shape[0])


Total reviews: 1200


In [52]:
# Show the number of cleaned reviews per bank
# NOTE(review): repeats the identical per-bank check from the first
# "Validation Checks" section of this notebook; consider keeping only one.
print(df_clean.loc[:, "bank"].value_counts())


bank
CBE       400
BOA       400
DASHEN    400
Name: count, dtype: int64


In [53]:
# Percentage of missing values per column (< 5% expected)
# NOTE(review): repeats the identical missing-value check from the first
# "Validation Checks" section of this notebook; consider keeping only one.
df_clean.isna().mean().mul(100)

review    0.0
rating    0.0
date      0.0
bank      0.0
source    0.0
dtype: float64

In [54]:
# Confirm final columns
# NOTE(review): repeats the identical column check from the first
# "Validation Checks" section of this notebook; consider keeping only one.
df_clean.columns

Index(['review', 'rating', 'date', 'bank', 'source'], dtype='object')