In [1]:
import pandas as pd
import os

In [None]:
# Load the raw CSV files
#
# Every *.csv in `input_folder` is read and stacked into a single frame, except
# the cleaned export that the final cell of this notebook writes into the same
# folder; without that exclusion a re-run would ingest its own previous output
# as if it were raw data.

input_folder = "../scraped_reviews"
cleaned_filename = "cleaned_reviews.csv"  # this notebook's own output; never an input

# Sorted so the concatenation order (and any order-sensitive step downstream)
# does not depend on the arbitrary order os.listdir returns entries in.
raw_files = sorted(
    name
    for name in os.listdir(input_folder)
    if name.endswith(".csv") and name != cleaned_filename
)
if not raw_files:
    # pd.concat([]) would raise a ValueError anyway; say what actually went wrong.
    raise ValueError(f"No raw CSV files found in {input_folder!r}")

all_data = [pd.read_csv(os.path.join(input_folder, name)) for name in raw_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
raw_df = pd.concat(all_data, ignore_index=True)
print(f"Total records before cleaning: {len(raw_df)}")

Total records before cleaning: 6494


In [4]:
# Keep only usable, unique reviews: a row must have both a review text and a
# rating, and a given text is kept once per bank (the first occurrence is the
# one retained, which is drop_duplicates' default).
required_columns = ['review_text', 'rating']
identity_columns = ['review_text', 'bank_name']

clean_df = (
    raw_df
    .dropna(subset=required_columns)
    .drop_duplicates(subset=identity_columns)
)

print(f"Total records after cleaning: {len(clean_df)}")

Total records after cleaning: 4945


In [5]:
# Normalise `date` to YYYY-MM-DD text.
# Three steps, expressed as one chain that yields a new frame:
#   1. parse to datetimes; errors='coerce' turns unparseable values into NaT
#   2. drop the rows whose date could not be parsed
#   3. format the remaining datetimes back to strings
clean_df = (
    clean_df
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .dropna(subset=['date'])
    .assign(date=lambda frame: frame['date'].dt.strftime('%Y-%m-%d'))
)

In [6]:
# Restrict the frame to the output columns, in this order.
output_columns = ['review_text', 'rating', 'date', 'bank_name', 'source']
clean_df = clean_df[output_columns]

In [9]:
# Spot-check the cleaned frame: a few example rows, the column index, and the row count.
SAMPLE_SEED = 42  # fixed so the same rows are shown on every run
sample_size = min(10, len(clean_df))  # sample() raises if asked for more rows than exist

print(clean_df.sample(n=sample_size, random_state=SAMPLE_SEED))
print(clean_df.columns)
print(len(clean_df))

                                           review_text  rating        date  \
3462                  Fast, user-friendly, easy-to-use       5  2024-06-14   
4954                      Best mobile banking app ever       4  2023-11-19   
5826                                   baaye gaari dha       4  2023-03-20   
4949  Best to be Best as your Age of Surveillance Old.       3  2023-11-20   
2435                 I used for long time its best app       5  2024-12-24   
4318                       It is very used application       5  2024-02-13   
6423                                         V good ❗️       5  2025-01-24   
3206               It's not appropriate for Play store       1  2024-08-01   
1755                                      The best app       5  2025-02-22   
858                                 It's good Rate app       5  2024-03-12   

                        bank_name       source  
3462  Commercial Bank of Ethiopia  Google Play  
4954  Commercial Bank of Ethiopia  Google P

In [11]:
# Save the clean CSV
# index=False keeps the DataFrame's row labels out of the written file.
output_path = "../scraped_reviews/cleaned_reviews.csv"
clean_df.to_csv(output_path, index=False)
print("✅ Cleaned dataset saved as 'cleaned_reviews.csv'")

✅ Cleaned dataset saved as 'cleaned_reviews.csv'
