# Sentimental and Thematic Analysis

In [2]:
# importing of  Dependencies
#import dependencies
import pandas as pd
from google_play_scraper import reviews_all
import nltk
from nltk.corpus import stopwords
from typing import List, Dict, Any
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


#sentiment analysis
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


#keyword Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

#visuals
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Downloading the reviews of  banks

In [3]:
# google play store bank app scraping 
banks = [
    'com.combanketh.mobilebanking',  # Commercial Bank of Ethiopia
    'com.dashen.dashensuperapp',    # Dashen Bank
    'com.boa.boaMobileBanking'      # Bank of Abyssinia
]
all_reviews = []

*Collecting of Individuals banks (CBE,DASHEN and BOA)*

In [11]:
# Function  to scrap the data from the individual  banks
def fetch_reviews(banks: List[str], max_reviews_per_bank: int = 400) -> List[Dict[str, Any]]:
    """
    Fetches app reviews with bank source tracking
    
    Args:
        banks: List of app package names (e.g., ["com.chase", "com.bankofamerica"])
        max_reviews_per_bank: Maximum reviews to fetch per app
        
    Returns:
        List of reviews, each annotated with its source bank
    """
    all_reviews = []
    
    for bank in banks:
        try:
            reviews = reviews_all(
                app_id=bank,
                sleep_milliseconds=100,  # Small delay to avoid rate limiting
                lang='en',
                country='us'
            )
            
            # Add bank identifier to each review
            for review in reviews[:max_reviews_per_bank]:
                review['Bank'] = bank  # or 'app_id' if preferred
                all_reviews.append(review)
                
            print(f"Fetched {len(reviews[:max_reviews_per_bank])} reviews from {bank}")
            
        except Exception as e:
            print(f"Failed to fetch {bank}: {str(e)}")
            continue
    
    print(f"\nTOTAL REVIEWS: {len(all_reviews)}")
    return all_reviews
# Get all reviews for a specific bank
chase_reviews = [r for r in all_reviews if r['source_bank'] == 'com.chase']

# Count reviews per bank
from collections import Counter
bank_counts = Counter(r['source_bank'] for r in all_reviews)
print(bank_counts)


Counter()


In [12]:

if __name__ == "__main__":
    target_banks = ["com.combanketh.mobilebanking", "com.dashen.dashensuperapp","com.boa.boaMobileBanking"]
    reviews = fetch_reviews(target_banks)
    print(f"Final review count: {len(reviews)}")

Fetched 400 reviews from com.combanketh.mobilebanking
Fetched 400 reviews from com.dashen.dashensuperapp
Fetched 400 reviews from com.boa.boaMobileBanking

TOTAL REVIEWS: 1200
Final review count: 1200


*Date Normalization*

In [13]:
df=pd.DataFrame(reviews)
# Convert 'at' column to datetime
df['at'] = pd.to_datetime(df['at'], unit='ms')
df.head(10)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,Bank
0,44b56ef7-c297-438a-94ac-f59f70b40594,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Very amazing app indeed. I'm enjoying it,5,0,,2025-06-08 21:52:23,,,,com.combanketh.mobilebanking
1,ba2970d7-802b-44d5-9c91-e6bed733adad,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Best,5,0,,2025-06-08 18:25:37,,,,com.combanketh.mobilebanking
2,3e1d37a0-a082-4cb0-912a-8efff072ed3f,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,20 years,5,0,,2025-06-08 12:04:48,,,,com.combanketh.mobilebanking
3,a7d1c799-ba53-4a0a-a8d6-c5400a009825,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,A great app. It's like carrying a bank in your...,4,0,5.1.0,2025-06-07 20:21:52,,,5.1.0,com.combanketh.mobilebanking
4,64ed5562-1758-4eb8-9291-8b6edc394118,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,More than garrantty bank EBC.,4,0,,2025-06-07 18:21:26,,,,com.combanketh.mobilebanking
5,d0c05687-ddd4-43fb-95a9-08f6358d80a2,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,really am happy to this app it is Siple to use...,5,0,5.1.0,2025-06-07 11:02:38,,,5.1.0,com.combanketh.mobilebanking
6,811bf820-3529-433a-9b6d-e624fa23a16a,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,I liked this app. But the User interface is ve...,2,0,5.1.0,2025-06-07 10:50:29,,,5.1.0,com.combanketh.mobilebanking
7,be2cb2ac-bbe0-4175-81c4-9f6c86afdaaa,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"""Why don’t your ATMs support account-to-accoun...",4,0,,2025-06-06 09:54:11,,,,com.combanketh.mobilebanking
8,8efd71e9-59cd-41ce-8c5c-12052dee9ad0,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,what is this app problem???,1,0,5.1.0,2025-06-05 22:16:56,,,5.1.0,com.combanketh.mobilebanking
9,b12d0383-9b27-4e49-a94d-277a43b15800,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,the app is proactive and a good connections.,5,0,5.1.0,2025-06-05 15:55:10,,,5.1.0,com.combanketh.mobilebanking


#   Data preprocessing 
*1. Column Rename fro clarity*

In [15]:

#rename columns
df.rename(columns={
    'content': 'review_text',
    'score': 'rating',
    'at': 'review_date',
    'reviewId': 'review_id',
    'userName': 'userName'  
}, inplace=True)
df['source'] = 'Google Play store'  # Adding the column to called the source= google play store 

# Display after adjusting 
df.head()


Unnamed: 0,review_id,userName,userImage,review_text,rating,thumbsUpCount,reviewCreatedVersion,review_date,replyContent,repliedAt,appVersion,Bank,source
0,44b56ef7-c297-438a-94ac-f59f70b40594,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Very amazing app indeed. I'm enjoying it,5,0,,2025-06-08 21:52:23,,,,com.combanketh.mobilebanking,Google Play store
1,ba2970d7-802b-44d5-9c91-e6bed733adad,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Best,5,0,,2025-06-08 18:25:37,,,,com.combanketh.mobilebanking,Google Play store
2,3e1d37a0-a082-4cb0-912a-8efff072ed3f,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,20 years,5,0,,2025-06-08 12:04:48,,,,com.combanketh.mobilebanking,Google Play store
3,a7d1c799-ba53-4a0a-a8d6-c5400a009825,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,A great app. It's like carrying a bank in your...,4,0,5.1.0,2025-06-07 20:21:52,,,5.1.0,com.combanketh.mobilebanking,Google Play store
4,64ed5562-1758-4eb8-9291-8b6edc394118,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,More than garrantty bank EBC.,4,0,,2025-06-07 18:21:26,,,,com.combanketh.mobilebanking,Google Play store


*2.Data cleaning (Drop of the duplicates reviews and handle the missing)*

In [16]:

#handle duplicate reviews
df.drop_duplicates()
#print the duplicate reviews
#check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)
#drop missing values
Missing_value=df.dropna()
#print the missing values
print(f"Total missing values removed: {len(Missing_value)}")


Missing values in each column:
review_id                  0
userName                   0
userImage                  0
review_text                0
rating                     0
thumbsUpCount              0
reviewCreatedVersion     284
review_date                0
replyContent            1200
repliedAt               1200
appVersion               284
Bank                       0
source                     0
dtype: int64
Total missing values removed: 0


**3. Textual cleaning and related**

*Steps*:
* Clean text: Remove punctuation, special characters, convert to lowercase.
* Tokenize: Split text into words.
* Remove stop words: Eliminate common words (e.g., "the," "and").
* Lemmatize: Reduce words to base form (e.g., "running" → "run").

In [17]:
def clean_review_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


In [20]:

#Apply the cleaning function to the review_text column
df['cleaned_review_text'] = df['review_text'].apply(clean_review_text)
# Save the cleaned reviews to a new CSV file
df.to_csv('F:/Bank-Reviews-Analysis/Data/cleaned_data/All_bank_reviews_cleaned.csv', index=False, encoding='utf-8-sig')
print("First 10 rows of cleaned All bank reviews:")
print(df.head())

First 10 rows of cleaned All bank reviews:
                              review_id       userName  \
0  44b56ef7-c297-438a-94ac-f59f70b40594  A Google user   
1  ba2970d7-802b-44d5-9c91-e6bed733adad  A Google user   
2  3e1d37a0-a082-4cb0-912a-8efff072ed3f  A Google user   
3  a7d1c799-ba53-4a0a-a8d6-c5400a009825  A Google user   
4  64ed5562-1758-4eb8-9291-8b6edc394118  A Google user   

                                           userImage  \
0  https://play-lh.googleusercontent.com/EGemoI2N...   
1  https://play-lh.googleusercontent.com/EGemoI2N...   
2  https://play-lh.googleusercontent.com/EGemoI2N...   
3  https://play-lh.googleusercontent.com/EGemoI2N...   
4  https://play-lh.googleusercontent.com/EGemoI2N...   

                                         review_text  rating  thumbsUpCount  \
0           Very amazing app indeed. I'm enjoying it       5              0   
1                                               Best       5              0   
2                                 

# Sentiment Analysis with TextBlob

Description: TextBlob provides a straightforward way to classify sentiments based on polarity scores.

In [21]:
#function to analyze sentiment using TextBlob
def analyze_sentiment_textblob(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'
    
    
    # Apply the sentiment analysis function to the cleaned review text
df['textblob_sentiment'] = df['cleaned_review_text'].apply(analyze_sentiment_textblob)

print("Sentiment analysis using TextBlob completed for all bank reviews.")
print(df['textblob_sentiment'].value_counts())
#save the sentiment 
df.to_csv('F:/Bank-Reviews-Analysis/Data/cleaned_data/All bank_reviews_sentiment_textblob.csv', index=False, encoding='utf-8-sig')

Sentiment analysis using TextBlob completed for all bank reviews.
textblob_sentiment
positive    763
neutral     323
negative    114
Name: count, dtype: int64
