In [1]:
#import necessary modules and libraries
import unittest
import os
import pandas as pd
import os
import sys
import shutil

#import custom module
#the parent directory is appended to sys.path so that the `scripts` package
#below can be imported; note that '..' is resolved against the current
#working directory, not against this file's location
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.scraper_preprocessor import Scraper

class TestScraper(unittest.TestCase):
    """Tests for ``Scraper``: review scraping and raw-CSV preprocessing.

    Every test works inside a scratch ``test_data`` folder that is created in
    ``setUp`` and removed, together with everything in it, in ``tearDown``.
    """

    def setUp(self):
        """Build a ``Scraper`` for a placeholder app and create the scratch folder."""
        self.app_id = "com.example.app"
        self.bank_name = "ExampleBank"
        self.output_folder = "test_data"
        self.scraper = Scraper(self.app_id, self.bank_name, self.output_folder)

        #create a dummy output folder for testing
        #exist_ok makes this a no-op when the folder is already present and
        #avoids the separate exists() check
        os.makedirs(self.output_folder, exist_ok=True)

    def tearDown(self):
        """Remove the scratch folder and everything inside it.

        Raises:
            OSError: if any entry cannot be deleted. The error carries the
                real cause instead of being printed and then masked by a
                later "directory not empty" failure.
        """
        if os.path.exists(self.output_folder):
            #rmtree deletes files, sub-folders and symlinks (without following
            #them) in one call
            shutil.rmtree(self.output_folder)

    def test_scrape_play_store_reviews(self):
        """The scraper returns a path and that path exists on disk.

        NOTE(review): this appears to call the real scraper with a placeholder
        app id, so it presumably depends on network access -- confirm, and
        consider stubbing the remote call.
        """
        file_path = self.scraper.scrape_play_store_reviews()
        self.assertIsNotNone(file_path)
        self.assertTrue(os.path.exists(file_path))

    def test_preprocessor(self):
        """The preprocessor drops duplicate rows and rows without review text.

        The fixture has four rows: the third is an exact duplicate of the
        first and the fourth has no ``review_text``, so two rows should remain.
        """
        #create a dummy raw data file for testing the preprocessor
        raw_file_path = os.path.join(self.output_folder, f'{self.bank_name}_reviews_raw.csv')
        dummy_data = {
            'review_text': ['Review 1', 'Review 2', 'Review 1', None],
            'rating': [5, 4, 5, 3],
            'date': ['2023-01-01', '2023-01-02', '2023-01-01', '2023-01-03'],
            'bank_name': [self.bank_name] * 4,
            'source': ['Google Play'] * 4,
        }
        pd.DataFrame(dummy_data).to_csv(raw_file_path, index=False)

        #test the preprocessing function
        processed_df = self.scraper.preprocessor(raw_file_path)
        self.assertIsNotNone(processed_df)
        #guard against a vacuous pass: the checks below also hold for an
        #empty frame, so pin the expected number of surviving rows
        self.assertEqual(len(processed_df), 2)
        self.assertFalse(processed_df.duplicated().any())
        self.assertFalse(processed_df['review_text'].isnull().any())
        self.assertIn('date', processed_df.columns)

#run the test
#argv is replaced with a placeholder so unittest does not parse the host
#process's own command-line arguments; exit=False stops unittest from
#calling sys.exit() when the run finishes
if __name__ == "__main__":
    unittest.main(
        argv=["first-arg-is-ignored"],
        exit=False,
    )

.


Removed 1 duplicate rows.
Missing value found.

Missing value found in "review_text" column. Dropping rows.

Missing values handled. Remaining rows: 2

DataFrame head:
     bank_name        date  rating review_text       source
0  ExampleBank  2023-01-01       5    Review 1  Google Play
1  ExampleBank  2023-01-02       4    Review 2  Google Play

DataFrame Summary:
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   bank_name    2 non-null      object
 1   date         2 non-null      object
 2   rating       2 non-null      int64 
 3   review_text  2 non-null      object
 4   source       2 non-null      object
dtypes: int64(1), object(4)
memory usage: 96.0+ bytes
None

Descriptive statistics for "rating" Column:
count    2.000000
mean     4.500000
std      0.707107
min      4.000000
25%      4.250000
50%      4.500000
75%      4.750000
max      5.000000
N

.
----------------------------------------------------------------------
Ran 2 tests in 0.382s

OK


✅ Saved 0 raw reviews to test_data\ExampleBank_reviews_raw.csv
