In [1]:
import unittest
import pandas as pd
import os
import sys
import shutil

# Setup import path
sys.path.append(os.path.abspath(os.path.join('..')))

from scripts._01_3_preprocess_and_clean import preprocess_and_clean

# Import the preprocessing class
class TestPreprocessAndClean(unittest.TestCase):

    def setUp(self):
        self.test_dir = 'test_data'
        os.makedirs(self.test_dir, exist_ok=True)

        # Valid test CSV (includes Amharic, English, emojis, whitespace)
        self.valid_csv_path = os.path.join(self.test_dir, 'valid_data.csv')
        valid_data = {'ID': [1, 2, 3, 4, 5, 6, 7],
                        'Message': ['Hello 😊',
                                    'Test message with ፡',
                                    'This is a test.',
                                    'ሀይ፣ እንዴት ነህ?',
                                    '',
                                    '   ',
                                    None]}
        pd.DataFrame(valid_data).to_csv(self.valid_csv_path, index=False)

        # Missing columns
        self.missing_cols_csv_path = os.path.join(self.test_dir, 'missing_cols_data.csv')
        pd.DataFrame({'AnotherCol': [1, 2, 3], 'Message': ['Msg 1', 'Msg 2', 'Msg 3']}) \
            .to_csv(self.missing_cols_csv_path, index=False)

        # Empty messages and messages that should be kept
        self.empty_messages_csv_path = os.path.join(self.test_dir, 'empty_messages_data.csv')
        pd.DataFrame({'ID': [1, 2, 3, 4], 'Message': ['ሀይ እንዴት ነህ?', ' ', '', 'ይሄ ሌላ መልዕክት ነው']}) \
            .to_csv(self.empty_messages_csv_path, index=False)

        # Whitespace-only column
        self.whitespace_cols_csv_path = os.path.join(self.test_dir, 'whitespace_cols_data.csv')
        pd.DataFrame({'ID': [1, 2], 'Message': ['Msg 1', 'Msg 2'], 'WhitespaceCol': [' ', '  ']}) \
            .to_csv(self.whitespace_cols_csv_path, index=False)


    def tearDown(self):
        # Remove test data files and directories
        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir) # Use shutil.rmtree to remove directory and contents

        # Remove default output directory
        output_dir = os.path.join(os.getcwd(), "data")
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir) # Use shutil.rmtree

        # Remove the specific output directory created in test_output_folder_creation
        test_output_folder = os.path.join(self.test_dir, 'output_test')
        if os.path.exists(test_output_folder):
             shutil.rmtree(test_output_folder) # Use shutil.rmtree

    def test_load_df_valid(self):
        preprocessor = preprocess_and_clean(self.valid_csv_path, output_folder=None)
        self.assertIsNotNone(preprocessor.df_raw)
        self.assertEqual(preprocessor.df_raw.shape, (7, 2))

    def test_load_df_file_not_found(self):
        preprocessor = preprocess_and_clean('non_existent_file.csv', output_folder=None)
        self.assertIsNone(preprocessor.df_raw)

    def test_load_df_no_path(self):
        preprocessor = preprocess_and_clean(None, output_folder=None)
        self.assertIsNone(preprocessor.df_raw)
    
    def test_normalise_amharic(self):
        preprocessor = preprocess_and_clean(None, output_folder=None)
        self.assertEqual(preprocessor.normalise_amharic("Hello 😊"), "")
        self.assertEqual(preprocessor.normalise_amharic("Test message with ፡"), "")
        self.assertEqual(preprocessor.normalise_amharic("This is a test."), "")
        self.assertEqual(preprocessor.normalise_amharic("ሀይ፣ እንዴት ነህ?"), "ሀይ እንዴት ነህ")
        self.assertEqual(preprocessor.normalise_amharic(""), "")
        self.assertEqual(preprocessor.normalise_amharic("   "), "")
        self.assertEqual(preprocessor.normalise_amharic(None), "")

    def test_clean_df_valid(self):
        preprocessor = preprocess_and_clean(self.valid_csv_path, output_folder=None)
        cleaned_df = preprocessor.clean_df()
        self.assertIsNotNone(cleaned_df)
        self.assertGreaterEqual(cleaned_df.shape[0], 1)
        self.assertIn('ID', cleaned_df.columns)
        self.assertIn('Message', cleaned_df.columns)
        self.assertTrue(cleaned_df['Message'].str.contains('ሀይ').any())

    def test_clean_df_empty_messages(self):
        preprocessor = preprocess_and_clean(self.empty_messages_csv_path, output_folder=None)
        cleaned_df = preprocessor.clean_df()
        self.assertIsNotNone(cleaned_df)
        self.assertEqual(cleaned_df.shape[0], 2)

    def test_clean_df_whitespace_columns(self):
        preprocessor = preprocess_and_clean(self.whitespace_cols_csv_path, output_folder=None)
        cleaned_df = preprocessor.clean_df()
        self.assertIsNotNone(cleaned_df)
        self.assertNotIn('WhitespaceCol', cleaned_df.columns)

    def test_clean_df_no_raw_df(self):
        preprocessor = preprocess_and_clean(None, output_folder=None)
        cleaned_df = preprocessor.clean_df()
        self.assertIsNone(cleaned_df)

    def test_output_folder_creation(self):
        test_output_folder = os.path.join(self.test_dir, 'output_test')
        preprocessor = preprocess_and_clean(self.valid_csv_path, output_folder=test_output_folder)
        preprocessor.clean_df()
        self.assertTrue(os.path.exists(test_output_folder))

        for f in os.listdir(test_output_folder):
            os.remove(os.path.join(test_output_folder, f))
        os.rmdir(test_output_folder)

if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

DataFrame loaded successfully from test_data\empty_messages_data.csv

DataFrame head:


Unnamed: 0,ID,Message
0,1,ሀይ እንዴት ነህ?
1,2,
2,3,
3,4,ይሄ ሌላ መልዕክት ነው



DataFrame shape:


(4, 2)


Phone number separated by space are handled.

Prefixes are handled.

Emojis and non Geʽez characters in 'Message' column are handeled.

Empty rows where 'Message' column is empty and or has just numeric values are dropped.

DataFrame head:


Unnamed: 0,ID,Message
0,1,ሀይ እንዴት ነህ
1,4,ይሄ ሌላ መልዕክት ነው



DataFrame shape:


(2, 2)

..


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       2 non-null      int64 
 1   Message  2 non-null      object
dtypes: int64(1), object(1)
memory usage: 164.0+ bytes

Prrocessed DataFrame Saved to: data\processed_telegram_data.csv
DataFrame loaded successfully from test_data\valid_data.csv

DataFrame head:


Unnamed: 0,ID,Message
0,1,Hello 😊
1,2,Test message with ፡
2,3,This is a test.
3,4,ሀይ፣ እንዴት ነህ?
4,5,



DataFrame shape:


(7, 2)


Phone number separated by space are handled.

Prefixes are handled.

Emojis and non Geʽez characters in 'Message' column are handeled.

Empty rows where 'Message' column is empty and or has just numeric values are dropped.

DataFrame head:


Unnamed: 0,ID,Message
0,4,ሀይ እንዴት ነህ



DataFrame shape:


(1, 2)

.


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       1 non-null      int64 
 1   Message  1 non-null      object
dtypes: int64(1), object(1)
memory usage: 148.0+ bytes

Prrocessed DataFrame Saved to: data\processed_telegram_data.csv
DataFrame loaded successfully from test_data\whitespace_cols_data.csv

DataFrame head:


Unnamed: 0,ID,Message,WhitespaceCol
0,1,Msg 1,
1,2,Msg 2,



DataFrame shape:


(2, 3)


Phone number separated by space are handled.

Prefixes are handled.

Emojis and non Geʽez characters in 'Message' column are handeled.

Empty rows where 'Message' column is empty and or has just numeric values are dropped.

Dropped columns containing only whitespace: ['ID', 'Message', 'WhitespaceCol']

DataFrame head:



DataFrame shape:


(0, 0)

..


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame

Prrocessed DataFrame Saved to: data\processed_telegram_data.csv
Error: File not found at non_existent_file.csv


.

DataFrame loaded successfully from test_data\valid_data.csv

DataFrame head:


Unnamed: 0,ID,Message
0,1,Hello 😊
1,2,Test message with ፡
2,3,This is a test.
3,4,ሀይ፣ እንዴት ነህ?
4,5,



DataFrame shape:


(7, 2)

..

DataFrame loaded successfully from test_data\valid_data.csv

DataFrame head:


Unnamed: 0,ID,Message
0,1,Hello 😊
1,2,Test message with ፡
2,3,This is a test.
3,4,ሀይ፣ እንዴት ነህ?
4,5,



DataFrame shape:


(7, 2)


Phone number separated by space are handled.

Prefixes are handled.

Emojis and non Geʽez characters in 'Message' column are handeled.

Empty rows where 'Message' column is empty and or has just numeric values are dropped.

DataFrame head:


Unnamed: 0,ID,Message
0,4,ሀይ እንዴት ነህ



DataFrame shape:


(1, 2)


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       1 non-null      int64 
 1   Message  1 non-null      object
dtypes: int64(1), object(1)
memory usage: 148.0+ bytes


.
----------------------------------------------------------------------
Ran 9 tests in 0.343s

OK



Prrocessed DataFrame Saved to: test_data\output_test\processed_telegram_data.csv
