In [1]:
import unittest
import pandas as pd
import os
import sys

# Set up the import path: add the parent directory so the `scripts` package can be imported
sys.path.append(os.path.abspath(os.path.join('..')))

from scripts._02_label_data import label_data

class TestLabelData(unittest.TestCase):
    """Unit tests for the ``label_data`` heuristic labeler.

    Each test gets a fresh labeler built from a small five-row CSV of Amharic
    messages that is written to the current working directory in ``setUp``
    and removed again afterwards.
    """

    @staticmethod
    def _remove_file(path):
        """Delete *path* if it exists; a missing file is not an error."""
        if os.path.exists(path):
            os.remove(path)

    def setUp(self):
        """Write the dummy CSV and construct the labeler under test."""
        # Create a dummy DataFrame for testing
        data = {'Message': ['መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123',
                            '0911223344 ላይ ይደውሉ',
                            'ዋጋው 100 ብር ብቻ',
                            'ቦሌ አካባቢ የሚገኝ መኪና',
                            'አዲስ አበባ መርካቶ']}
        self.dummy_df_path = 'dummy_data.csv'
        pd.DataFrame(data).to_csv(self.dummy_df_path, index=False)
        # unittest does not call tearDown() when setUp() raises, so register
        # the CSV removal as a cleanup *before* building the labeler; that way
        # a failing label_data() constructor cannot leak the dummy file.
        self.addCleanup(self._remove_file, self.dummy_df_path)
        self.labeler = label_data(self.dummy_df_path)

    def tearDown(self):
        """Remove the CoNLL output file and, if left empty, its folder."""
        # NOTE(review): this deletes the file inside the labeler's configured
        # output folder. If that is the project's real data directory, consider
        # pointing the labeler at a temporary folder instead - confirm.
        output_folder = self.labeler.output_folder
        self._remove_file(os.path.join(output_folder, "ner_amharic_conll.txt"))
        # os.rmdir() raises OSError on a non-empty directory, so only remove
        # the folder when nothing else lives in it.
        if os.path.isdir(output_folder) and not os.listdir(output_folder):
            os.rmdir(output_folder)

    def test_load_df(self):
        """The labeler exposes the loaded CSV as ``df`` with all five rows."""
        self.assertIsNotNone(self.labeler.df)
        self.assertEqual(self.labeler.df.shape[0], 5)

    def test_strip_prefix(self):
        """``strip_prefix`` drops a leading prefix and leaves bare words alone."""
        cases = [
            ("ከመገናኛ", "መገናኛ"),
            ("በሱቅ", "ሱቅ"),
            ("የአድራሻ", "አድራሻ"),
            ("ለዋጋ", "ዋጋ"),
            ("ስለቦሌ", "ቦሌ"),
            ("መኪና", "መኪና"),
        ]
        for token, expected in cases:
            # subTest reports every failing token instead of stopping at the first.
            with self.subTest(token=token):
                self.assertEqual(self.labeler.strip_prefix(token), expected)

    def test_label_token_phone(self):
        """Phone-number-shaped tokens are tagged ``B-PHONE``; other text is ``O``."""
        self.assertEqual(self.labeler.label_token("0911223344"), "B-PHONE")
        self.assertEqual(self.labeler.label_token("251911223344"), "B-PHONE")
        self.assertEqual(self.labeler.label_token("InvalidPhone"), "O")

    def test_label_token_price(self):
        """Price tagging depends on the token and on the optional second argument."""
        self.assertEqual(self.labeler.label_token("ብር"), "B-PRICE")
        self.assertEqual(self.labeler.label_token("100", "ብር"), "I-PRICE")
        self.assertEqual(self.labeler.label_token("ብቻ", "ብር"), "I-PRICE")
        self.assertEqual(self.labeler.label_token("ነፃ"), "B-PRICE")
        self.assertEqual(self.labeler.label_token("100"), "B-PRICE")
        self.assertEqual(self.labeler.label_token("100", "ነፃ"), "O")  # Testing the forbidden case
        self.assertEqual(self.labeler.label_token("ዋጋው"), "B-PRICE")

    def test_label_token_product(self):
        """Product tokens are tagged ``B-PRODUCT`` or ``I-PRODUCT``."""
        self.assertEqual(self.labeler.label_token("መኪና"), "B-PRODUCT")
        self.assertEqual(self.labeler.label_token("ጫማዎች"), "B-PRODUCT")
        self.assertEqual(self.labeler.label_token("እቃ"), "I-PRODUCT")
        self.assertEqual(self.labeler.label_token("የመኪና"), "B-PRODUCT")

    def test_match_multiword_entity(self):
        """``match_multiword_entity`` returns one LOC/O label per input token."""
        # Test case 1: Known multi-token location
        tokens1 = "አዲስ አበባ መርካቶ".split()
        labels1 = self.labeler.match_multiword_entity(tokens1)
        self.assertEqual(labels1, ["B-LOC", "I-LOC", "I-LOC"])

        # Test case 2: Suffix locator and numeric marker
        tokens2 = "መገናኛ አካባቢ ሱቅ ቁጥር 123".split()
        labels2 = self.labeler.match_multiword_entity(tokens2)
        self.assertEqual(labels2, ["B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC"])

        # Test case 3: Single token known location
        tokens3 = "ቦሌ".split()
        labels3 = self.labeler.match_multiword_entity(tokens3)
        self.assertEqual(labels3, ["B-LOC"])

        # Test case 4: Address keyword
        tokens4 = "አድራሻ".split()
        labels4 = self.labeler.match_multiword_entity(tokens4)
        self.assertEqual(labels4, ["B-LOC"])

        # Test case 5: Directional phrase
        tokens5 = "ወደ ግራ".split()
        labels5 = self.labeler.match_multiword_entity(tokens5)
        self.assertEqual(labels5, ["B-LOC", "I-LOC"])

        # Test case 6: No location
        tokens6 = "ይህ መልዕክት ነው".split()
        labels6 = self.labeler.match_multiword_entity(tokens6)
        self.assertEqual(labels6, ["O", "O", "O"])

    def test_generate_conll_sample(self):
        """``generate_conll_sample`` writes a CoNLL file containing every tag family."""
        self.labeler.generate_conll_sample(sample_size=5)
        out_path = os.path.join(self.labeler.output_folder, "ner_amharic_conll.txt")
        self.assertTrue(os.path.exists(out_path))
        # Read first, assert afterwards, so the handle is closed before any
        # assertion failure unwinds into tearDown's file removal.
        with open(out_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Check if some expected tags are present (heuristic)
        self.assertTrue("B-LOC" in content or "I-LOC" in content)
        self.assertIn("B-PHONE", content)
        self.assertTrue("B-PRICE" in content or "I-PRICE" in content)
        self.assertTrue("B-PRODUCT" in content or "I-PRODUCT" in content)


# Run the tests
if __name__ == '__main__':
    # Give unittest an explicit placeholder argv so it does not parse the host
    # process's own command-line arguments, and pass exit=False so the
    # interpreter is not terminated via sys.exit() when the run finishes.
    placeholder_argv = ['first-arg-is-ignored']
    unittest.main(argv=placeholder_argv, exit=False)

Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.


Heuristically labeled CoNLL saved to: data\ner_amharic_conll.txt
Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.

Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.

Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.

Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.

Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.

Loaded DataFrame from dummy_data.csv


Unnamed: 0,Message
0,መገናኛ አካባቢ የሚገኝ ሱቅ ቁጥር 123
1,0911223344 ላይ ይደውሉ
2,ዋጋው 100 ብር ብቻ
3,ቦሌ አካባቢ የሚገኝ መኪና
4,አዲስ አበባ መርካቶ


(5, 1)

.
----------------------------------------------------------------------
Ran 7 tests in 0.200s

OK
