### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Data Set: Use a dataset of text product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Examine the effectiveness by analyzing word frequency before and after
removal.

In [2]:
import unittest

# ----------------------------------------
# Enhanced stopword removal with more error handling
# ----------------------------------------
def remove_stopwords(text):
    try:
        if not isinstance(text, str):
            raise ValueError("Input must be a string.")
        
        if text.strip() == "":
            return ""
        
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
        
        return " ".join(filtered_tokens)
    
    except Exception as e:
        return f"Error: {str(e)}"

# ----------------------------------------
# Unit Tests for the remove_stopwords function
# ----------------------------------------
class TestStopwordRemoval(unittest.TestCase):
    
    def test_typical_sentence(self):
        self.assertEqual(remove_stopwords("This is a test sentence."),
                         "test sentence")
    
    def test_empty_string(self):
        self.assertEqual(remove_stopwords(""), "")
    
    def test_only_stopwords(self):
        self.assertEqual(remove_stopwords("the and is in on at"), "")
    
    def test_numbers_and_symbols(self):
        self.assertEqual(remove_stopwords("123 !@# hello!"), "123 hello")
    
    def test_non_string_input(self):
        self.assertTrue("Error:" in remove_stopwords(12345))
        self.assertTrue("Error:" in remove_stopwords(['This', 'is', 'a', 'list']))
    
    def test_mixed_input(self):
        self.assertEqual(remove_stopwords("Wow!!! This, right here, is just amazing."),
                         "wow right amazing")

# ----------------------------------------
# Run unit tests
# ----------------------------------------
unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestStopwordRemoval))

.F.FFF
FAIL: test_mixed_input (__main__.TestStopwordRemoval)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_29121/2368000430.py", line 46, in test_mixed_input
    self.assertEqual(remove_stopwords("Wow!!! This, right here, is just amazing."),
AssertionError: "Error: name 'stopwords' is not defined" != 'wow right amazing'
- Error: name 'stopwords' is not defined
+ wow right amazing


FAIL: test_numbers_and_symbols (__main__.TestStopwordRemoval)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_29121/2368000430.py", line 39, in test_numbers_and_symbols
    self.assertEqual(remove_stopwords("123 !@# hello!"), "123 hello")
AssertionError: "Error: name 'stopwords' is not defined" != '123 hello'
- Error: name 'stopwords' is not defined
+ 123 hello


FAIL: test_only_stopwords (__main__.TestStopwordRemoval)
----------------------------

<unittest.runner.TextTestResult run=6 errors=0 failures=4>