### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Spelling Corrections

**Steps**:
1. Data Set: Import a dataset containing text reviews with spelling errors.
2. Apply Corrections: Use a spell-checker from an NLP library to correct spelling mistakes.
3. Verify Improvements: Review the corrections to ensure data quality improvement.

In [2]:
# Install and import required libraries
!pip install -q textblob
!python -m textblob.download_corpora

import pandas as pd
from textblob import TextBlob
import re
import unittest

# Sample dataset with intentional spelling errors
data = {
    "review": [
        "Ths prduct is awsome!",
        "I absolutly lov it",
        "It dosn't wrk at all",
        "Gret valu for mony",
        "Terribl custmor service",
        "",
        None,
        12345
    ]
}
df = pd.DataFrame(data)

# Function to correct spelling with error handling
def correct_spelling(text):
    try:
        if not isinstance(text, str):
            raise ValueError("Input must be a string.")
        if text.strip() == "":
            return ""
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        corrected = str(TextBlob(cleaned).correct())
        return corrected
    except Exception as e:
        return f"Error: {str(e)}"

# Apply correction to the DataFrame
df['corrected_review'] = df['review'].apply(correct_spelling)

# Display original and corrected reviews
print("Original vs Corrected Reviews:")
print(df[['review', 'corrected_review']])

# Unit tests to ensure reliability
class TestSpellingCorrection(unittest.TestCase):
    def test_typo(self):
        self.assertEqual(correct_spelling("Gret job"), "Great job")
    def test_empty_string(self):
        self.assertEqual(correct_spelling(""), "")
    def test_none_input(self):
        self.assertTrue("Error:" in correct_spelling(None))
    def test_numeric_input(self):
        self.assertTrue("Error:" in correct_spelling(12345))
    def test_clean_text(self):
        self.assertEqual(correct_spelling("This is great"), "This is great")
    def test_punctuation_handling(self):
        result = correct_spelling("I lovvvveee it!!!")
        self.assertIn("love", result)

# Run the tests
print("\nRunning Unit Tests...")
unittest.TextTestRunner(verbosity=2).run(unittest.TestLoader().loadTestsFromTestCase(TestSpellingCorrection))


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[nltk_data] Downloading package brown to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt_tab to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package conll2000 to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


test_clean_text (__main__.TestSpellingCorrection) ... FAIL
test_empty_string (__main__.TestSpellingCorrection) ... ok
test_none_input (__main__.TestSpellingCorrection) ... ok
test_numeric_input (__main__.TestSpellingCorrection) ... ok
test_punctuation_handling (__main__.TestSpellingCorrection) ... 

Original vs Corrected Reviews:
                    review                corrected_review
0    Ths prduct is awsome!             The product is some
1       I absolutly lov it            I absolutely love it
2     It dosn't wrk at all             It dont work at all
3       Gret valu for mony             Fret value for many
4  Terribl custmor service         Terrible custom service
5                                                         
6                     None  Error: Input must be a string.
7                    12345  Error: Input must be a string.

Running Unit Tests...


FAIL
test_typo (__main__.TestSpellingCorrection) ... FAIL

FAIL: test_clean_text (__main__.TestSpellingCorrection)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_30719/952505152.py", line 56, in test_clean_text
    self.assertEqual(correct_spelling("This is great"), "This is great")
AssertionError: 'His is great' != 'This is great'
- His is great
? ^
+ This is great
? ^^


FAIL: test_punctuation_handling (__main__.TestSpellingCorrection)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_30719/952505152.py", line 59, in test_punctuation_handling
    self.assertIn("love", result)
AssertionError: 'love' not found in 'I lovvvveee it'

FAIL: test_typo (__main__.TestSpellingCorrection)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_30719/952505152.py", 

<unittest.runner.TextTestResult run=6 errors=0 failures=3>