### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Dataset: Use a dataset of textual product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Examine the effectiveness by analyzing word frequency before and after
removal.

In [1]:
# write your code from here
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import string

# Download the NLTK resources this script relies on:
#   - 'punkt' and 'punkt_tab': tokenizer data used by word_tokenize. Recent
#     NLTK releases look up 'punkt_tab' rather than the older 'punkt' package,
#     so both are requested to keep the script working across versions.
#   - 'stopwords': the stopword lists read by stopwords.words('english').
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# ----------------------------------------
# Step 1: Sample Product Description Dataset
# ----------------------------------------
# Four sample products: a numeric ID paired with a free-text description.
data = dict(
    ProductID=[101, 102, 103, 104],
    Description=[
        "This is a high-quality wireless Bluetooth speaker with amazing sound!",
        "Elegant and durable leather wallet for men with multiple card slots.",
        "A fast-charging, lightweight power bank with dual USB ports.",
        "Eco-friendly water bottle made of stainless steel, perfect for travel.",
    ],
)

# Tabular view of the samples, one row per product.
df = pd.DataFrame(data=data)

# ----------------------------------------
# Step 2: Define stopword removal function
# ----------------------------------------
# Individual punctuation characters, used to recognise tokens that consist
# entirely of punctuation (e.g. "!", "...", "``", "--").
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# English stopword set; filled from the NLTK corpus on first use and reused
# afterwards so the corpus is not re-read for every description.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stopword set, reading the NLTK corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text):
    """Lowercase *text*, tokenize it, and drop stopwords and punctuation tokens.

    Args:
        text: The description to clean.

    Returns:
        The remaining tokens joined by single spaces. If *text* is not a
        string, the message "Error: Invalid input. Must be a string." is
        returned instead, so the function can be applied to a column that
        contains non-string values.

    Raises:
        Whatever ``stopwords.words`` or ``word_tokenize`` raise (for example
        when the required NLTK data is missing). These are deliberately not
        converted into an "Error: ..." string, because that string would be
        stored as if it were cleaned text and counted in later analysis.
    """
    if not isinstance(text, str):
        return "Error: Invalid input. Must be a string."

    stop_words = _english_stop_words()
    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words
        # A per-character test: `token in string.punctuation` is a substring
        # check and would let tokens such as "..." or "``" through.
        and not all(char in _PUNCTUATION_CHARS for char in token)
    ]
    return " ".join(kept_tokens)

# ----------------------------------------
# Step 3: Apply to dataset
# ----------------------------------------
# Clean every description and store the result in a new column alongside
# the original text.
df['CleanedDescription'] = df['Description'].map(remove_stopwords)

# ----------------------------------------
# Step 4: Frequency analysis
# ----------------------------------------
def get_word_freq(texts):
    """Count how often each word occurs across *texts*.

    Each text is lowercased and split on whitespace. Leading and trailing
    punctuation is stripped from every piece, so "sound!" and "sound" are
    counted as the same word; pieces that are nothing but punctuation are
    dropped. Punctuation inside a word (e.g. "high-quality") is kept.

    Args:
        texts: An iterable of strings. Entries that are not strings (for
            example missing values) are skipped.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.
    """
    counts = Counter()
    for text in texts:
        if not isinstance(text, str):
            continue
        for piece in text.lower().split():
            word = piece.strip(string.punctuation)
            if word:
                counts[word] += 1
    return counts

# Word counts for the raw descriptions and for their stopword-free versions.
original_freq, cleaned_freq = (
    get_word_freq(df[column])
    for column in ('Description', 'CleanedDescription')
)

# ----------------------------------------
# Step 5: Plotting
# ----------------------------------------
def plot_top_words(freq_dict, title, n=10):
    """Show a horizontal bar chart of the *n* most frequent words.

    Args:
        freq_dict: A ``collections.Counter`` of word counts (its
            ``most_common`` method is used).
        title: Title placed above the chart.
        n: Maximum number of words to show.

    Returns:
        None. When there is nothing to plot (empty counter or ``n`` of 0) a
        short message is printed and no chart is drawn.
    """
    common = freq_dict.most_common(n)
    if not common:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError.
        print(f"No words to plot for '{title}'.")
        return

    words, counts = zip(*common)
    sns.barplot(x=list(counts), y=list(words), palette='viridis')
    plt.title(title)
    plt.xlabel('Frequency')
    plt.show()

# Draw the top-10 chart for each word-frequency table, raw text first.
for heading, freq, chart_title in (
    ("Original Word Frequency (Top 10):", original_freq, "Before Stopword Removal"),
    ("Cleaned Word Frequency (Top 10):", cleaned_freq, "After Stopword Removal"),
):
    print(heading)
    plot_top_words(freq, chart_title)

# ----------------------------------------
# Optional: Show before vs after
# ----------------------------------------
print("\n=== Example Descriptions (Before vs After) ===")
# Walk both columns in lockstep so each description is printed next to its
# cleaned counterpart.
for before, after in zip(df['Description'], df['CleanedDescription']):
    print(f"\nOriginal: {before}")
    print(f"Cleaned : {after}")

ModuleNotFoundError: No module named 'nltk'