import

In [1]:
import pandas as pd
import re
import nltk
import textstat
import time
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
class TextCleaningPipeline:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
     
        self.stats = {
            'emoticons_removed': 0,
            'stop_words_removed': 0,
            'special_chars_removed': 0,
            'tokens_count_before': 0,
            'tokens_count_after': 0,
            'lowercase_converted': 0,
            'addresses_found': 0,  
            'phones_found': 0,
            'accounts_found': 0,
            'vocab_before': set(),
            'vocab_after': set()
        }

    def clean_and_collect_stats(self, text):
        if pd.isna(text) or text == '':
            return ""

        sentences_raw = sent_tokenize(text)
        words_raw = word_tokenize(text)
        self.stats['tokens_count_before'] += len(words_raw)
        self.stats['vocab_before'].update(words_raw)

        self.stats['phones_found'] += len(re.findall(r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}', text))
        self.stats['addresses_found'] += len(re.findall(r'\d+\s[A-z]+\s[St|Ave|Dr|Rd]', text)) 
        self.stats['accounts_found'] += len(re.findall(r'\b\d{8,12}\b', text)) 

        emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
        self.stats['emoticons_removed'] += len(emoticons)
        
        text_no_html = re.sub(r'<.*?>', '', text)
        
        special_chars = re.findall(r'[^a-zA-Z0-9\s]', text_no_html)
        self.stats['special_chars_removed'] += len(special_chars)

        text_cleaned = re.sub(r'[^a-zA-Z\s]', '', text_no_html) 
        
        self.stats['lowercase_converted'] += len(re.findall(r'[A-Z]', text_cleaned))
        text_lower = text_cleaned.lower()

        tokens = word_tokenize(text_lower)
        
        final_tokens = []
        for w in tokens:
            if w in self.stop_words:
                self.stats['stop_words_removed'] += 1
            else:
                # --- 6. Step 4: Lemmatization ---
                lemma = self.lemmatizer.lemmatize(w)
                final_tokens.append(lemma)
        
        self.stats['tokens_count_after'] += len(final_tokens)
        self.stats['vocab_after'].update(final_tokens)

        return " ".join(final_tokens)

    def get_lexical_diversity(self, tokens_list):
        if len(tokens_list) == 0: return 0
        return len(set(tokens_list)) / len(tokens_list)

In [6]:
start_time = time.time()

try:
    df = pd.read_csv('Sentiment Analysis Dataset.csv', encoding='ISO-8859-1')
except FileNotFoundError:
    df = pd.read_csv('resouce/Sentiment Analysis Dataset.csv', encoding='ISO-8859-1')

if 'SentimentText' in df.columns:
    df.rename(columns={'SentimentText': 'review'}, inplace=True)

pipeline = TextCleaningPipeline()

print("Calculating raw stats...")
df['sent_count_raw'] = df['review'].apply(lambda x: len(sent_tokenize(str(x))))
df['word_count_raw'] = df['review'].apply(lambda x: len(word_tokenize(str(x))))
avg_sent_len_before = (df['word_count_raw'] / df['sent_count_raw']).mean()

print("Processing Cleaning Pipeline... (This might take a while)")
df['cleaned_review'] = df['review'].apply(pipeline.clean_and_collect_stats)

print("Calculating final metrics...")
df['readability_score'] = df['cleaned_review'].apply(textstat.flesch_kincaid_grade)
df['lexical_diversity'] = df['cleaned_review'].apply(lambda x: pipeline.get_lexical_diversity(x.split()))
df['word_count_after'] = df['cleaned_review'].apply(lambda x: len(x.split()))

all_cleaned_words = " ".join(df['cleaned_review'].tolist()).split()
max_word_len = max([len(w) for w in all_cleaned_words]) if all_cleaned_words else 0

end_time = time.time()
total_runtime = end_time - start_time

Calculating raw stats...
Processing Cleaning Pipeline... (This might take a while)
Calculating final metrics...


In [7]:

stats = pipeline.stats

print("\n" + "#" * 30)
print("### Text Cleaning Statistics ###")
print("#" * 30)

print(f"- Number of documents: {len(df)}")
print(f"- Average tokens per document: {stats['tokens_count_before']/len(df):.0f} -> {stats['tokens_count_after']/len(df):.0f}")
print(f"- Total vocabulary size: {len(stats['vocab_before'])} -> {len(stats['vocab_after'])}")
print(f"- Average Sentence Length (Raw): {avg_sent_len_before:.2f} words/sentence")
print(f"- Max Word Length (Cleaned): {max_word_len}")
print("-" * 20)
print(f"- Number of stop words removed: {stats['stop_words_removed']}")
print(f"- Special characters removed: {stats['special_chars_removed']}")
print(f"- Emoticons removed: {stats['emoticons_removed']}")
print(f"- Lowercase converted: {stats['lowercase_converted']}")
print("-" * 20)
print(f"- Addresses found (Approx): {stats['addresses_found']}")
print(f"- Phone numbers found: {stats['phones_found']}")
print(f"- Account numbers found: {stats['accounts_found']}")
print("-" * 20)
print(f"- Average Readability Score (Flesch-Kincaid): {df['readability_score'].mean():.2f}")
print(f"- Average Lexical Diversity: {df['lexical_diversity'].mean():.2f}")
print("-" * 20)
print(f"- Total runtime: {total_runtime:.2f} seconds ({total_runtime/60:.2f} minutes)")
print("#" * 30)


##############################
### Text Cleaning Statistics ###
##############################
- Number of documents: 1048575
- Average tokens per document: 17 -> 8
- Total vocabulary size: 696811 -> 618111
- Average Sentence Length (Raw): 10.90 words/sentence
- Max Word Length (Cleaned): 125
--------------------
- Number of stop words removed: 5550679
- Special characters removed: 4480301
- Emoticons removed: 10561
- Lowercase converted: 3455246
--------------------
- Addresses found (Approx): 23752
- Phone numbers found: 608
- Account numbers found: 513
--------------------
- Average Readability Score (Flesch-Kincaid): 6.83
- Average Lexical Diversity: 0.98
--------------------
- Total runtime: 583.56 seconds (9.73 minutes)
##############################
