# 02 - Text Preprocessing

## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook covers the following text preprocessing steps:

1. Text cleaning and normalization
2. Tokenization
3. Stopword removal
4. Lemmatization
5. Bag-of-Words and TF-IDF representations
6. Corpus separation (Public vs Media)

In [None]:
import pandas as pdimport numpy as npimport reimport nltkfrom nltk.tokenize import word_tokenizefrom nltk.corpus import stopwordsfrom nltk.stem import WordNetLemmatizerfrom sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizerimport matplotlib.pyplot as pltimport seaborn as snsimport warningswarnings.filterwarnings('ignore')# Download NLTK resourcesnltk.download('punkt', quiet=True)nltk.download('punkt_tab', quiet=True)nltk.download('stopwords', quiet=True)nltk.download('wordnet', quiet=True)

## 2.1 Load Raw Data

In [None]:
# Read the three raw sources from the shared data directory.
df_reddit = pd.read_csv('../data/reddit_posts.csv')
df_webmd = pd.read_csv('../data/webmd_reviews.csv')
df_news = pd.read_csv('../data/news_articles.csv')

# Create unified corpora: every source is reduced to the same three columns
# and tagged with the name of the source it came from.
doc_columns = ['id', 'text', 'date']

# Public corpus = Reddit posts followed by WebMD reviews, re-indexed from 0.
public_sources = {'reddit': df_reddit, 'webmd': df_webmd}
df_public = pd.concat(
    [frame[doc_columns].assign(source=label) for label, frame in public_sources.items()],
    ignore_index=True,
)

# Media corpus = news articles only.
df_media = df_news[doc_columns].assign(source='news')

n_public = len(df_public)
n_media = len(df_media)
print(f"Public corpus: {n_public} documents")
print(f"Media corpus: {n_media} documents")

## 2.2 Text Cleaning Pipeline

Our preprocessing pipeline:

1. Convert to lowercase
2. Remove special characters and numbers
3. Tokenize
4. Remove stopwords and tokens shorter than three characters
5. Lemmatize

In [None]:
stop_words = set(stopwords.words('english'))
# Add domain-specific stopwords
stop_words.update(['mg', 'would', 'also', 'get', 'got', 'one', 'like', 'even', 'im', 'ive'])

lemmatizer = WordNetLemmatizer()

# Compiled once instead of on every call. Matches each run of characters that
# is neither a lowercase ASCII letter nor whitespace (the text is lowercased
# before this pattern is applied).
_NON_LETTERS = re.compile(r'[^a-z\s]+')

# Tokens shorter than this many characters are discarded.
MIN_TOKEN_LEN = 3


def preprocess(text):
    """Clean one raw document and return it as a space-joined string of lemmas.

    Steps:
      1. Missing values (None, NaN, pd.NA) become an empty string.
      2. The text is lowercased.
      3. Every run of non-letter characters is replaced by a single space.
      4. The text is tokenized with ``word_tokenize``.
      5. Stopwords and tokens shorter than ``MIN_TOKEN_LEN`` are dropped.
      6. Surviving tokens are lemmatized; lemmas that are themselves
         stopwords are dropped as well.

    Parameters
    ----------
    text : object
        Raw document. Non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be empty.
    """
    # Without this guard, str(NaN) / str(None) would inject the literal
    # tokens "nan" / "none" into the corpus for every missing document.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''

    # Lowercase
    text = str(text).lower()

    # Remove special characters and numbers. They are replaced with a space
    # rather than deleted so that words joined by punctuation stay separate
    # ("weight-loss" -> "weight loss") and contractions split into pieces
    # instead of fusing into new words ("don't" -> "don t", not "dont").
    text = _NON_LETTERS.sub(' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and short tokens, lemmatize
    lemmas = []
    for token in tokens:
        if len(token) < MIN_TOKEN_LEN or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # An inflected form can lemmatize to a listed stopword (e.g. a plural
        # such as "likes" can reduce to "like"), so check the lemma as well.
        if lemma in stop_words:
            continue
        lemmas.append(lemma)
    return ' '.join(lemmas)


# Apply preprocessing
df_public['clean'] = df_public['text'].apply(preprocess)
df_media['clean'] = df_media['text'].apply(preprocess)

# Show examples
print("=== Original ===")
print(df_public['text'].iloc[0])
print("\n=== Cleaned ===")
print(df_public['clean'].iloc[0])

## 2.3 Document Statistics

In [None]:
# Token counts: number of whitespace-separated tokens in each cleaned document.
for corpus in (df_public, df_media):
    corpus['token_count'] = corpus['clean'].apply(lambda doc: len(doc.split()))

# One histogram per corpus, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (df_public, 'Public: Token Count Distribution', '#2196F3'),
    (df_media, 'Media: Token Count Distribution', '#FF9800'),
]
for ax, (corpus, title, colour) in zip(axes, panels):
    ax.hist(corpus['token_count'], bins=30, color=colour, alpha=0.8, edgecolor='white')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Number of Tokens')
plt.tight_layout()
plt.show()

# Summary statistics for each corpus.
public_counts = df_public['token_count']
media_counts = df_media['token_count']
print(f"Public - Mean tokens: {public_counts.mean():.1f}, Median: {public_counts.median():.1f}")
print(f"Media  - Mean tokens: {media_counts.mean():.1f}, Median: {media_counts.median():.1f}")

## 2.4 Bag-of-Words Representation

In [None]:
# Bag of Words: keep at most 3000 terms, each appearing in at least 5 documents.
bow_vectorizer = CountVectorizer(max_features=3000, min_df=5)

# Fit on combined corpus (public documents first, then media documents).
all_clean = pd.concat([df_public['clean'], df_media['clean']])
bow_matrix = bow_vectorizer.fit_transform(all_clean)

print(f"BoW matrix shape: {bow_matrix.shape}")
print(f"Vocabulary size: {len(bow_vectorizer.vocabulary_)}")

# Most common words: sum each term's column to get its corpus-wide count.
column_totals = bow_matrix.sum(axis=0)
word_freq = np.array(column_totals).flatten()
feature_names = bow_vectorizer.get_feature_names_out()

# Rank (term, count) pairs by count, highest first, and keep the first 20.
ranked_terms = sorted(
    zip(feature_names, word_freq),
    key=lambda pair: pair[1],
    reverse=True,
)
top_20 = ranked_terms[:20]

print("\nTop 20 most frequent terms:")
for term, freq in top_20:
    print(f"  {term:20s} {freq:>6.0f}")

## 2.5 TF-IDF Representation

In [None]:
# TF-IDF: each corpus gets its own vectorizer, so each has its own vocabulary.
# Both use unigrams + bigrams capped at 5000 features; only min_df differs.
shared_tfidf_args = dict(max_features=5000, ngram_range=(1, 2))

# Public corpus TF-IDF (terms must occur in at least 5 documents).
tfidf_vectorizer = TfidfVectorizer(min_df=5, **shared_tfidf_args)
tfidf_public = tfidf_vectorizer.fit_transform(df_public['clean'])
print(f"Public TF-IDF shape: {tfidf_public.shape}")

# Media corpus TF-IDF (terms must occur in at least 3 documents).
tfidf_vectorizer2 = TfidfVectorizer(min_df=3, **shared_tfidf_args)
tfidf_media = tfidf_vectorizer2.fit_transform(df_media['clean'])
print(f"Media TF-IDF shape: {tfidf_media.shape}")

## 2.6 Save Processed Data

In [None]:
# Write each processed corpus to its own CSV file, without the index column.
outputs = [
    (df_public, '../data/public_processed.csv'),
    (df_media, '../data/media_processed.csv'),
]
for frame, destination in outputs:
    frame.to_csv(destination, index=False)

# Confirm the save and report how many documents each file contains.
print("Processed data saved successfully!")
n_public_docs = len(df_public)
n_media_docs = len(df_media)
print(f"Public: {n_public_docs} documents")
print(f"Media: {n_media_docs} documents")