___
1. NLP in Practice

___

# Exploratory Data Analysis on Text Data

In this notebook we'll apply everything we've learned so far to perform **Exploratory Data Analysis (EDA)** on a real text dataset. We'll use spaCy for NLP processing and matplotlib/seaborn for visualization.

**Dataset:** 20 Newsgroups - a classic dataset for text classification containing ~20,000 newsgroup posts across 20 topics.

# Setup

In [None]:
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot defaults shared by the figures in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

# Small English spaCy pipeline, loaded once and reused by later cells
nlp = spacy.load("en_core_web_sm")

# Load the Dataset

The 20 Newsgroups dataset is available directly from scikit-learn.

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the download to four newsgroups to keep processing fast
categories = [
    'sci.space',
    'rec.sport.baseball',
    'comp.graphics',
    'talk.politics.misc',
]

# Strip headers, footers and quoted replies so only the message body remains
stripped_parts = ('headers', 'footers', 'quotes')
newsgroups = fetch_20newsgroups(subset='train', categories=categories, remove=stripped_parts)

print(f"Number of documents: {len(newsgroups.data)}")
print(f"Categories: {newsgroups.target_names}")

In [None]:
# One row per post: raw text plus the human-readable category name
category_names = newsgroups.target_names
df = pd.DataFrame({'text': newsgroups.data}).assign(
    category=[category_names[target_id] for target_id in newsgroups.target]
)

df.head()

In [None]:
# Preview the first 500 characters of the first document
first_text = df['text'].iloc[0]
separator = "-" * 50

print("Sample document:")
print(separator)
print(first_text[:500])

# Basic Statistics

In [None]:
# Length features: characters per post and whitespace-separated words per post
df['char_count'] = df['text'].map(len)
df['word_count'] = [len(post.split()) for post in df['text']]

length_preview_columns = ['category', 'char_count', 'word_count']
df[length_preview_columns].head(10)

In [None]:
# Descriptive statistics (count, mean, quartiles, ...) for both length columns
df.loc[:, ['char_count', 'word_count']].describe()

# Category Distribution

In [None]:
# Count documents per category
category_counts = df['category'].value_counts()

plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecated `palette` without `hue`: map the categorical axis to
# `hue` as well and suppress the (redundant) legend to keep one colour per bar.
sns.barplot(
    x=category_counts.values,
    y=category_counts.index,
    hue=category_counts.index,
    palette='viridis',
    legend=False,
)
plt.xlabel('Number of Documents')
plt.ylabel('Category')
plt.title('Document Distribution by Category')
plt.tight_layout()
plt.show()

# Text Length Distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, axis label, extra hist options) for the left and right panels
length_panels = [
    ('char_count', 'Character Count', {}),
    ('word_count', 'Word Count', {'color': 'orange'}),
]

for ax, (column, axis_label, hist_options) in zip(axes, length_panels):
    values = df[column]
    median_value = values.median()

    ax.hist(values, bins=50, edgecolor='black', alpha=0.7, **hist_options)
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of {axis_label}')
    # Dashed red line marks the median of the column
    ax.axvline(median_value, color='red', linestyle='--', label=f'Median: {median_value:.0f}')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Text length by category
plt.figure(figsize=(12, 6))
# seaborn 0.13 deprecated `palette` without `hue`: colour the boxes by the same
# variable that is on the x axis and drop the redundant legend.
sns.boxplot(
    data=df,
    x='category',
    y='word_count',
    hue='category',
    palette='Set2',
    legend=False,
)
plt.xlabel('Category')
plt.ylabel('Word Count')
plt.title('Word Count Distribution by Category')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# NLP Processing with spaCy

Now let's process the texts with spaCy to extract linguistic features.

In [None]:
# Run spaCy on a fixed random sample only; the full dataset would take too long.
sample_size = 500

# Seeded draw, then a fresh 0..n-1 index so row labels line up with positions in `docs`
df_sample = df.sample(n=sample_size, random_state=42)
df_sample = df_sample.reset_index(drop=True)

print(f"Processing {sample_size} documents with spaCy...")

# nlp.pipe streams the texts through the pipeline in batches of 50
docs = [parsed for parsed in nlp.pipe(df_sample['text'], batch_size=50)]

print("Done!")

# Token Analysis

In [None]:
# Keep every token that is neither punctuation nor whitespace
content_tokens = [
    tok
    for doc in docs
    for tok in doc
    if not (tok.is_punct or tok.is_space)
]

# Lower-cased surface forms, with and without stopwords
all_tokens = [tok.text.lower() for tok in content_tokens]
all_tokens_no_stop = [tok.text.lower() for tok in content_tokens if not tok.is_stop]

print(f"Total tokens: {len(all_tokens)}")
print(f"Tokens without stopwords: {len(all_tokens_no_stop)}")
print(f"Unique tokens: {len(set(all_tokens))}")
print(f"Unique tokens (no stopwords): {len(set(all_tokens_no_stop))}")

In [None]:
# Top 20 most common tokens (with stopwords)
token_freq = Counter(all_tokens)
top_tokens = token_freq.most_common(20)

plt.figure(figsize=(12, 6))
words, counts = zip(*top_tokens)
# seaborn 0.13 deprecated `palette` without `hue`: map the tokens to `hue` too
# and hide the redundant legend so each bar keeps its own shade.
sns.barplot(
    x=list(counts),
    y=list(words),
    hue=list(words),
    palette='Blues_d',
    legend=False,
)
plt.xlabel('Frequency')
plt.ylabel('Token')
plt.title('Top 20 Most Common Tokens (with stopwords)')
plt.tight_layout()
plt.show()

In [None]:
# Top 20 most common tokens (without stopwords)
token_freq_no_stop = Counter(all_tokens_no_stop)
top_tokens_no_stop = token_freq_no_stop.most_common(20)

plt.figure(figsize=(12, 6))
words, counts = zip(*top_tokens_no_stop)
# seaborn 0.13 deprecated `palette` without `hue`: map the tokens to `hue` too
# and hide the redundant legend so each bar keeps its own shade.
sns.barplot(
    x=list(counts),
    y=list(words),
    hue=list(words),
    palette='Greens_d',
    legend=False,
)
plt.xlabel('Frequency')
plt.ylabel('Token')
plt.title('Top 20 Most Common Tokens (without stopwords)')
plt.tight_layout()
plt.show()

# Lemma Analysis

Here we compare raw tokens with their lemmas to see how much lemmatization shrinks the vocabulary.

In [None]:
# Lower-cased lemma of every token that is not punctuation, whitespace or a stopword
all_lemmas = [
    tok.lemma_.lower()
    for doc in docs
    for tok in doc
    if not (tok.is_punct or tok.is_space or tok.is_stop)
]

# Vocabulary sizes before and after lemmatization
n_unique_tokens = len(set(all_tokens_no_stop))
n_unique_lemmas = len(set(all_lemmas))

print(f"Unique tokens (no stopwords): {n_unique_tokens}")
print(f"Unique lemmas (no stopwords): {n_unique_lemmas}")
print(f"Vocabulary reduction: {(1 - n_unique_lemmas/n_unique_tokens)*100:.1f}%")

In [None]:
# Top 20 most common lemmas
lemma_freq = Counter(all_lemmas)
top_lemmas = lemma_freq.most_common(20)

plt.figure(figsize=(12, 6))
words, counts = zip(*top_lemmas)
# seaborn 0.13 deprecated `palette` without `hue`: map the lemmas to `hue` too
# and hide the redundant legend so each bar keeps its own shade.
sns.barplot(
    x=list(counts),
    y=list(words),
    hue=list(words),
    palette='Oranges_d',
    legend=False,
)
plt.xlabel('Frequency')
plt.ylabel('Lemma')
plt.title('Top 20 Most Common Lemmas (without stopwords)')
plt.tight_layout()
plt.show()

# Part of Speech (POS) Distribution

In [None]:
# Coarse POS tag of every token that is neither punctuation nor whitespace
pos_tags = [
    tok.pos_
    for doc in docs
    for tok in doc
    if not (tok.is_punct or tok.is_space)
]

pos_freq = Counter(pos_tags)
n_pos_tags = len(pos_tags)

# Tags listed from most to least frequent, with their share of all counted tokens
print("POS Tag Distribution:")
for pos, count in pos_freq.most_common():
    print(f"  {pos}: {count} ({count/n_pos_tags*100:.1f}%)")

In [None]:
# POS distribution visualization
pos_df = pd.DataFrame(pos_freq.most_common(), columns=['POS', 'Count'])

plt.figure(figsize=(12, 6))
# seaborn 0.13 deprecated `palette` without `hue`: colour the bars by the POS
# column that is already on the x axis and hide the redundant legend.
sns.barplot(
    data=pos_df,
    x='POS',
    y='Count',
    hue='POS',
    palette='Spectral',
    legend=False,
)
plt.xlabel('Part of Speech')
plt.ylabel('Frequency')
plt.title('Part of Speech Distribution')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Same POS counts as shares of a pie, using the explicit Axes interface
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(pos_df['Count'], labels=pos_df['POS'], autopct='%1.1f%%', startangle=90)
ax.set_title('Part of Speech Distribution')
fig.tight_layout()
plt.show()

# Named Entity Recognition (NER) Analysis

In [None]:
# Collect (text, label) for every entity span spaCy found, in document order
found_entities = [(ent.text, ent.label_) for doc in docs for ent in doc.ents]

# Parallel lists: entity surface text and its entity type
entities = [ent_text for ent_text, _ in found_entities]
entity_labels = [ent_label for _, ent_label in found_entities]

print(f"Total entities found: {len(entities)}")
print(f"Unique entities: {len(set(entities))}")

In [None]:
# Entity type distribution
label_freq = Counter(entity_labels)
label_df = pd.DataFrame(label_freq.most_common(), columns=['Entity Type', 'Count'])

plt.figure(figsize=(12, 6))
# seaborn 0.13 deprecated `palette` without `hue`: colour the bars by the entity
# type that is already on the x axis and hide the redundant legend.
sns.barplot(
    data=label_df,
    x='Entity Type',
    y='Count',
    hue='Entity Type',
    palette='coolwarm',
    legend=False,
)
plt.xlabel('Entity Type')
plt.ylabel('Frequency')
plt.title('Named Entity Type Distribution')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# One row per entity mention, with its type
entity_df = pd.DataFrame({'entity': entities, 'label': entity_labels})

# Report the ten most frequent mentions for people, organisations and
# geopolitical entities, separated by a blank line.
reported_types = ('PERSON', 'ORG', 'GPE')
for position, ent_type in enumerate(reported_types):
    if position > 0:
        print()
    print(f"Top 10 {ent_type} entities:")
    mentions = entity_df.loc[entity_df['label'] == ent_type, 'entity']
    print(mentions.value_counts().head(10))

In [None]:
# Visualize top entities
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, label, color in zip(axes, ['PERSON', 'ORG', 'GPE'], ['Blues_d', 'Greens_d', 'Reds_d']):
    top_ents = entity_df[entity_df['label'] == label]['entity'].value_counts().head(10)
    if len(top_ents) > 0:
        # seaborn 0.13 deprecated `palette` without `hue`: map the entity names
        # to `hue` as well and hide the redundant legend.
        sns.barplot(
            x=top_ents.values,
            y=top_ents.index,
            hue=top_ents.index,
            palette=color,
            legend=False,
            ax=ax,
        )
        ax.set_xlabel('Frequency')
        ax.set_ylabel('Entity')
        ax.set_title(f'Top 10 {label} Entities')
    else:
        # Nothing to plot for this type: say so instead of leaving bare axes
        ax.set_title(f'No {label} entities found')
        ax.axis('off')

plt.tight_layout()
plt.show()

# Word Cloud Visualization

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Feed the lemma stream (stopwords already removed) to WordCloud as one string
text_for_wordcloud = ' '.join(all_lemmas)

cloud_options = dict(
    width=1200,
    height=600,
    background_color='white',
    colormap='viridis',
    max_words=100,
)
wordcloud = WordCloud(**cloud_options).generate(text_for_wordcloud)

# Show the rendered cloud as an image with the axes hidden
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud (Lemmas, no stopwords)', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Word cloud per category
sample_categories = df_sample['category'].unique()

# Size the grid from the categories actually present in the sample instead of
# assuming exactly four; with four categories this is the same 2x2, 16x12 figure.
n_cols = 2
n_rows = max(1, -(-len(sample_categories) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)

for ax, category in zip(axes.flat, sample_categories):
    # df_sample carries a 0..n-1 index in the same order as `docs`, so its
    # row labels can be used directly as positions into `docs`.
    category_indices = df_sample[df_sample['category'] == category].index.tolist()

    # Lower-cased lemmas of this category, skipping punctuation, whitespace and stopwords
    category_lemmas = [
        token.lemma_.lower()
        for i in category_indices
        for token in docs[i]
        if not token.is_punct and not token.is_space and not token.is_stop
    ]

    if category_lemmas:
        text = ' '.join(category_lemmas)
        wc = WordCloud(width=600, height=400, background_color='white', colormap='viridis').generate(text)
        ax.imshow(wc, interpolation='bilinear')
        ax.set_title(category, fontsize=14)

# Hide the frame of every panel, including any left unused by the loop above
for ax in axes.flat:
    ax.axis('off')

plt.suptitle('Word Clouds by Category', fontsize=16)
plt.tight_layout()
plt.show()

# Sentence Analysis

In [None]:
# Per-sentence token counts (pooled over all documents) and sentence counts per document
sentence_lengths = []
sentences_per_doc = []

for doc in docs:
    lengths_in_doc = [len(sent) for sent in doc.sents]
    sentences_per_doc.append(len(lengths_in_doc))
    sentence_lengths.extend(lengths_in_doc)

print(f"Total sentences: {len(sentence_lengths)}")
print(f"Average sentence length: {np.mean(sentence_lengths):.1f} tokens")
print(f"Average sentences per document: {np.mean(sentences_per_doc):.1f}")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, bins, colour, x label, title) for the left and right panels
sentence_panels = [
    (sentence_lengths, 50, 'purple', 'Sentence Length (tokens)', 'Distribution of Sentence Length'),
    (sentences_per_doc, 30, 'teal', 'Sentences per Document', 'Distribution of Sentences per Document'),
]

for ax, (values, n_bins, bar_color, x_label, panel_title) in zip(axes, sentence_panels):
    mean_value = np.mean(values)

    ax.hist(values, bins=n_bins, edgecolor='black', alpha=0.7, color=bar_color)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    # Dashed red line marks the mean of the plotted values
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.1f}')
    ax.legend()

plt.tight_layout()
plt.show()

# Dependency Parsing Visualization

Let's visualize the dependency structure of a sample sentence using displacy.

In [None]:
from spacy import displacy

# Find a good sample sentence (not too long, not too short).
# Search every processed document in order rather than only docs[0]: the first
# document may be empty or contain no sentence of a suitable length, in which
# case nothing would be rendered at all.
sample_sentences = [
    sent
    for doc in docs
    for sent in doc.sents
    if 5 < len(sent) < 15
]

if sample_sentences:
    sample_sent = sample_sentences[0]
    print(f"Sample sentence: {sample_sent}")
    print()
    displacy.render(sample_sent, style='dep', jupyter=True)
else:
    print("No sentence with between 6 and 14 tokens was found in the processed sample.")

# NER Visualization

In [None]:
# Find a document with entities.
# Only the first 200 tokens are rendered, so count entities inside that excerpt;
# counting over the whole document can select one whose excerpt shows none.
for doc in docs[:20]:
    excerpt = doc[:200]
    if len(excerpt.ents) >= 3:
        print("Sample document with entities:")
        displacy.render(excerpt, style='ent', jupyter=True)
        break
else:
    # Loop finished without a break: no candidate qualified
    print("None of the first 20 documents has at least 3 entities in its first 200 tokens.")

# Summary Statistics

In [None]:
# Headline numbers gathered from the cells above
n_unique_lemmas = len(set(all_lemmas))
n_unique_tokens_no_stop = len(set(all_tokens_no_stop))
# Relative shrinkage of the stopword-free vocabulary after lemmatization, in percent
lemma_reduction_pct = (1 - n_unique_lemmas/n_unique_tokens_no_stop)*100

summary = dict([
    ('Total Documents', len(df)),
    ('Sample Size (processed)', sample_size),
    ('Categories', len(df['category'].unique())),
    ('Avg Words per Document', f"{df['word_count'].mean():.1f}"),
    ('Total Tokens (sample)', len(all_tokens)),
    ('Unique Tokens (sample)', len(set(all_tokens))),
    ('Unique Lemmas (sample)', n_unique_lemmas),
    ('Vocabulary Reduction (lemmatization)', f"{lemma_reduction_pct:.1f}%"),
    ('Total Entities Found', len(entities)),
    ('Unique Entity Types', len(set(entity_labels))),
    ('Avg Sentences per Document', f"{np.mean(sentences_per_doc):.1f}"),
    ('Avg Tokens per Sentence', f"{np.mean(sentence_lengths):.1f}"),
])

# Two-column table: metric name and its value, in insertion order
summary_df = pd.DataFrame({'Metric': list(summary.keys()), 'Value': list(summary.values())})
summary_df

# Key Takeaways

In this EDA we:

1. **Loaded and explored** the 20 Newsgroups dataset
2. **Analyzed text length** distributions (characters, words, sentences)
3. **Compared preprocessing** effects (tokens vs lemmas, with/without stopwords)
4. **Examined POS tag** distributions
5. **Extracted and visualized** named entities
6. **Created word clouds** for the overall corpus and by category
7. **Used displacy** to visualize dependency parsing and NER

These techniques form the foundation for understanding any text dataset before building models.