# EDA Visualizations for Section II Report

Notebook này tạo các visualization chính xác từ dữ liệu gốc cho báo cáo Section II:
1. Bar chart phân bố CEFR
2. Pie chart tỷ lệ phần trăm các nhãn
3. Histogram phân bố độ dài văn bản
4. Box plot độ dài theo nhãn
5. Word cloud (với xử lý text đúng)

Source data: dataset/cefr_leveled_texts.csv

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import numpy as np
import re

# --- Stopwords ---------------------------------------------------------------
# Minimal English stopword list, used only when NLTK or its stopword corpus
# cannot be loaded.
BASIC_ENGLISH_STOP_WORDS = {'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
                            'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
                            'that', 'the', 'to', 'was', 'were', 'will', 'with'}

try:
    import nltk
    from nltk.corpus import stopwords
except ImportError:
    print("NLTK not found, using basic English stopwords...")
    ENGLISH_STOP_WORDS = set(BASIC_ENGLISH_STOP_WORDS)
else:
    try:
        ENGLISH_STOP_WORDS = set(stopwords.words('english'))
    except LookupError:
        # The corpus is not installed locally: fetch it once, then retry.
        print("Downloading NLTK stopwords...")
        nltk.download('stopwords', quiet=True)
        try:
            ENGLISH_STOP_WORDS = set(stopwords.words('english'))
        except LookupError:
            # The download did not make the corpus available (for example when
            # working offline); degrade to the basic list instead of crashing.
            print("NLTK stopwords unavailable, using basic English stopwords...")
            ENGLISH_STOP_WORDS = set(BASIC_ENGLISH_STOP_WORDS)

# --- Plot style --------------------------------------------------------------
# matplotlib renamed its bundled seaborn style sheet from 'seaborn' to
# 'seaborn-v0_8', and the old name now raises OSError. Pick whichever name this
# matplotlib version actually provides instead of catching every exception.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    print("Warning: seaborn style not available, using default style")

# seaborn's own theme is applied on top and does not depend on the style sheet.
sns.set_theme()
sns.set_palette('muted')

# --- Data --------------------------------------------------------------------
# Labels are upper-cased so that e.g. 'a1' and 'A1' are counted as one level.
df = pd.read_csv('dataset/cefr_leveled_texts.csv')
df['label'] = df['label'].str.upper()
print(f"Loaded {len(df)} texts from dataset")

Downloading NLTK stopwords...


OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [None]:
def clean_text_for_wordcloud(text):
    """Reduce a document to lowercase ASCII letters separated by single spaces.

    Every character other than a-z or whitespace (digits, punctuation,
    accented letters, ...) is replaced by a space, whitespace runs are then
    collapsed to one space, and the result is trimmed.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example a
            NaN from a missing CSV cell) is treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        # Lowercase first so the a-z character class also keeps former capitals.
        letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
        return re.sub(r'\s+', ' ', letters_only).strip()
    return ""

## 1. Bar Chart - CEFR Label Distribution

In [None]:
# Number of texts per CEFR level, in ascending difficulty order.
# fill_value=0 turns a level that is absent from the data into a zero-height bar;
# without it reindex() yields NaN and the int(v) label conversion below raises.
level_order = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
counts = df['label'].value_counts().reindex(level_order, fill_value=0)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title('CEFR Label Distribution', fontsize=14, pad=15)
ax.set_xlabel('CEFR Level')
ax.set_ylabel('Count')

# Add value labels on bars (placed one count unit above each bar top)
for i, v in enumerate(counts.values):
    ax.text(i, v + 1, str(int(v)), ha='center', va='bottom')

fig.tight_layout()
fig.savefig('cefr_label_distribution_fixed.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Pie Chart - Label Percentages

In [None]:
# Share of each CEFR level; reuses `counts` computed in the bar-chart cell.
fig, ax = plt.subplots(figsize=(8, 8))
muted_colors = sns.color_palette('muted')
ax.pie(
    counts.values,
    labels=counts.index,
    autopct='%1.1f%%',
    colors=muted_colors,
    startangle=90,
)
ax.set_title('CEFR Label Distribution (%)', pad=15)
# Equal aspect ratio keeps the pie circular.
ax.axis('equal')

fig.savefig('label_percentage_pie_fixed.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Text Length Distribution

In [None]:
# Document length = number of whitespace-separated tokens in the text column.
# The column is stored on df because the box-plot cell reads it as well.
df['length'] = df['text'].str.split().str.len()
median_len = df['length'].median()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='length', bins=40, ax=ax)
ax.set_title('Text Length Distribution (words per document)', fontsize=14, pad=15)
ax.set_xlabel('Number of Words')
ax.set_ylabel('Count')

# Mark the median with a dashed vertical line and show it in the legend.
ax.axvline(
    median_len,
    color='red',
    linestyle='--',
    label=f'Median: {median_len:.0f} words',
)
ax.legend()

fig.savefig('story_length_distribution_fixed.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics of the length column
print("\nText length statistics:")
print(df['length'].describe())

## 4. Box Plot - Length by Label

In [None]:
# One box per CEFR level, ordered from A1 to C2; uses the `length` column
# added to df by the histogram cell.
level_order = ['A1','A2','B1','B2','C1','C2']

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='label', y='length', order=level_order, ax=ax)
ax.set_title('Text Length Distribution by CEFR Level', fontsize=14, pad=15)
ax.set_xlabel('CEFR Level')
ax.set_ylabel('Number of Words')

fig.savefig('length_boxplot_by_label_fixed.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Word Cloud

Generate a word cloud from the cleaned text of all documents, with English stopwords excluded.

In [None]:
# Clean every document and concatenate them into a single corpus string.
all_texts = ' '.join(df['text'].apply(clean_text_for_wordcloud))

# Build the word cloud.
# random_state pins the otherwise random word placement and colouring, so the
# saved figure is identical on every Restart & Run All.
# NOTE(review): WordCloud's default collocations=True also renders frequent
# two-word phrases; pass collocations=False if only single words are wanted.
wordcloud = WordCloud(
    width=1200,
    height=600,
    background_color='white',
    stopwords=ENGLISH_STOP_WORDS,
    max_words=100,
    colormap='viridis',
    random_state=42,
).generate(all_texts)

# Display the rendered image without axes and save it for the report.
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Common Words Across All Texts', fontsize=16, pad=20)

fig.savefig('wordcloud_fixed.png', dpi=300, bbox_inches='tight')
plt.show()