In [2]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datasets import load_dataset

# Forget any previously imported `config` module so the import further down
# executes it again instead of handing back the cached copy.
sys.modules.pop('config', None)

# Put the parent of the current working directory on the import path;
# that is where the `config` module is looked up from.
current_path = Path.cwd()
project_root = current_path.parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Must stay below the sys.path adjustment above.
from config import config

print(f"Configuration loaded from: {config.config_path}")

# Echo the main directories resolved by the configuration object.
print("\nConfiguration Validation:")
for caption, attribute in (
    ("Project root", "project_root"),
    ("Data directory", "data_dir"),
    ("Models directory", "models_dir"),
):
    print(f"{caption}: {getattr(config, attribute)}")

# Paths whose presence on disk is reported below.
required_files = [
    config.raw_data_dir,
    config.summarization_data,
    config.sentiment_data,
    config.stopwords_file,
]

print("\nFile Check:")
for file_path in required_files:
    status = "Found" if file_path.exists() else "Missing"
    print(f"{file_path.name}: {status}")

print("\nConfiguration validation complete!")

Configuration loaded from: d:\Project\MajorProject\params.yaml

Configuration Validation:
Project root: d:\Project\MajorProject
Data directory: d:\Project\MajorProject\data
Models directory: d:\Project\MajorProject\models

File Check:
raw: Found
summary_clean.csv: Found
reviews_clean.csv: Found
vietnamese-stopwords.txt: Found

Configuration validation complete!


In [4]:
# Data Ingestion & Validation
# Load summarization dataset
print("\n1. Summarization Dataset:")
print("-" * 30)

# Only the load itself is best-effort: on failure df_summ is set to None,
# which the later cells test for before touching the data.
try:
    summ_data = load_dataset('csv', data_files=str(config.summarization_data))
    df_summ = summ_data['train'].to_pandas()
except Exception as e:
    print(f"Failed to load summarization dataset: {e}")
    df_summ = None

# The quality checks run outside the try block so that an error raised by a
# check is no longer reported as a load failure and no longer throws away a
# frame that was loaded successfully.
if df_summ is not None:
    print(f"Successfully loaded: {len(df_summ):,} samples")
    print(f"Dataset shape: {df_summ.shape}")
    print(f"Columns: {list(df_summ.columns)}")

    # Data validation
    print("\nData Quality Checks:")

    # Check for required columns
    required_cols = ['Text', 'Summary']
    missing_cols = [col for col in required_cols if col not in df_summ.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}")
        print(f"Available columns: {list(df_summ.columns)}")
    else:
        print("All required columns present")

    # Check for null values
    null_counts = df_summ.isnull().sum()
    if null_counts.any():
        print("Warning - Null values found:")
        for col, count in null_counts[null_counts > 0].items():
            print(f"  - {col}: {count} nulls ({count/len(df_summ)*100:.1f}%)")
    else:
        print("No null values found")

    # Check for empty strings in every required column that is present.
    # Nulls are excluded (they are reported above) and values are cast to str
    # so a non-string column cannot make the .str accessor raise.
    for col in required_cols:
        if col not in df_summ.columns:
            continue
        empty_count = df_summ[col].dropna().astype(str).str.strip().eq('').sum()
        if empty_count > 0:
            print(f"Warning - Empty {col} entries: {empty_count}")
        else:
            print(f"No empty {col} entries")

# Load sentiment dataset
print("\n2. Sentiment Dataset:")
print("-" * 30)

# Only the load itself is best-effort: on failure df_sent is set to None,
# which the later cells test for before touching the data.
try:
    sent_data = load_dataset('csv', data_files=str(config.sentiment_data))
    df_sent = sent_data['train'].to_pandas()
except Exception as e:
    print(f"Failed to load sentiment dataset: {e}")
    df_sent = None

# The quality checks run outside the try block so that an error raised by a
# check is no longer reported as a load failure and no longer throws away a
# frame that was loaded successfully.
if df_sent is not None:
    print(f"Successfully loaded: {len(df_sent):,} samples")
    print(f"Dataset shape: {df_sent.shape}")
    print(f"Columns: {list(df_sent.columns)}")

    # Data validation
    print("\nData Quality Checks:")

    # Check for required columns
    required_cols = ['comment', 'label']
    missing_cols = [col for col in required_cols if col not in df_sent.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}")
        # Same diagnostic the summarization check prints in this situation.
        print(f"Available columns: {list(df_sent.columns)}")
    else:
        print("All required columns present")

    # Check for null values
    null_counts = df_sent.isnull().sum()
    if null_counts.any():
        print("Warning - Null values found:")
        for col, count in null_counts[null_counts > 0].items():
            print(f"  - {col}: {count} nulls ({count/len(df_sent)*100:.1f}%)")
    else:
        print("No null values found")

    # Check label distribution. value_counts() leaves out null labels, while
    # the percentages are taken over all rows of the frame.
    if 'label' in df_sent.columns:
        label_counts = df_sent['label'].value_counts()
        print("\nLabel Distribution:")
        for label, count in label_counts.items():
            percentage = (count / len(df_sent)) * 100
            print(f"  - {label}: {count:,} samples ({percentage:.1f}%)")


1. Summarization Dataset:
------------------------------
Successfully loaded: 51,292 samples
Dataset shape: (51292, 2)
Columns: ['Text', 'Summary']

Data Quality Checks:
All required columns present
No null values found
No empty Text entries
No empty Summary entries

2. Sentiment Dataset:
------------------------------
Successfully loaded: 12,183 samples
Dataset shape: (12183, 4)
Columns: ['comment', 'label', 'rate', 'Unnamed: 3']

Data Quality Checks:
All required columns present
  - Unnamed: 3: 12169 nulls (99.9%)

Label Distribution:
  - POS: 4,061 samples (33.3%)
  - NEG: 4,061 samples (33.3%)
  - NEU: 4,061 samples (33.3%)


In [32]:
# Data Exploration & Analysis

# Set plot style
plt.style.use('seaborn-v0_8')
plt.rcParams["figure.figsize"] = (15, 10)

# Summarization Dataset Exploration
# Adds length / word-count / compression columns to df_summ; the visualization
# cell checks for 'content_words' and reads these columns.
if df_summ is not None:
    print("\n1. Summarization Dataset Exploration:")
    print("-" * 40)

    if 'Text' in df_summ.columns and 'Summary' in df_summ.columns:
        # Calculate text statistics
        df_summ['content_length'] = df_summ['Text'].str.len()
        df_summ['summary_length'] = df_summ['Summary'].str.len()
        df_summ['content_words'] = df_summ['Text'].str.split().str.len()
        df_summ['summary_words'] = df_summ['Summary'].str.split().str.len()

        # A text with zero words would make the ratio inf (or NaN for 0/0) and
        # turn the mean printed below into inf. Masking those denominators
        # gives NaN instead, which mean()/median() skip.
        valid_content_words = df_summ['content_words'].where(df_summ['content_words'] > 0)
        df_summ['compression_ratio'] = df_summ['summary_words'] / valid_content_words

        print("Text Length Statistics (Characters):")
        print(f"  Content - Mean: {df_summ['content_length'].mean():.0f}, Median: {df_summ['content_length'].median():.0f}")
        print(f"  Summary - Mean: {df_summ['summary_length'].mean():.0f}, Median: {df_summ['summary_length'].median():.0f}")

        print("\nWord Count Statistics:")
        print(f"  Content - Mean: {df_summ['content_words'].mean():.1f}, Median: {df_summ['content_words'].median():.0f}")
        print(f"  Summary - Mean: {df_summ['summary_words'].mean():.1f}, Median: {df_summ['summary_words'].median():.0f}")

        # Mean of the per-row ratios, not total summary words / total content words.
        mean_compression = df_summ['compression_ratio'].mean()
        print("\nCompression Analysis:")
        print(f"  Average compression ratio: {mean_compression:.3f}")
        print(f"  Median compression ratio: {df_summ['compression_ratio'].median():.3f}")
        print(f"  Summary is {mean_compression*100:.1f}% of original length")

        # Sample data preview (skipped for an empty frame, where iloc[0] would raise)
        if len(df_summ) > 0:
            print("\nSample Entry:")
            sample = df_summ.iloc[0]
            print(f"  Content ({sample['content_words']} words): {sample['Text'][:150]}...")
            print(f"  Summary ({sample['summary_words']} words): {sample['Summary']}")
            print(f"  Compression: {sample['compression_ratio']:.2%}")
    else:
        print(f"Expected Text and Summary columns not found. Available columns: {list(df_summ.columns)}")

# Sentiment Dataset Exploration
# Adds 'comment_length' and 'comment_words' columns to df_sent; the
# visualization cell checks for 'comment_words' and reads it.
if df_sent is not None:
    print("\n2. Sentiment Dataset Exploration:")
    print("-" * 40)

    if 'comment' in df_sent.columns:
        # Calculate text statistics
        df_sent['comment_length'] = df_sent['comment'].str.len()
        df_sent['comment_words'] = df_sent['comment'].str.split().str.len()

        print("Text Length Statistics:")
        print(f"  Characters - Mean: {df_sent['comment_length'].mean():.0f}, Median: {df_sent['comment_length'].median():.0f}")
        print(f"  Words - Mean: {df_sent['comment_words'].mean():.1f}, Median: {df_sent['comment_words'].median():.0f}")

        # Label analysis
        if 'label' in df_sent.columns:
            print("\nLabel Distribution Analysis:")
            label_counts = df_sent['label'].value_counts()

            # value_counts() already holds the per-label row count, so only the
            # subset's word statistics and sample comment need the filter.
            for label, count in label_counts.items():
                subset = df_sent[df_sent['label'] == label]
                percentage = (count / len(df_sent)) * 100
                avg_length = subset['comment_words'].mean()

                print(f"  {label}: {count:,} samples ({percentage:.1f}%) - Avg length: {avg_length:.1f} words")

                # Sample comment for each label: take the first non-null one,
                # because slicing a null (float NaN) comment raises TypeError.
                non_null_comments = subset['comment'].dropna()
                if not non_null_comments.empty:
                    sample_comment = str(non_null_comments.iloc[0])
                    print(f"    Sample: {sample_comment[:100]}...")

        # Data quality metrics
        print("\nData Quality Metrics:")

        # Very short comments (potential quality issues)
        short_comments = (df_sent['comment_words'] <= 3).sum()
        print(f"  Very short comments (≤3 words): {short_comments} ({short_comments/len(df_sent)*100:.1f}%)")

        # Very long comments (potential outliers)
        long_comments = (df_sent['comment_words'] >= 100).sum()
        print(f"  Very long comments (≥100 words): {long_comments} ({long_comments/len(df_sent)*100:.1f}%)")


1. Summarization Dataset Exploration:
----------------------------------------
Text Length Statistics (Characters):
  Content - Mean: 3229, Median: 2676
  Summary - Mean: 140, Median: 141

Word Count Statistics:
  Content - Mean: 704.5, Median: 581
  Summary - Mean: 30.5, Median: 31

Compression Analysis:
  Average compression ratio: 0.104
  Median compression ratio: 0.051
  Summary is 10.4% of original length

Sample Entry:
  Content (1016 words): Tập đoàn T&T Group và tập đoàn năng lượng Ørsted (Đan Mạch) vừa ký hợp tác trong lĩnh vực điện gió ngoài khơi tại Việt Nam. Theo đó, tổng công suất dự...
  Summary (29 words): Doanh nghiệp vừa ký kết hợp tác với hai đối tác Bỉ và Đan Mạch nhằm phát triển các dự án về năng lượng tái tạo tại Việt Nam.
  Compression: 2.85%

2. Sentiment Dataset Exploration:
----------------------------------------
Text Length Statistics:
  Characters - Mean: 34, Median: 26
  Words - Mean: 7.8, Median: 6

Label Distribution Analysis:
  POS: 17,789 samples (64.0

In [None]:
# Data Visualization & Summary
# Draws a 2x3 grid: top row for the summarization data, bottom row for the
# sentiment data. Relies on the columns added by the exploration cell.

def _plot_distribution(ax, values, title, xlabel, color, mean_format='.0f'):
    """Draw a 50-bin histogram of `values` on `ax` and mark its mean.

    Args:
        ax: Matplotlib axes to draw on.
        values: pandas Series of numbers; missing values are dropped first.
        title: Axes title.
        xlabel: Label for the x axis.
        color: Fill colour of the histogram bars.
        mean_format: Format spec applied to the mean in the legend entry.
    """
    values = values.dropna()
    ax.set_title(title)
    if values.empty:
        # Nothing to bin; leave a note instead of an empty histogram.
        ax.text(0.5, 0.5, 'No Data', ha='center', va='center', transform=ax.transAxes)
        return
    ax.hist(values, bins=50, alpha=0.7, color=color, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    mean_value = values.mean()
    ax.axvline(mean_value, color='red', linestyle='--',
               label=f'Mean: {format(mean_value, mean_format)}')
    ax.legend()


fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Vietnamese Text Dataset Analysis', fontsize=16, fontweight='bold')

# Summarization Dataset Visualizations
if df_summ is not None and 'content_words' in df_summ.columns:

    # 1. Content length distribution
    _plot_distribution(axes[0,0], df_summ['content_words'],
                       'Content Length Distribution', 'Words', 'skyblue')

    # 2. Summary length distribution
    _plot_distribution(axes[0,1], df_summ['summary_words'],
                       'Summary Length Distribution', 'Words', 'orange')

    # 3. Compression ratio distribution
    # Filter extreme outliers for better visualization (this comparison also
    # removes NaN and inf ratios).
    compression_filtered = df_summ['compression_ratio'][df_summ['compression_ratio'] <= 1.0]
    _plot_distribution(axes[0,2], compression_filtered,
                       'Compression Ratio Distribution', 'Compression Ratio',
                       'lightgreen', mean_format='.3f')

else:
    for i in range(3):
        axes[0,i].text(0.5, 0.5, 'Summarization\nData Not Available',
                      ha='center', va='center', transform=axes[0,i].transAxes)
        axes[0,i].set_title(f'Summarization Plot {i+1}')

# Sentiment Dataset Visualizations
if df_sent is not None and 'comment_words' in df_sent.columns:

    # 4. Comment length distribution
    _plot_distribution(axes[1,0], df_sent['comment_words'],
                       'Comment Length Distribution', 'Words', 'purple')

    if 'label' in df_sent.columns:
        # 5. Label distribution
        label_counts = df_sent['label'].value_counts()
        colors = ['lightcoral', 'lightblue', 'lightgreen'][:len(label_counts)]

        bars = axes[1,1].bar(label_counts.index, label_counts.values, color=colors, edgecolor='black')
        axes[1,1].set_title('Sentiment Label Distribution')
        axes[1,1].set_xlabel('Sentiment Label')
        axes[1,1].set_ylabel('Count')

        # Add percentage labels on bars
        for bar, count in zip(bars, label_counts.values):
            percentage = (count / len(df_sent)) * 100
            axes[1,1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + len(df_sent)*0.01,
                          f'{percentage:.1f}%', ha='center', va='bottom')

        # 6. Length by sentiment
        # Null labels are skipped (comparing against NaN selects no rows) and
        # null lengths are dropped before they reach boxplot.
        sentiment_labels = list(df_sent['label'].dropna().unique())
        sentiment_lengths = [
            df_sent.loc[df_sent['label'] == label, 'comment_words'].dropna()
            for label in sentiment_labels
        ]

        if sentiment_lengths:
            # Tick labels are set on the axes afterwards rather than through
            # boxplot's `labels` argument, which newer Matplotlib deprecates.
            axes[1,2].boxplot(sentiment_lengths)
            axes[1,2].set_xticklabels([str(label) for label in sentiment_labels])
        axes[1,2].set_title('Comment Length by Sentiment')
        axes[1,2].set_xlabel('Sentiment Label')
        axes[1,2].set_ylabel('Words')

else:
    for i in range(3):
        axes[1,i].text(0.5, 0.5, 'Sentiment\nData Not Available',
                      ha='center', va='center', transform=axes[1,i].transAxes)
        axes[1,i].set_title(f'Sentiment Plot {i+1}')

plt.tight_layout()
plt.show()



# Final Summary Report
# Prints an overview of both datasets, the result of the path check, and any
# quality warnings. Reads df_summ / df_sent and the `required_files` list
# built in the configuration cell.

# Thresholds behind the quality verdicts and warnings below.
MIN_COMPRESSION = 0.1   # below this the average summary/text ratio is flagged
MAX_COMPRESSION = 0.5   # above this the average summary/text ratio is flagged
MAX_IMBALANCE = 3       # largest accepted max/min label-count ratio

# Collected while building the overview and printed once at the end, so every
# separator line comes with the reason it was emitted.
report_warnings = []

print("\nDataset Overview:")
if df_summ is not None:
    print(f"  Summarization: {len(df_summ):,} samples ready for training")
    if 'compression_ratio' in df_summ.columns:
        avg_compression = df_summ['compression_ratio'].mean()
        print(f"average compression: {avg_compression:.1%}")
        print(f"quality: {'Good' if MIN_COMPRESSION <= avg_compression <= MAX_COMPRESSION else 'Needs Review'}")
        if avg_compression > MAX_COMPRESSION:
            report_warnings.append(
                f"Average compression ratio {avg_compression:.1%} exceeds {MAX_COMPRESSION:.0%}: "
                "summaries are long relative to their source texts"
            )
else:
    print("  Summarization: Dataset not loaded")

if df_sent is not None:
    print(f"  Sentiment: {len(df_sent):,} samples ready for training")
    if 'label' in df_sent.columns:
        label_counts = df_sent['label'].value_counts()
        if label_counts.empty:
            # No non-null labels: max()/min() would both be NaN.
            print("label balance: no labelled samples")
            report_warnings.append("Sentiment dataset contains no labelled samples")
        else:
            imbalance = label_counts.max() / label_counts.min()
            print(f"label balance: {imbalance:.1f}:1 ratio")
            print(f"quality: {'Balanced' if imbalance <= MAX_IMBALANCE else 'Imbalanced'}")
            if imbalance > MAX_IMBALANCE:
                report_warnings.append(
                    f"Label imbalance {imbalance:.1f}:1 exceeds {MAX_IMBALANCE}:1"
                )
else:
    print("  Sentiment: Dataset not loaded")

# Report the configuration status from an actual existence check instead of
# printing success unconditionally.
print("\nConfiguration Status:")
missing_paths = [path for path in required_files if not path.exists()]
if missing_paths:
    print(f"  Missing required paths: {[path.name for path in missing_paths]}")
else:
    print("  All config files validated")
    print("  Data paths confirmed")
    print("  Project structure verified")

print("\nReadiness Assessment:")
ready_count = sum([df_summ is not None, df_sent is not None])
print(f"  - Data readiness: {ready_count}/2 datasets available")
print(f"  - Status: {'Ready for model training!' if ready_count == 2 else 'Fix data issues before training'}")

if report_warnings:
    print("\nWarnings:")
    for message in report_warnings:
        print(f"  - {message}")
    print("=" * 60)

NameError: name 'plt' is not defined