In [2]:
# Vietnamese Restaurant Review Data Cleaning
# This notebook cleans the raw scraped data and prepares it for labeling

import re
import unicodedata

import numpy as np
import pandas as pd

# Read the raw reviews CSV into `df`, then report its shape and preview the first rows
print("📊 Loading raw review data...")
df = pd.read_csv(
    filepath_or_buffer='../duancntt/foody_reviews_multi.csv',
    encoding='utf-8',
)
print(f"Original dataset shape: {df.shape}")
# Header line and preview are emitted by one call; `sep` keeps them on separate lines
print("\n🔍 First few rows:", df.head(), sep="\n")

# === CLEANING STEP 1: Remove rating numbers from text ===
def clean_rating_from_text(text):
    """Strip a leading rating number (e.g. '1.2', '6.0', '8.0') from a review.

    Only a well-formed number at the very start of the text is removed: one or
    more digits with an optional decimal part, followed by whitespace or the
    end of the text. A leading ellipsis ("... ngon") or a numbered-list marker
    ("1. ...") is therefore left untouched.

    Args:
        text: Raw review text. Non-string values are converted with ``str``.

    Returns:
        The text without the leading rating and without surrounding
        whitespace. Missing values (as judged by ``pd.isna``) are returned
        unchanged.
    """
    if pd.isna(text):
        return text

    # Strip first: leading whitespace would otherwise defeat the `^` anchor
    # and leave the rating in place.
    stripped = str(text).strip()

    # `(?:\s+|$)` also covers a text that consists of the rating alone.
    return re.sub(r'^\d+(?:\.\d+)?(?:\s+|$)', '', stripped).strip()

# Element-wise pass over the raw `text` column; the result starts the `cleaned_text` column
df['cleaned_text'] = df['text'].map(clean_rating_from_text)

# === CLEANING STEP 2: Remove "...Xem thêm" and similar endings ===
def remove_truncation_markers(text):
    """Remove trailing truncation markers such as '...Xem thêm' or '...Read more'.

    Two things are stripped from the END of the text only:

    1. an ellipsis ("..", "...", or "…") followed by the link text
       "Xem thêm" / "Read more" (case-insensitive, optional whitespace
       around it);
    2. any remaining trailing ellipsis, including mixed runs like "… ..".

    An ellipsis in the middle of a review, a single final period, and the
    words "xem thêm" without a preceding ellipsis are all left alone.

    Args:
        text: Review text. Non-string values are converted with ``str``.

    Returns:
        The text without trailing truncation markers and without surrounding
        whitespace. Missing values (as judged by ``pd.isna``) are returned
        unchanged.
    """
    if pd.isna(text):
        return text

    ellipsis = r'(?:\.{2,}|…)'
    # "thêm" is accepted both precomposed (ê) and decomposed (e + U+0302),
    # since text from the web can arrive in either Unicode form.
    see_more = r'(?:Xem\s+th(?:ê|e\u0302)m|Read\s+more)'

    # Strip up front so trailing whitespace cannot hide a marker from `$`.
    cleaned = str(text).strip()
    cleaned = re.sub(rf'\s*{ellipsis}\s*{see_more}\s*$', '', cleaned, flags=re.IGNORECASE)
    # After the first ellipsis, swallow any mix of dots, "…" and whitespace up
    # to the end. A flat character class avoids nested-quantifier backtracking
    # on long runs of dots.
    cleaned = re.sub(rf'\s*{ellipsis}[\s.…]*$', '', cleaned)

    return cleaned.strip()

# Second pass over `cleaned_text`: drop trailing truncation markers element-wise
df['cleaned_text'] = df['cleaned_text'].map(remove_truncation_markers)

# === CLEANING STEP 3: Filter out very short or very long reviews ===
def is_valid_review_length(text, min_length=10, max_length=1000):
    """Check whether a review's length lies within inclusive bounds.

    Args:
        text: Review text. Non-string values are measured via ``str(text)``.
        min_length: Smallest accepted number of characters (default 10).
        max_length: Largest accepted number of characters (default 1000).

    Returns:
        ``True`` when ``min_length <= len(str(text)) <= max_length``;
        ``False`` otherwise, including for missing values (``pd.isna``).
    """
    if pd.isna(text):
        return False
    return min_length <= len(str(text)) <= max_length

# Flag each review whose cleaned text passes the length check, then report the count
df['is_valid_length'] = df['cleaned_text'].map(is_valid_review_length)
print(f"\n📏 Reviews with valid length: {df['is_valid_length'].sum()}/{len(df)}")

# === CLEANING STEP 4: Remove reviews with too many special characters ===
def has_reasonable_text_ratio(text, min_ratio=0.7):
    """Check that most of a review consists of ordinary text characters.

    "Ordinary" means Latin and Vietnamese letters (either case), digits,
    whitespace, and the punctuation ``. , ! ?``. Everything else (emoji,
    symbols, other punctuation) counts against the ratio.

    Args:
        text: Review text. Non-string values are converted with ``str``.
        min_ratio: The share of ordinary characters must be strictly greater
            than this value (default 0.7).

    Returns:
        ``True`` when the share of ordinary characters exceeds ``min_ratio``;
        ``False`` otherwise, including for missing values and empty text.
    """
    if pd.isna(text):
        return False

    # Compose accents into single code points (NFC). In decomposed text every
    # tone/diacritic mark is a separate combining character that the class
    # below does not list, which would make valid Vietnamese look "special".
    text = unicodedata.normalize('NFC', str(text))
    if not text:
        return False

    # The class spells out lowercase accented letters only; IGNORECASE makes
    # their uppercase forms (e.g. "Ắ", "Ư", "Ợ") count as ordinary too.
    vietnamese_chars = re.findall(
        r'[a-zA-ZàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđĐ0-9\s.,!?]',
        text,
        flags=re.IGNORECASE,
    )

    return len(vietnamese_chars) / len(text) > min_ratio

# Flag each review whose cleaned text passes the character-ratio check, then report the count
df['has_reasonable_text'] = df['cleaned_text'].map(has_reasonable_text_ratio)
print(f"📝 Reviews with reasonable text ratio: {df['has_reasonable_text'].sum()}/{len(df)}")

# === CLEANING STEP 5: Normalize text (optional cleanup) ===
def normalize_text(text):
    """Collapse whitespace and repeated punctuation in a review.

    Applied in order: any whitespace run becomes one space; runs of two or
    more '!' become '!'; runs of two or more '?' become '?'; runs of three or
    more '.' become exactly '...'. The result is stripped of surrounding
    whitespace. Missing values (``pd.isna``) are returned unchanged.
    """
    if pd.isna(text):
        return text

    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'\s+', ' '),
        (r'[!]{2,}', '!'),
        (r'[?]{2,}', '?'),
        (r'[.]{3,}', '...'),
    )

    result = str(text)
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip()

df['normalized_text'] = df['cleaned_text'].map(normalize_text)

# === FILTERING: Keep only good quality reviews ===
# A row survives only if it passed both checks and still has text after normalization.
# NOTE(review): `is_valid_length` was computed on `cleaned_text`, before normalization.
has_text = df['normalized_text'].notna()
quality_filter = df['is_valid_length'] & df['has_reasonable_text'] & has_text

# Independent copy so later changes never write back into `df`
cleaned_df = df.loc[quality_filter].copy()
print(f"\n✨ Final cleaned dataset: {len(cleaned_df)} reviews (from {len(df)} original)")

# === PREPARE FINAL DATASET ===
# Keep only the identifying columns plus the normalized text, exposed as `clean_text`
final_df = (
    cleaned_df
    .loc[:, ['review_id', 'place_name', 'author', 'rating', 'date', 'normalized_text']]
    .rename(columns={'normalized_text': 'clean_text'})
)

# Show some examples of the cleaning results.
# `final_df` was sliced from `cleaned_df`, so both hold the same rows in the same
# order and can be paired by position. Reading the raw text from `df` by position
# would pair a cleaned review with an unrelated raw row as soon as any earlier row
# had been filtered out.
print("\n🔍 Examples of cleaned reviews:")
for i in range(min(3, len(final_df))):
    raw_text = str(cleaned_df.iloc[i]['text'])
    clean_text = final_df.iloc[i]['clean_text']

    # Truncate long texts to a 100-character preview
    original = raw_text[:100] + "..." if len(raw_text) > 100 else raw_text
    cleaned = clean_text[:100] + "..." if len(clean_text) > 100 else clean_text

    print(f"\nExample {i+1}:")
    print(f"BEFORE: {original}")
    print(f"AFTER:  {cleaned}")

# === SAVE CLEANED DATA ===
# Write the final frame as UTF-8 CSV without the index column
output_path = '../duancntt/cleaned_reviews.csv'
final_df.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)
print(f"\n💾 Cleaned data saved to: {output_path}")

# === DATA QUALITY STATISTICS ===
print("\n📊 CLEANING SUMMARY:")
print(f"Original reviews: {len(df)}")
print(f"After cleaning: {len(final_df)}")
# An input file with no rows would make the percentage divide by zero; report 0.0% instead
removal_rate = (len(df) - len(final_df)) / len(df) * 100 if len(df) else 0.0
print(f"Removal rate: {removal_rate:.1f}%")

# Character counts of the cleaned text
print("\nReview length distribution:")
lengths = final_df['clean_text'].str.len()
print(f"Average length: {lengths.mean():.0f} characters")
print(f"Median length: {lengths.median():.0f} characters")
print(f"Min length: {lengths.min()}")
print(f"Max length: {lengths.max()}")

# Number of surviving reviews per rating value, in ascending rating order
print("\nRating distribution:")
print(final_df['rating'].value_counts().sort_index())

📊 Loading raw review data...
Original dataset shape: (2173, 6)

🔍 First few rows:
   review_id                            place_name       author  rating  \
0          1  Quán Ăn Huế O Xuân ở Quận 1, TP. HCM      Thao Le     1.2   
1          2  Quán Ăn Huế O Xuân ở Quận 1, TP. HCM  Thao Nguyen     6.0   
2          3  Quán Ăn Huế O Xuân ở Quận 1, TP. HCM  Thanh Trang     6.4   
3          4  Quán Ăn Huế O Xuân ở Quận 1, TP. HCM    MIMI TRẦN     8.0   
4          5  Quán Ăn Huế O Xuân ở Quận 1, TP. HCM        Quynh     6.0   

        date                                               text  
0  7/20/2022  1.2 Quán phục vụ kém, ông chủ ngồi thu tiền đu...  
1  1/16/2021  6.0 Hơi mắc!!! Phần bún bò không được 3 lát bò...  
2  7/22/2020  6.4 Ăn được Lướt Now thấy quán này có Deal giả...  
3  4/19/2020  8.0 Quán Ăn Huế O Xuân Ăn ở đây thì chỉ ăn các...  
4  2/23/2020  6.0 Món Huế Các loại bánh Huế ngon, nước mắm ổ...  

📏 Reviews with valid length: 2087/2173
📝 Reviews with reasonable text 