## Install libraries

In [1]:
# %%capture
# !pip install emoji regex pandas unicodedata fasttext

In [2]:
import pandas as pd
import re
import regex
import emoji
import unicodedata
import uuid
import json

In [3]:
# df = pd.read_csv("/kaggle/input/eureka-version-dataset/final_raw.csv")
# df = pd.read_csv("/kaggle/input/eureka-version-dataset/v2_raw.csv")
df = pd.read_csv("/kaggle/input/eureka-version-dataset/final_raw_new.csv")
selected_df = df[['summary', 'comment_raw', 'label']]
print("======== Head ========")
print(selected_df.head(5))
print("\n======== Tail ========")
print(selected_df.tail(5))

print("\n======== Shape ========")
print(selected_df.shape)

print("\n======== Info ========")
print(selected_df.info())

                                             summary  \
0  1. Nội dung sơ lược: Bài viết chỉ trích Phạm V...   
1  1. Nội dung sơ lược: Bài viết chỉ trích Phạm V...   
2  1. Nội dung sơ lược: Bài viết chỉ trích Phạm V...   
3  1. Nội dung sơ lược: Bài viết chỉ trích Phạm V...   
4  1. Nội dung sơ lược: Bài viết chỉ trích Phạm V...   

                                         comment_raw            label  
0  luận điệu của bọn phản động, sỏ lá, 3/// viết ...  KHONG_PHAN_DONG  
1  vậy ông bảo đại, ông diệm, ông thiệu là đảng v...  KHONG_PHAN_DONG  
2                              muôn đời của đám 3///  KHONG_PHAN_DONG  
3  già rồi mà đần vậy cháu ? cộng sản đánh mỹ, đá...  KHONG_PHAN_DONG  
4  đúng là 3/// xỏ lá, bác hồ mất nên các bác khó...  KHONG_PHAN_DONG  

                                                 summary  \
17646  1. Nội dung sơ lược: Câu chuyện ngụ ngôn về cu...   
17647  1. Nội dung sơ lược: Câu chuyện ngụ ngôn về cu...   
17648  1. Nội dung sơ lược: Câu chuyện ngụ ngôn về

## 1. Normalize Unicode (NFC) + lowercase

In [4]:
def normalize_unicode_lower(text):
    text = unicodedata.normalize('NFC', text)
    return text.lower()

## 2. Remove Emoji, links/HTML tags/mentions/hashtags/UI indicators

In [5]:
# emoji
EMOTICON_PATTERNS = [
    r":\)+",       # :), :)), :))), ...
    r":\(+",       # :(, :((, ...
    r":v+",        # :v, :vvv, ...
    r":V+",        # :V, :VV, ...
    r"=+\)+",      # =), =)), ...
    r"=+\(+",      # =(, =((, ...
    r":d+",        # :d, :dd
    r":p+",        # :p, :pp
    r"<3+",        # <3<3<3
    r"=+\]+",      # =], =]], =]]], ...
    r"=+\[+",      # =[, =[[, =[[[ ...
    r":>+",        # :>, :>>, ...
    r":<+",        # :<, :<<, ...
    r":\(\(",      # :((
    r"=\(\(",      # =((
]

EMOTICON_REGEX = re.compile("|".join(EMOTICON_PATTERNS), re.IGNORECASE)

EMOJI_REGEX = re.compile(
    "["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F700-\U0001F77F"
    u"\U0001F780-\U0001F7FF"
    u"\U0001F800-\U0001F8FF"
    u"\U0001F900-\U0001F9FF"
    u"\U0001FA00-\U0001FA6F"
    u"\U0001FA70-\U0001FAFF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251" 
    "]+", flags=re.UNICODE
)

def remove_emoji_emoticon(text):    
    try:
        text = emoji.replace_emoji(text, replace=" ")
    except:
        text = EMOJI_REGEX.sub(" ", text)
    
    # Xóa bằng regex
    text = EMOTICON_REGEX.sub(" ", text)
    
    # Loại bỏ khoảng trắng dư thừa
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# url, html, mention, hashtags, ui_indicators
URL_REGEX = re.compile(r'https?://\S+|www\.\S+|\S+\.(com|org|net|co|vn|io)(/\S*)?')
HTML_REGEX = re.compile(r"<[^>]+>")
MENTION_REGEX = re.compile(r"@[\w\._]+")
HASHTAG_REGEX = re.compile(r"#\w+")
UI_INDICATORS = [
    "đã chỉnh sửa", "[đã chỉnh sửa]", "(đã chỉnh sửa)",
    "see more", "xem thêm", "see translation", "xem bản dịch",
    "ẩn bớt", "xem ít hơn", "dịch", "translated", "more", "less",
    "see more reactions"
]

def remove_html_url_mention_hashtag(text):
    if not isinstance(text, str):
        return ""

    # Xóa URL
    text = URL_REGEX.sub(" ", text)
    
    # Xóa HTML tags
    text = HTML_REGEX.sub(" ", text)
    
    # Xóa mentions và hashtags
    text = MENTION_REGEX.sub(" ", text)
    text = HASHTAG_REGEX.sub(" ", text)
    
    # Xóa ui_indicators
    for ind in UI_INDICATORS:
        text = re.sub(r'(?i)' + re.escape(ind), " ", text)
    
    # Loại bỏ dấu câu riêng lẻ
    text = re.sub(r'(?<!\w)[\^\'\`\~\"\,\.]+(?!\w)', ' ', text)
    
    # Làm sạch khoảng trắng dư thừa
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

## 3. Reduce elongated characters

In [6]:
def reduce_elongated(text):
    if not isinstance(text, str):
        return ""
    pattern = regex.compile(r"([\p{L}])\1{2,}", flags=regex.IGNORECASE)
    return pattern.sub(r"\1\1", text)

## 4. Lexical Normalization

In [7]:
import json, re

# --- Load dict and build patterns ---
with open('/kaggle/input/dictionary/abbreviation_dictionary.json', encoding='utf-8') as f:
    norm_dict = json.load(f)

mixed, pure, words = [], [], []
for slang, std in norm_dict.items():
    esc = re.escape(slang)
    # classification
    if re.fullmatch(r"[^\w\s]+", slang):
        pure.append((esc, std))
    elif re.search(r"[^\w\s]", slang) and re.search(r"\w", slang):
        mixed.append((esc, std))
    else:
        # add \b for normal words
        words.append((rf"\b{esc}\b", std))

# sort desc
for lst in (mixed, pure, words):
    lst.sort(key=lambda x: -len(x[0].replace(r"\b","")))

# compile 
_patterns = [
    (re.compile(pat, flags=re.IGNORECASE), std)
    for pat, std in (mixed + pure + words)
]

def apply_lexical_normalization(text):
    if not isinstance(text, str):
        return text
    for regex, std in _patterns:
        text = regex.sub(std, text)
    return text

# 4.1. Debug lexical normalization 

In [8]:
# import json, re

# # --- Load dict and build patterns ---
# with open('/kaggle/input/dictionary/abbreviation_dictionary.json', encoding='utf-8') as f:
#     norm_dict = json.load(f)

# # --- Thêm tracking cho từ khóa chính trị nhạy cảm ---
# POLITICAL_KEYWORDS = [
#     'bò đỏ', 'ba que', 'phản động', 'dư luận viên',
#     'cộng sản', 'việt cộng', 'độc tài', 'đảng',
#     'việt nam cộng hòa', 'bắc kỳ', 'nam kỳ',
#     'trung quốc', 'tàu cộng', 'yêu nước', 'tự do', 'nhân quyền',
#     # Thêm các biến thể có thể có trong dictionary
#     'cs', 'vc', 'vnch', 'tc', 'pd', 'dlv'
# ]

# # Tracking dict để ghi lại các transformations
# political_transformations = {}
# for keyword in POLITICAL_KEYWORDS:
#     if keyword in norm_dict:
#         political_transformations[keyword] = norm_dict[keyword]

# print("=== POLITICAL KEYWORDS NORMALIZATION TRACKING ===")
# print(f"Total dictionary entries: {len(norm_dict):,}")
# print(f"Political keywords to track: {len(POLITICAL_KEYWORDS)}")
# print(f"Political keywords found in dictionary: {len(political_transformations)}")

# if political_transformations:
#     print("\nPolitical keyword transformations:")
#     for original, normalized in political_transformations.items():
#         print(f"  '{original}' → '{normalized}'")
# else:
#     print("\nNo political keywords found in normalization dictionary")

# # Kiểm tra các từ khóa có pattern tương tự
# similar_keys = []
# for key in norm_dict.keys():
#     for pol_word in POLITICAL_KEYWORDS:
#         if pol_word.lower() in key.lower() or key.lower() in pol_word.lower():
#             if key not in political_transformations:
#                 similar_keys.append((key, norm_dict[key]))

# if similar_keys:
#     print(f"\nSimilar/related entries found: {len(similar_keys)}")
#     for original, normalized in similar_keys[:10]:  # Hiển thị top 10
#         print(f"  '{original}' → '{normalized}'")
#     if len(similar_keys) > 10:
#         print(f"  ... and {len(similar_keys) - 10} more")

# mixed, pure, words = [], [], []
# for slang, std in norm_dict.items():
#     esc = re.escape(slang)
#     # classification
#     if re.fullmatch(r"[^\w\s]+", slang):
#         pure.append((esc, std))
#     elif re.search(r"[^\w\s]", slang) and re.search(r"\w", slang):
#         mixed.append((esc, std))
#     else:
#         # add \b for normal words
#         words.append((rf"\b{esc}\b", std))

# # sort desc
# for lst in (mixed, pure, words):
#     lst.sort(key=lambda x: -len(x[0].replace(r"\b","")))

# # compile 
# _patterns = [
#     (re.compile(pat, flags=re.IGNORECASE), std)
#     for pat, std in (mixed + pure + words)
# ]

# def apply_lexical_normalization(text):
#     if not isinstance(text, str):
#         return text
#     for regex, std in _patterns:
#         text = regex.sub(std, text)
#     return text

# print("=== PATTERN COMPILATION COMPLETE ===")
# print(f"Mixed patterns: {len(mixed)}")
# print(f"Pure punctuation patterns: {len(pure)}")
# print(f"Word patterns: {len(words)}")
# print(f"Total compiled patterns: {len(_patterns)}")
# print("="*60)

## 5. Remove punctuation

In [9]:
VIET_CHARACTERS = (
    "àáảãạăằắẳẵặâầấẩẫậ"
    "èéẻẽẹêềếểễệ"
    "ìíỉĩị"
    "òóỏõọôồốổỗộơờớởỡợ"
    "ùúủũụưừứửữự"
    "ỳýỷỹỵ"
    "đ"
)

def remove_punctuation(text):
    if not isinstance(text, str):
        return ""
    # Giữ chữ Việt + chữ Anh + số + space
    text = regex.sub(rf"[^{VIET_CHARACTERS}a-zA-Z0-9\s]+", " ", text)
    return regex.sub(r"\s+", " ", text).strip()

## 6. Whitespace Stripping 

In [10]:
def strip_extra_spaces(text):
    if not isinstance(text, str):
        return ""
    return regex.sub(r"\s+", " ", text).strip()

## 7. Deduplication 

In [11]:
def deduplicate_comments(df, col):
    before = len(df)
    df_nodup = df.drop_duplicates(subset=[col]).reset_index(drop=True)
    after = len(df_nodup)
    return df_nodup

In [12]:
selected_df = selected_df.copy()
original_count = len(selected_df)
print(f"Starting with {original_count:,} comments")

# 1. Unicode normalization + lowercase
print("1. Normalizing Unicode and converting to lowercase")
selected_df['comment_clean'] = selected_df['comment_raw'].apply(normalize_unicode_lower)

# 2. Remove Emoji, links/HTML/mentions/hashtags/UI indicators
print("2. Removing Remove Emoji,links/HTML/mentions/hashtags/UI indicators")
selected_df['comment_clean'] = selected_df['comment_clean'].apply(remove_emoji_emoticon)
selected_df['comment_clean'] = selected_df['comment_clean'].apply(remove_html_url_mention_hashtag)

# 3. Reduce elongated characters
print("3. Reducing elongated characters")
selected_df['comment_clean'] = selected_df['comment_clean'].apply(reduce_elongated)

# 4. Lexical normalization
# print("4. Applying lexical normalization")
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(apply_lexical_normalization)
print("4. Applying lexical normalization")
before = selected_df['comment_clean'].copy()
selected_df['comment_clean'] = selected_df['comment_clean'].apply(apply_lexical_normalization)

num_lines_changed = (before != selected_df['comment_clean']).sum()
percent_lines_changed = (num_lines_changed / len(selected_df)) * 100 if len(selected_df) else 0

print(f"  Số dòng đã chỉnh sửa: {num_lines_changed:,}/{len(selected_df):,} ({percent_lines_changed:.2f}%)")
# 5. Remove punctuation
print("5. Removing all punctuation")
selected_df['comment_clean'] = selected_df['comment_clean'].apply(remove_punctuation)

# 6. Whitespace Stripping
print("6. Whitespace Stripping")
selected_df['comment_clean'] = selected_df['comment_clean'].apply(strip_extra_spaces)

# 7. Deduplication
print("7. Removing duplicate comments")
before_dedup = len(selected_df)
selected_df = deduplicate_comments(selected_df, col='comment_clean')
after_dedup = len(selected_df)
print(f"  Removed {before_dedup - after_dedup:,} duplicate comments")

# Statistics Summary
print("\n" + "="*50)
final_count = len(selected_df)
total_reduction = original_count - final_count
retention_rate = (final_count / original_count) * 100
reduction_rate = (total_reduction / original_count) * 100

print(f"Original comments:      {original_count:,}")
print(f"After filtering:        {original_count:,} (100.0%)")
print(f"Final comments:         {final_count:,} ({retention_rate:.1f}%)")
print(f"Total reduction:        {total_reduction} comments ({reduction_rate:.1f}%)")

# Label Distribution
if 'label' in selected_df.columns:
    print(f"\n Label Distribution:")
    label_counts = selected_df['label'].value_counts().sort_index()
    
    for label, count in label_counts.items():
        percentage = (count / final_count) * 100
        print(f"  • {label:<18}: {count:,} ({percentage:.1f}%)")

print("="*50)


Starting with 17,651 comments
1. Normalizing Unicode and converting to lowercase
2. Removing Remove Emoji,links/HTML/mentions/hashtags/UI indicators
3. Reducing elongated characters
4. Applying lexical normalization
  Số dòng đã chỉnh sửa: 9,517/17,651 (53.92%)
5. Removing all punctuation
6. Whitespace Stripping
7. Removing duplicate comments
  Removed 350 duplicate comments

Original comments:      17,651
After filtering:        17,651 (100.0%)
Final comments:         17,301 (98.0%)
Total reduction:        350 comments (2.0%)

 Label Distribution:
  • KHONG_LIEN_QUAN   : 8,886 (51.4%)
  • KHONG_PHAN_DONG   : 6,202 (35.8%)
  • PHAN_DONG         : 2,213 (12.8%)


In [13]:
selected_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17301 entries, 0 to 17300
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   summary        17301 non-null  object
 1   comment_raw    17301 non-null  object
 2   label          17301 non-null  object
 3   comment_clean  17301 non-null  object
dtypes: object(4)
memory usage: 540.8+ KB


### DEBUG

In [14]:
# selected_df = selected_df.copy()
# original_count = len(selected_df)
# print(f"Starting with {original_count:,} comments")

# # 1. Unicode normalization + lowercase
# print("1. Normalizing Unicode and converting to lowercase")
# selected_df['comment_clean'] = selected_df['comment_raw'].apply(normalize_unicode_lower)

# # 2. Remove Emoji, links/HTML/mentions/hashtags/UI indicators
# print("2. Removing Remove Emoji,links/HTML/mentions/hashtags/UI indicators")
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(remove_emoji_emoticon)
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(remove_html_url_mention_hashtag)

# # 3. Reduce elongated characters
# print("3. Reducing elongated characters")
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(reduce_elongated)

# # 4. Lexical normalization với tracking chi tiết
# print("4. Applying lexical normalization")
# before = selected_df['comment_clean'].copy()
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(apply_lexical_normalization)

# num_lines_changed = (before != selected_df['comment_clean']).sum()
# percent_lines_changed = (num_lines_changed / len(selected_df)) * 100 if len(selected_df) else 0

# print(f"  Số dòng đã chỉnh sửa: {num_lines_changed:,}/{len(selected_df):,} ({percent_lines_changed:.2f}%)")

# # Thêm tracking cho political keywords
# print("\n  === POLITICAL KEYWORDS TRACKING ===")
# political_keyword_stats = {}

# for keyword in POLITICAL_KEYWORDS:
#     before_count = before.str.contains(keyword, case=False, na=False).sum()
#     after_count = selected_df['comment_clean'].str.contains(keyword, case=False, na=False).sum()
    
#     if before_count > 0 or after_count > 0:
#         political_keyword_stats[keyword] = {
#             'before': before_count,
#             'after': after_count,
#             'change': after_count - before_count
#         }

# if political_keyword_stats:
#     print(f"  Political keywords found in dataset:")
#     for keyword, stats in political_keyword_stats.items():
#         if stats['before'] > 0 or stats['after'] > 0:
#             change_sign = "+" if stats['change'] > 0 else ""
#             print(f"    '{keyword}': {stats['before']} → {stats['after']} ({change_sign}{stats['change']})")
# else:
#     print("  No tracked political keywords found in dataset")

# # Kiểm tra từ khóa được normalize thành gì
# if political_transformations:
#     print(f"\n  Checking normalized political keywords:")
#     for original, normalized in political_transformations.items():
#         normalized_count = selected_df['comment_clean'].str.contains(normalized, case=False, na=False).sum()
#         if normalized_count > 0:
#             print(f"    '{normalized}' (from '{original}'): {normalized_count} occurrences")

# print("  =" * 40)

# # 5. Remove punctuation
# print("5. Removing all punctuation")
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(remove_punctuation)

# # 6. Whitespace Stripping
# print("6. Whitespace Stripping")
# selected_df['comment_clean'] = selected_df['comment_clean'].apply(strip_extra_spaces)

# # 7. Deduplication
# print("7. Removing duplicate comments")
# before_dedup = len(selected_df)
# selected_df = deduplicate_comments(selected_df, col='comment_clean')
# after_dedup = len(selected_df)
# print(f"  Removed {before_dedup - after_dedup:,} duplicate comments")

# # Statistics Summary
# print("\n" + "="*50)
# final_count = len(selected_df)
# total_reduction = original_count - final_count
# retention_rate = (final_count / original_count) * 100
# reduction_rate = (total_reduction / original_count) * 100

# print(f"Original comments:      {original_count:,}")
# print(f"After filtering:        {original_count:,} (100.0%)")
# print(f"Final comments:         {final_count:,} ({retention_rate:.1f}%)")
# print(f"Total reduction:        {total_reduction} comments ({reduction_rate:.1f}%)")

# # Label Distribution
# if 'label' in selected_df.columns:
#     print(f"\n Label Distribution:")
#     label_counts = selected_df['label'].value_counts().sort_index()
    
#     for label, count in label_counts.items():
#         percentage = (count / final_count) * 100
#         print(f"  • {label:<18}: {count:,} ({percentage:.1f}%)")

# # Final political keywords summary với breakdown theo label
# print(f"\n=== FINAL POLITICAL KEYWORDS SUMMARY ===")
# final_political_stats = {}
# for keyword in POLITICAL_KEYWORDS:
#     count = selected_df['comment_clean'].str.contains(keyword, case=False, na=False).sum()
#     if count > 0:
#         final_political_stats[keyword] = count

# if final_political_stats:
#     print("Political keywords in final dataset:")
#     for keyword, count in sorted(final_political_stats.items(), key=lambda x: x[1], reverse=True):
#         percentage = (count / final_count) * 100
#         print(f"  '{keyword}': {count:,} occurrences ({percentage:.2f}%)")
        
#         # Breakdown by label
#         if 'label' in selected_df.columns:
#             for label in selected_df['label'].unique():
#                 label_count = selected_df[selected_df['label'] == label]['comment_clean'].str.contains(keyword, case=False, na=False).sum()
#                 if label_count > 0:
#                     label_total = len(selected_df[selected_df['label'] == label])
#                     label_percentage = (label_count / label_total) * 100
#                     print(f"    - {label}: {label_count:,} ({label_percentage:.2f}%)")
# else:
#     print("No political keywords found in final dataset")

# print("="*50)

In [15]:
# Export
output_path = "/kaggle/working/final_claened_new.csv"
selected_df.to_csv(output_path, index=False)
print(f"Cleaned comments exported to: {output_path}")

Cleaned comments exported to: /kaggle/working/final_claened_new.csv


In [16]:
# # rename comment_clean -> comment
# export_df = selected_df[['summary', 'comment_clean', 'label']] \
#     .rename(columns={'comment_clean': 'comment'})

# output_path = "/kaggle/working/v2_clean_1.csv"
# export_df.to_csv(output_path, index=False)

# print(f"Cleaned comments exported to: {output_path}")

In [17]:
# # test
# # DEBUG
# test_comments = [
#     "3///, 3\\\, 3|||, \|/ baq,podo,3#,mấy thg 3q ...	 😀😀",
#     "\\\\, 3que, parky, bake, 3#,",
#     "thèn dvm với hcm ngoo như nhau hết",
#     "d.m.c.s này ghê quáaaaaaaaaaa @user #hashtag",
#     "c.h.ế.t",
#     "cơm sườn ơi sẽn sèng chưa dm cs"
# ]

# print("="*80)
# print("DEBUG: TESTING PREPROCESSING PIPELINE")
# print("="*80)

# for i, text in enumerate(test_comments, 1):
#     print(f"\n--- TEST {i} ---")
#     print(f"Original: '{text}'")
    
#     # Step 1: Unicode + lowercase
#     step1 = normalize_unicode_lower(text)
#     print(f"Step 1:   '{step1}'")
    
#     # Step 2a: Remove emoji/emoticon
#     step2a = remove_emoji_emoticon(step1)
#     print(f"Step 2a:  '{step2a}'")
    
#     # Step 2b: Remove HTML/URL/mentions
#     step2b = remove_html_url_mention_hashtag(step2a)
#     print(f"Step 2b:  '{step2b}'")
    
#     # Step 3: Reduce elongated
#     step3 = reduce_elongated(step2b)
#     print(f"Step 3:   '{step3}'")
    
#     # Step 4: Lexical normalization
#     step4 = apply_lexical_normalization(step3)
#     print(f"Step 4:   '{step4}'")
    
#     # Step 5: Remove punctuation
#     step5 = remove_punctuation(step4)
#     print(f"Step 5:   '{step5}'")
    
#     # Step 6: Strip spaces
#     step6 = strip_extra_spaces(step5)
#     print(f"Final:    '{step6}'")
    
#     if step6 == "":
#         print("🚨 WARNING: Text became empty!")

# print("\n" + "="*80)
# print("Dictionary status:")
# print(f"Loaded entries: {len(norm_dict)}")
# if norm_dict:
#     print("Sample dictionary entries:")
#     for k, v in list(norm_dict.items())[:5]:
#         print(f"  '{k}' → '{v}'")
# print("="*80)