In [17]:
import pandas as pd

df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')
df_holdout = pd.read_csv('dataset/holdout.csv', delimiter=';')

df_all = pd.concat([df_train, df_test, df_holdout], ignore_index=True)

In [25]:
df_all.tail(40)

Unnamed: 0,comment,label
11633,gen z kebanyakan kena manipulatif subektisetya 😂,0
11634,ajak nael carry ragul,0
11635,menyala kaks,0
11636,best,0
11637,keren parah 𝐌𝐎𝐍𝐀𝟒𝐃 pengen nonton ulang 👏,1
11638,geruduk tambang ilegal geruduk perusahaan dll ...,0
11639,jos,0
11640,makswin mntp sih situsnya,1
11641,_awo_awo_awo_awoo_awoo_awoo,0
11642,bang lu adeknya timoty ronald ya 🗿🗿,0


In [26]:
import pandas as pd
import unicodedata
from collections import Counter
import re

class HomoglyphExtractor:
    """
    Ekstraksi karakter homoglyph dari dataset
    Mendeteksi karakter Unicode yang mirip huruf/angka normal tapi beda encoding
    """
    
    def __init__(self):
        # Range Unicode untuk karakter normal (baseline)
        self.normal_ranges = [
            (0x0020, 0x007E),  # Basic Latin (spasi sampai ~)
            (0x00A0, 0x00FF),  # Latin-1 Supplement
        ]
        
        # Range Unicode untuk emoji (akan di-exclude)
        self.emoji_ranges = [
            (0x1F600, 0x1F64F),  # Emoticons
            (0x1F300, 0x1F5FF),  # Symbols & Pictographs
            (0x1F680, 0x1F6FF),  # Transport & Map
            (0x1F1E0, 0x1F1FF),  # Flags
            (0x2600, 0x26FF),    # Miscellaneous Symbols
            (0x2700, 0x27BF),    # Dingbats
            (0xFE00, 0xFE0F),    # Variation Selectors
            (0x1F900, 0x1F9FF),  # Supplemental Symbols
            (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
        ]
        
        # Kategori Unicode yang sering digunakan untuk homoglyph
        self.suspicious_categories = [
            'Lm',  # Letter, Modifier
            'Sk',  # Symbol, Modifier
            'So',  # Symbol, Other
        ]
        
    def is_emoji(self, char):
        """Cek apakah karakter adalah emoji"""
        code_point = ord(char)
        for start, end in self.emoji_ranges:
            if start <= code_point <= end:
                return True
        return False
    
    def is_normal_char(self, char):
        """Cek apakah karakter adalah ASCII/Latin normal"""
        code_point = ord(char)
        for start, end in self.normal_ranges:
            if start <= code_point <= end:
                return True
        return False
    
    def is_homoglyph(self, char):
        """
        Deteksi homoglyph:
        - Bukan karakter normal
        - Bukan emoji
        - Bukan whitespace biasa
        - Adalah huruf/angka/simbol yang terlihat mirip normal
        """
        # Skip whitespace biasa
        if char in [' ', '\t', '\n', '\r']:
            return False
        
        # Skip emoji
        if self.is_emoji(char):
            return False
        
        # Skip karakter normal
        if self.is_normal_char(char):
            return False
        
        # Dapatkan kategori Unicode
        try:
            category = unicodedata.category(char)
            name = unicodedata.name(char, '')
            
            # Karakter yang terlihat seperti huruf/angka
            # Category: L* (Letter), N* (Number), atau simbol tertentu
            is_letter_like = category.startswith('L')
            is_number_like = category.startswith('N')
            is_suspicious_symbol = category in self.suspicious_categories
            
            # Kata kunci dalam nama Unicode yang mengindikasikan homoglyph
            homoglyph_keywords = [
                'MATHEMATICAL', 'BOLD', 'ITALIC', 'SCRIPT', 'FRAKTUR',
                'DOUBLE-STRUCK', 'SANS-SERIF', 'MONOSPACE', 
                'FULLWIDTH', 'HALFWIDTH', 'CIRCLED', 'PARENTHESIZED',
                'SQUARED', 'NEGATIVE', 'REGIONAL', 'TAG'
            ]
            
            has_homoglyph_keyword = any(keyword in name for keyword in homoglyph_keywords)
            
            return (is_letter_like or is_number_like or is_suspicious_symbol) and has_homoglyph_keyword
            
        except (ValueError, TypeError):
            # Jika tidak bisa mendapat info Unicode, anggap bukan homoglyph
            return False
    
    def extract_homoglyphs_from_text(self, text):
        """Ekstraksi semua homoglyph dari satu teks"""
        if pd.isna(text) or not isinstance(text, str):
            return []
        
        homoglyphs = []
        for char in text:
            if self.is_homoglyph(char):
                try:
                    name = unicodedata.name(char, 'UNKNOWN')
                    code_point = f"U+{ord(char):04X}"
                    homoglyphs.append({
                        'char': char,
                        'unicode_name': name,
                        'code_point': code_point,
                        'category': unicodedata.category(char)
                    })
                except:
                    pass
        
        return homoglyphs
    
    def analyze_dataset(self, df, text_column='comment'):
        """
        Analisis dataset dan ekstraksi semua homoglyph yang ditemukan
        
        Parameters:
        -----------
        df : pandas DataFrame
            Dataset dengan kolom teks
        text_column : str
            Nama kolom yang berisi teks (default: 'comment')
        
        Returns:
        --------
        dict : Hasil analisis lengkap
        """
        print(f"Menganalisis kolom '{text_column}'...")
        print(f"Total rows: {len(df)}")
        print("="*80)
        
        all_homoglyphs = []
        rows_with_homoglyphs = []
        
        # Iterasi setiap baris
        for idx, row in df.iterrows():
            text = row[text_column]
            homoglyphs = self.extract_homoglyphs_from_text(text)
            
            if homoglyphs:
                rows_with_homoglyphs.append({
                    'index': idx,
                    'text': text,
                    'homoglyphs': homoglyphs,
                    'homoglyph_count': len(homoglyphs)
                })
                all_homoglyphs.extend(homoglyphs)
        
        # Statistik homoglyph
        homoglyph_chars = [h['char'] for h in all_homoglyphs]
        homoglyph_counter = Counter(homoglyph_chars)
        
        # Buat mapping untuk normalisasi
        homoglyph_mapping = self._create_normalization_mapping(all_homoglyphs)
        
        results = {
            'total_rows': len(df),
            'rows_with_homoglyphs': len(rows_with_homoglyphs),
            'total_homoglyph_chars': len(all_homoglyphs),
            'unique_homoglyphs': len(homoglyph_counter),
            'homoglyph_frequency': homoglyph_counter,
            'detailed_rows': rows_with_homoglyphs,
            'normalization_mapping': homoglyph_mapping
        }
        
        return results
    
    def _create_normalization_mapping(self, homoglyphs):
        """
        Buat mapping otomatis dari homoglyph ke karakter normal
        Berdasarkan nama Unicode
        """
        mapping = {}
        
        for h in homoglyphs:
            char = h['char']
            name = h['unicode_name']
            
            if char in mapping:
                continue
            
            # Ekstraksi karakter normal dari nama Unicode
            # Contoh: "MATHEMATICAL BOLD CAPITAL A" -> "A"
            normal_char = self._extract_normal_char_from_name(name)
            if normal_char:
                mapping[char] = normal_char
        
        return mapping
    
    def _extract_normal_char_from_name(self, unicode_name):
        """Ekstraksi karakter normal dari nama Unicode"""
        # Pattern untuk huruf kapital
        if 'CAPITAL' in unicode_name or 'UPPER' in unicode_name:
            # Cari huruf A-Z di akhir nama
            match = re.search(r'\b([A-Z])\b', unicode_name[::-1])
            if match:
                return match.group(1)
        
        # Pattern untuk huruf kecil
        if 'SMALL' in unicode_name or 'LOWER' in unicode_name:
            # Map ke huruf kecil
            match = re.search(r'\b([A-Z])\b', unicode_name[::-1])
            if match:
                return match.group(1).lower()
        
        # Pattern untuk angka
        if 'DIGIT' in unicode_name:
            match = re.search(r'DIGIT (\w+)', unicode_name)
            if match:
                digit_name = match.group(1)
                digit_map = {
                    'ZERO': '0', 'ONE': '1', 'TWO': '2', 'THREE': '3',
                    'FOUR': '4', 'FIVE': '5', 'SIX': '6', 'SEVEN': '7',
                    'EIGHT': '8', 'NINE': '9'
                }
                return digit_map.get(digit_name)
        
        return None
    
    def print_summary(self, results):
        """Cetak ringkasan hasil analisis"""
        print("\n" + "="*80)
        print("RINGKASAN ANALISIS HOMOGLYPH")
        print("="*80)
        
        print(f"\n📊 Statistik:")
        print(f"  - Total baris dalam dataset: {results['total_rows']}")
        print(f"  - Baris yang mengandung homoglyph: {results['rows_with_homoglyphs']}")
        print(f"  - Persentase: {results['rows_with_homoglyphs']/results['total_rows']*100:.2f}%")
        print(f"  - Total karakter homoglyph ditemukan: {results['total_homoglyph_chars']}")
        print(f"  - Unique homoglyph characters: {results['unique_homoglyphs']}")
        
        print(f"\n🔤 Top 10 Homoglyph Paling Sering Muncul:")
        for char, count in results['homoglyph_frequency'].most_common(10):
            try:
                name = unicodedata.name(char, 'UNKNOWN')
                code = f"U+{ord(char):04X}"
                normal = results['normalization_mapping'].get(char, '?')
                print(f"  '{char}' → '{normal}'  |  {code}  |  {count}x  |  {name}")
            except:
                pass
        
        print(f"\n📝 Contoh Komentar dengan Homoglyph (5 pertama):")
        for i, row in enumerate(results['detailed_rows'][:5], 1):
            print(f"\n  [{i}] Index: {row['index']}")
            print(f"      Text: {row['text'][:100]}{'...' if len(row['text']) > 100 else ''}")
            print(f"      Homoglyphs found: {row['homoglyph_count']}")
            unique_chars = list(set([h['char'] for h in row['homoglyphs']]))
            print(f"      Characters: {', '.join(unique_chars)}")
    
    def export_mapping_code(self, results, output_file='homoglyph_mapping.py'):
        """
        Export mapping ke file Python yang bisa langsung digunakan
        """
        mapping = results['normalization_mapping']
        
        code = "# Auto-generated homoglyph mapping\n"
        code += "# Generated from dataset analysis\n\n"
        code += "HOMOGLYPH_MAP = {\n"
        
        for homo, normal in sorted(mapping.items()):
            try:
                name = unicodedata.name(homo, 'UNKNOWN')
                code += f"    '{homo}': '{normal}',  # {name}\n"
            except:
                code += f"    '{homo}': '{normal}',\n"
        
        code += "}\n\n"
        code += "def normalize_homoglyph(text):\n"
        code += "    \"\"\"Normalize homoglyph characters to normal ASCII\"\"\"\n"
        code += "    for homo, normal in HOMOGLYPH_MAP.items():\n"
        code += "        text = text.replace(homo, normal)\n"
        code += "    return text\n"
        
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(code)
        
        print(f"\n✅ Mapping code exported to: {output_file}")
        return code


In [27]:
extractor = HomoglyphExtractor()
    
# Analisis dataset
results = extractor.analyze_dataset(df_all, text_column='comment')

Menganalisis kolom 'comment'...
Total rows: 11673


In [28]:
extractor.print_summary(results)
    
# Export mapping ke file Python
mapping_code = extractor.export_mapping_code(results)


RINGKASAN ANALISIS HOMOGLYPH

📊 Statistik:
  - Total baris dalam dataset: 11673
  - Baris yang mengandung homoglyph: 927
  - Persentase: 7.94%
  - Total karakter homoglyph ditemukan: 5715
  - Unique homoglyph characters: 217

🔤 Top 10 Homoglyph Paling Sering Muncul:
  '𝟴' → '8'  |  U+1D7F4  |  345x  |  MATHEMATICAL SANS-SERIF BOLD DIGIT EIGHT
  '𝗧' → 'T'  |  U+1D5E7  |  209x  |  MATHEMATICAL SANS-SERIF BOLD CAPITAL T
  '𝐀' → 'A'  |  U+1D400  |  196x  |  MATHEMATICAL BOLD CAPITAL A
  '𝘼' → 'A'  |  U+1D63C  |  176x  |  MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL A
  '𝗢' → 'O'  |  U+1D5E2  |  176x  |  MATHEMATICAL SANS-SERIF BOLD CAPITAL O
  '𝟖' → '8'  |  U+1D7D6  |  168x  |  MATHEMATICAL BOLD DIGIT EIGHT
  '𝗨' → 'U'  |  U+1D5E8  |  140x  |  MATHEMATICAL SANS-SERIF BOLD CAPITAL U
  '𝐃' → 'D'  |  U+1D403  |  136x  |  MATHEMATICAL BOLD CAPITAL D
  '𝟩' → '7'  |  U+1D7E9  |  125x  |  MATHEMATICAL SANS-SERIF DIGIT SEVEN
  '𝘖' → 'O'  |  U+1D616  |  124x  |  MATHEMATICAL SANS-SERIF ITALIC CAPIT

In [40]:
df_all[df_all['label'] == 1].sample(60, ignore_index=True)['comment']

0                          ｈａｕ ｓ ｗ ｉｎ １３ ８_pasti jepe ❤
1           recomen kawan kusumat0t0 muncrat dana tuh 🚩
2     nggak nyangka 𝑴𝑨𝑵𝑫𝑨𝙇𝙄𝙆𝘼❼❼ ngasih solusi nggak ...
3     main 𝐃 𝐎 я a 𝟳 𝟩bikin optimis hasilnya nggak m...
4     maaf terlambat bang semangat ya salam 𝗪𝗘𝗧𝗢𝗡𝟴𝟴 ...
5                                                   777
6     main bentar eh langsung rezeki nomplok 𝘿 𝐄 𝙒 a...
7      web terbaik asia gak salah pilih weton88 gni 🚩 🚩
8                          wede malam makswin starlight
9               bukti layar 𝗣 𝗟 𝗨 𝗧 𝗢 𝟴 𝟴 tempatnya 🔥 🔥
10         keuntungan bikin ceria makasih 𝘼 e r 𝑂 8 𝟾 🛹
11                                jepey 𝗪𝗘𝗧𝗢𝗡𝟴𝟴 beneran
12    menyala kali kusumat0t0baru muter udah pecah j...
13                  wih 𝑴𝑨𝑵𝑫𝑨𝙇𝙄𝙆𝘼❼❼ booming ya bahas ❼❼
14                terima kasi rezekinya petarung138 ⚡ ⚡
15                       jepe pagi siang malem pulau777
16           𝗞𝗢𝗜𝗦𝗟𝗢𝗧 udah kayak rumah pencari bonus ✨ 🎉
17                  𝗣 𝗟 𝗨 𝗧 𝗢 𝟴 𝟴 janji real pro