In [15]:
import pandas as pd

df_train = pd.read_csv('https://raw.githubusercontent.com/nafhanugm/data-mining2/refs/heads/master/dataset/train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/nafhanugm/data-mining2/refs/heads/master/dataset/test.csv')
df_holdout = pd.read_csv('https://raw.githubusercontent.com/nafhanugm/data-mining2/refs/heads/master/dataset/holdout.csv', delimiter=';')

df_all = pd.concat([df_train, df_test, df_holdout], ignore_index=True)

In [3]:
df_all.tail(10)

Unnamed: 0,comment,label
10496,ok min weton88,1
10497,bang review movie blood brothers bara naga,0
10498,bang ni sombong,0
10499,queensavha udah bakar duit bro buzzernya gak h...,0
10500,destinasi yg kuinginkan,0
10501,sih tulisan gabung beranda chanel youtube lu s...,0
10502,bagus jga sih mainan gojo y sampe yg termurah ...,0
10503,anak2 indonesia pertumbuham badanya menurun ke...,0
10504,papua original asian nigga,0
10505,🤣🤣🤣🤣🤣,0


In [2]:
import pandas as pd
import unicodedata
from collections import Counter
import re

class HomoglyphExtractor:
    """
    Ekstraksi kar.akter homoglyph dari dataset
    Mendeteksi karakter Unicode yang mirip huruf/angka normal tapi beda encoding
    """

    def __init__(self):
        # Range Unicode untuk karakter normal (baseline)
        self.normal_ranges = [
            (0x0020, 0x007E),  # Basic Latin (spasi sampai ~)
            (0x00A0, 0x00FF),  # Latin-1 Supplement
        ]

        # Range Unicode untuk emoji (akan di-exclude)
        self.emoji_ranges = [
            (0x1F600, 0x1F64F),  # Emoticons
            (0x1F300, 0x1F5FF),  # Symbols & Pictographs
            (0x1F680, 0x1F6FF),  # Transport & Map
            (0x1F1E0, 0x1F1FF),  # Flags
            (0x2600, 0x26FF),    # Miscellaneous Symbols
            (0x2700, 0x27BF),    # Dingbats
            (0xFE00, 0xFE0F),    # Variation Selectors
            (0x1F900, 0x1F9FF),  # Supplemental Symbols
            (0x1FA70, 0x1FAFF),  # Symbols and Pictographs Extended-A
        ]

        # Kategori Unicode yang sering digunakan untuk homoglyph
        self.suspicious_categories = [
            'Lm',  # Letter, Modifier
            'Sk',  # Symbol, Modifier
            'So',  # Symbol, Other
        ]

    def is_emoji(self, char):
        """Cek apakah karakter adalah emoji"""
        code_point = ord(char)
        for start, end in self.emoji_ranges:
            if start <= code_point <= end:
                return True
        return False

    def is_normal_char(self, char):
        """Cek apakah karakter adalah ASCII/Latin normal"""
        code_point = ord(char)
        for start, end in self.normal_ranges:
            if start <= code_point <= end:
                return True
        return False

    def is_homoglyph(self, char):
        """
        Deteksi homoglyph:
        - Bukan karakter normal
        - Bukan emoji
        - Bukan whitespace biasa
        - Adalah huruf/angka/simbol yang terlihat mirip normal
        """
        # Skip whitespace biasa
        if char in [' ', '\t', '\n', '\r']:
            return False

        # Skip emoji
        if self.is_emoji(char):
            return False

        # Skip karakter normal
        if self.is_normal_char(char):
            return False

        # Dapatkan kategori Unicode
        try:
            category = unicodedata.category(char)
            name = unicodedata.name(char, '')

            # Karakter yang terlihat seperti huruf/angka
            # Category: L* (Letter), N* (Number), atau simbol tertentu
            is_letter_like = category.startswith('L')
            is_number_like = category.startswith('N')
            is_suspicious_symbol = category in self.suspicious_categories

            # Kata kunci dalam nama Unicode yang mengindikasikan homoglyph
            homoglyph_keywords = [
                'MATHEMATICAL', 'BOLD', 'ITALIC', 'SCRIPT', 'FRAKTUR',
                'DOUBLE-STRUCK', 'SANS-SERIF', 'MONOSPACE',
                'FULLWIDTH', 'HALFWIDTH', 'CIRCLED', 'PARENTHESIZED',
                'SQUARED', 'NEGATIVE', 'REGIONAL', 'TAG'
            ]

            has_homoglyph_keyword = any(keyword in name for keyword in homoglyph_keywords)

            return (is_letter_like or is_number_like or is_suspicious_symbol) and has_homoglyph_keyword

        except (ValueError, TypeError):
            # Jika tidak bisa mendapat info Unicode, anggap bukan homoglyph
            return False

    def extract_homoglyphs_from_text(self, text):
        """Ekstraksi semua homoglyph dari satu teks"""
        if pd.isna(text) or not isinstance(text, str):
            return []

        homoglyphs = []
        for char in text:
            if self.is_homoglyph(char):
                try:
                    name = unicodedata.name(char, 'UNKNOWN')
                    code_point = f"U+{ord(char):04X}"
                    homoglyphs.append({
                        'char': char,
                        'unicode_name': name,
                        'code_point': code_point,
                        'category': unicodedata.category(char)
                    })
                except:
                    pass

        return homoglyphs

    def analyze_dataset(self, df, text_column='comment'):
        """
        Analisis dataset dan ekstraksi semua homoglyph yang ditemukan

        Parameters:
        -----------
        df : pandas DataFrame
            Dataset dengan kolom teks
        text_column : str
            Nama kolom yang berisi teks (default: 'comment')

        Returns:
        --------
        dict : Hasil analisis lengkap
        """
        print(f"Menganalisis kolom '{text_column}'...")
        print(f"Total rows: {len(df)}")
        print("="*80)

        all_homoglyphs = []
        rows_with_homoglyphs = []

        # Iterasi setiap baris
        for idx, row in df.iterrows():
            text = row[text_column]
            homoglyphs = self.extract_homoglyphs_from_text(text)

            if homoglyphs:
                rows_with_homoglyphs.append({
                    'index': idx,
                    'text': text,
                    'homoglyphs': homoglyphs,
                    'homoglyph_count': len(homoglyphs)
                })
                all_homoglyphs.extend(homoglyphs)

        # Statistik homoglyph
        homoglyph_chars = [h['char'] for h in all_homoglyphs]
        homoglyph_counter = Counter(homoglyph_chars)

        # Buat mapping untuk normalisasi
        homoglyph_mapping = self._create_normalization_mapping(all_homoglyphs)

        results = {
            'total_rows': len(df),
            'rows_with_homoglyphs': len(rows_with_homoglyphs),
            'total_homoglyph_chars': len(all_homoglyphs),
            'unique_homoglyphs': len(homoglyph_counter),
            'homoglyph_frequency': homoglyph_counter,
            'detailed_rows': rows_with_homoglyphs,
            'normalization_mapping': homoglyph_mapping
        }

        return results

    def _create_normalization_mapping(self, homoglyphs):
        """
        Buat mapping otomatis dari homoglyph ke karakter normal
        Berdasarkan nama Unicode
        """
        mapping = {}

        for h in homoglyphs:
            char = h['char']
            name = h['unicode_name']

            if char in mapping:
                continue

            # Ekstraksi karakter normal dari nama Unicode
            # Contoh: "MATHEMATICAL BOLD CAPITAL A" -> "A"
            normal_char = self._extract_normal_char_from_name(name)
            if normal_char:
                mapping[char] = normal_char

        return mapping

    def _extract_normal_char_from_name(self, unicode_name):
        """Ekstraksi karakter normal dari nama Unicode"""
        # Pattern untuk huruf kapital
        if 'CAPITAL' in unicode_name or 'UPPER' in unicode_name:
            # Cari huruf A-Z di akhir nama
            match = re.search(r'\b([A-Z])\b', unicode_name[::-1])
            if match:
                return match.group(1)

        # Pattern untuk huruf kecil
        if 'SMALL' in unicode_name or 'LOWER' in unicode_name:
            # Map ke huruf kecil
            match = re.search(r'\b([A-Z])\b', unicode_name[::-1])
            if match:
                return match.group(1).lower()

        # Pattern untuk angka
        if 'DIGIT' in unicode_name:
            match = re.search(r'DIGIT (\w+)', unicode_name)
            if match:
                digit_name = match.group(1)
                digit_map = {
                    'ZERO': '0', 'ONE': '1', 'TWO': '2', 'THREE': '3',
                    'FOUR': '4', 'FIVE': '5', 'SIX': '6', 'SEVEN': '7',
                    'EIGHT': '8', 'NINE': '9'
                }
                return digit_map.get(digit_name)

        return None

    def print_summary(self, results):
        """Cetak ringkasan hasil analisis"""
        print("\n" + "="*80)
        print("RINGKASAN ANALISIS HOMOGLYPH")
        print("="*80)

        print(f"\n📊 Statistik:")
        print(f"  - Total baris dalam dataset: {results['total_rows']}")
        print(f"  - Baris yang mengandung homoglyph: {results['rows_with_homoglyphs']}")
        print(f"  - Persentase: {results['rows_with_homoglyphs']/results['total_rows']*100:.2f}%")
        print(f"  - Total karakter homoglyph ditemukan: {results['total_homoglyph_chars']}")
        print(f"  - Unique homoglyph characters: {results['unique_homoglyphs']}")

        print(f"\n🔤 Top 10 Homoglyph Paling Sering Muncul:")
        for char, count in results['homoglyph_frequency'].most_common(10):
            try:
                name = unicodedata.name(char, 'UNKNOWN')
                code = f"U+{ord(char):04X}"
                normal = results['normalization_mapping'].get(char, '?')
                print(f"  '{char}' → '{normal}'  |  {code}  |  {count}x  |  {name}")
            except:
                pass

        print(f"\n📝 Contoh Komentar dengan Homoglyph (5 pertama):")
        for i, row in enumerate(results['detailed_rows'][:5], 1):
            print(f"\n  [{i}] Index: {row['index']}")
            print(f"      Text: {row['text'][:100]}{'...' if len(row['text']) > 100 else ''}")
            print(f"      Homoglyphs found: {row['homoglyph_count']}")
            unique_chars = list(set([h['char'] for h in row['homoglyphs']]))
            print(f"      Characters: {', '.join(unique_chars)}")

    def export_mapping_code(self, results, output_file='homoglyph_mapping.py'):
        """
        Export mapping ke file Python yang bisa langsung digunakan
        """
        mapping = results['normalization_mapping']

        code = "# Auto-generated homoglyph mapping\n"
        code += "# Generated from dataset analysis\n\n"
        code += "HOMOGLYPH_MAP = {\n"

        for homo, normal in sorted(mapping.items()):
            try:
                name = unicodedata.name(homo, 'UNKNOWN')
                code += f"    '{homo}': '{normal}',  # {name}\n"
            except:
                code += f"    '{homo}': '{normal}',\n"

        code += "}\n\n"
        code += "def normalize_homoglyph(text):\n"
        code += "    \"\"\"Normalize homoglyph characters to normal ASCII\"\"\"\n"
        code += "    for homo, normal in HOMOGLYPH_MAP.items():\n"
        code += "        text = text.replace(homo, normal)\n"
        code += "    return text\n"

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(code)

        print(f"\n✅ Mapping code exported to: {output_file}")
        return code


In [3]:
extractor = HomoglyphExtractor()

# Analisis dataset
results = extractor.analyze_dataset(df_all, text_column='comment')

Menganalisis kolom 'comment'...
Total rows: 10506


In [4]:
extractor.print_summary(results)

# Export mapping ke file Python
mapping_code = extractor.export_mapping_code(results)


RINGKASAN ANALISIS HOMOGLYPH

📊 Statistik:
  - Total baris dalam dataset: 10506
  - Baris yang mengandung homoglyph: 801
  - Persentase: 7.62%
  - Total karakter homoglyph ditemukan: 4889
  - Unique homoglyph characters: 197

🔤 Top 10 Homoglyph Paling Sering Muncul:
  '𝟴' → '8'  |  U+1D7F4  |  307x  |  MATHEMATICAL SANS-SERIF BOLD DIGIT EIGHT
  '𝗧' → 'T'  |  U+1D5E7  |  194x  |  MATHEMATICAL SANS-SERIF BOLD CAPITAL T
  '𝘼' → 'A'  |  U+1D63C  |  174x  |  MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL A
  '𝐀' → 'A'  |  U+1D400  |  156x  |  MATHEMATICAL BOLD CAPITAL A
  '𝗢' → 'O'  |  U+1D5E2  |  146x  |  MATHEMATICAL SANS-SERIF BOLD CAPITAL O
  '𝗨' → 'U'  |  U+1D5E8  |  140x  |  MATHEMATICAL SANS-SERIF BOLD CAPITAL U
  '𝟩' → '7'  |  U+1D7E9  |  125x  |  MATHEMATICAL SANS-SERIF DIGIT SEVEN
  '𝘖' → 'O'  |  U+1D616  |  124x  |  MATHEMATICAL SANS-SERIF ITALIC CAPITAL O
  '𝟖' → '8'  |  U+1D7D6  |  117x  |  MATHEMATICAL BOLD DIGIT EIGHT
  '𝑂' → 'O'  |  U+1D442  |  111x  |  MATHEMATICAL ITALIC CAP

Homoglyph Helper

In [5]:
# Auto-generated homoglyph mapping
# Generated from dataset analysis

HOMOGLYPH_MAP = {
    'ℍ': 'H',  # DOUBLE-STRUCK CAPITAL H
    '１': '1',  # FULLWIDTH DIGIT ONE
    '３': '3',  # FULLWIDTH DIGIT THREE
    '８': '8',  # FULLWIDTH DIGIT EIGHT
    'ａ': 'a',  # FULLWIDTH LATIN SMALL LETTER A
    'ｄ': 'd',  # FULLWIDTH LATIN SMALL LETTER D
    'ｅ': 'e',  # FULLWIDTH LATIN SMALL LETTER E
    'ｈ': 'h',  # FULLWIDTH LATIN SMALL LETTER H
    'ｉ': 'i',  # FULLWIDTH LATIN SMALL LETTER I
    'ｊ': 'j',  # FULLWIDTH LATIN SMALL LETTER J
    'ｌ': 'l',  # FULLWIDTH LATIN SMALL LETTER L
    'ｎ': 'n',  # FULLWIDTH LATIN SMALL LETTER N
    'ｏ': 'o',  # FULLWIDTH LATIN SMALL LETTER O
    'ｐ': 'p',  # FULLWIDTH LATIN SMALL LETTER P
    'ｓ': 's',  # FULLWIDTH LATIN SMALL LETTER S
    'ｔ': 't',  # FULLWIDTH LATIN SMALL LETTER T
    'ｕ': 'u',  # FULLWIDTH LATIN SMALL LETTER U
    'ｗ': 'w',  # FULLWIDTH LATIN SMALL LETTER W
    '𝐀': 'A',  # MATHEMATICAL BOLD CAPITAL A
    '𝐁': 'B',  # MATHEMATICAL BOLD CAPITAL B
    '𝐃': 'D',  # MATHEMATICAL BOLD CAPITAL D
    '𝐄': 'E',  # MATHEMATICAL BOLD CAPITAL E
    '𝐆': 'G',  # MATHEMATICAL BOLD CAPITAL G
    '𝐇': 'H',  # MATHEMATICAL BOLD CAPITAL H
    '𝐈': 'I',  # MATHEMATICAL BOLD CAPITAL I
    '𝐊': 'K',  # MATHEMATICAL BOLD CAPITAL K
    '𝐋': 'L',  # MATHEMATICAL BOLD CAPITAL L
    '𝐌': 'M',  # MATHEMATICAL BOLD CAPITAL M
    '𝐍': 'N',  # MATHEMATICAL BOLD CAPITAL N
    '𝐎': 'O',  # MATHEMATICAL BOLD CAPITAL O
    '𝐑': 'R',  # MATHEMATICAL BOLD CAPITAL R
    '𝐒': 'S',  # MATHEMATICAL BOLD CAPITAL S
    '𝐓': 'T',  # MATHEMATICAL BOLD CAPITAL T
    '𝐔': 'U',  # MATHEMATICAL BOLD CAPITAL U
    '𝐕': 'V',  # MATHEMATICAL BOLD CAPITAL V
    '𝐖': 'W',  # MATHEMATICAL BOLD CAPITAL W
    '𝐗': 'X',  # MATHEMATICAL BOLD CAPITAL X
    '𝐘': 'Y',  # MATHEMATICAL BOLD CAPITAL Y
    '𝐚': 'a',  # MATHEMATICAL BOLD SMALL A
    '𝐛': 'b',  # MATHEMATICAL BOLD SMALL B
    '𝐝': 'd',  # MATHEMATICAL BOLD SMALL D
    '𝐞': 'e',  # MATHEMATICAL BOLD SMALL E
    '𝐠': 'g',  # MATHEMATICAL BOLD SMALL G
    '𝐢': 'i',  # MATHEMATICAL BOLD SMALL I
    '𝐣': 'j',  # MATHEMATICAL BOLD SMALL J
    '𝐥': 'l',  # MATHEMATICAL BOLD SMALL L
    '𝐧': 'n',  # MATHEMATICAL BOLD SMALL N
    '𝐨': 'o',  # MATHEMATICAL BOLD SMALL O
    '𝐫': 'r',  # MATHEMATICAL BOLD SMALL R
    '𝐬': 's',  # MATHEMATICAL BOLD SMALL S
    '𝐭': 't',  # MATHEMATICAL BOLD SMALL T
    '𝐲': 'y',  # MATHEMATICAL BOLD SMALL Y
    '𝐴': 'A',  # MATHEMATICAL ITALIC CAPITAL A
    '𝐷': 'D',  # MATHEMATICAL ITALIC CAPITAL D
    '𝐸': 'E',  # MATHEMATICAL ITALIC CAPITAL E
    '𝐺': 'G',  # MATHEMATICAL ITALIC CAPITAL G
    '𝐻': 'H',  # MATHEMATICAL ITALIC CAPITAL H
    '𝐼': 'I',  # MATHEMATICAL ITALIC CAPITAL I
    '𝐿': 'L',  # MATHEMATICAL ITALIC CAPITAL L
    '𝑀': 'M',  # MATHEMATICAL ITALIC CAPITAL M
    '𝑂': 'O',  # MATHEMATICAL ITALIC CAPITAL O
    '𝑅': 'R',  # MATHEMATICAL ITALIC CAPITAL R
    '𝑆': 'S',  # MATHEMATICAL ITALIC CAPITAL S
    '𝑇': 'T',  # MATHEMATICAL ITALIC CAPITAL T
    '𝑈': 'U',  # MATHEMATICAL ITALIC CAPITAL U
    '𝑊': 'W',  # MATHEMATICAL ITALIC CAPITAL W
    '𝑋': 'X',  # MATHEMATICAL ITALIC CAPITAL X
    '𝑨': 'A',  # MATHEMATICAL BOLD ITALIC CAPITAL A
    '𝑪': 'C',  # MATHEMATICAL BOLD ITALIC CAPITAL C
    '𝑫': 'D',  # MATHEMATICAL BOLD ITALIC CAPITAL D
    '𝑮': 'G',  # MATHEMATICAL BOLD ITALIC CAPITAL G
    '𝑰': 'I',  # MATHEMATICAL BOLD ITALIC CAPITAL I
    '𝑲': 'K',  # MATHEMATICAL BOLD ITALIC CAPITAL K
    '𝑳': 'L',  # MATHEMATICAL BOLD ITALIC CAPITAL L
    '𝑴': 'M',  # MATHEMATICAL BOLD ITALIC CAPITAL M
    '𝑵': 'N',  # MATHEMATICAL BOLD ITALIC CAPITAL N
    '𝑶': 'O',  # MATHEMATICAL BOLD ITALIC CAPITAL O
    '𝑺': 'S',  # MATHEMATICAL BOLD ITALIC CAPITAL S
    '𝑻': 'T',  # MATHEMATICAL BOLD ITALIC CAPITAL T
    '𝒀': 'Y',  # MATHEMATICAL BOLD ITALIC CAPITAL Y
    '𝒜': 'A',  # MATHEMATICAL SCRIPT CAPITAL A
    '𝒢': 'G',  # MATHEMATICAL SCRIPT CAPITAL G
    '𝒩': 'N',  # MATHEMATICAL SCRIPT CAPITAL N
    '𝒫': 'P',  # MATHEMATICAL SCRIPT CAPITAL P
    '𝒮': 'S',  # MATHEMATICAL SCRIPT CAPITAL S
    '𝒯': 'T',  # MATHEMATICAL SCRIPT CAPITAL T
    '𝒰': 'U',  # MATHEMATICAL SCRIPT CAPITAL U
    '𝓐': 'A',  # MATHEMATICAL BOLD SCRIPT CAPITAL A
    '𝓘': 'I',  # MATHEMATICAL BOLD SCRIPT CAPITAL I
    '𝓛': 'L',  # MATHEMATICAL BOLD SCRIPT CAPITAL L
    '𝓝': 'N',  # MATHEMATICAL BOLD SCRIPT CAPITAL N
    '𝓟': 'P',  # MATHEMATICAL BOLD SCRIPT CAPITAL P
    '𝓤': 'U',  # MATHEMATICAL BOLD SCRIPT CAPITAL U
    '𝓦': 'W',  # MATHEMATICAL BOLD SCRIPT CAPITAL W
    '𝕂': 'K',  # MATHEMATICAL DOUBLE-STRUCK CAPITAL K
    '𝕒': 'a',  # MATHEMATICAL DOUBLE-STRUCK SMALL A
    '𝕓': 'b',  # MATHEMATICAL DOUBLE-STRUCK SMALL B
    '𝕕': 'd',  # MATHEMATICAL DOUBLE-STRUCK SMALL D
    '𝕖': 'e',  # MATHEMATICAL DOUBLE-STRUCK SMALL E
    '𝕘': 'g',  # MATHEMATICAL DOUBLE-STRUCK SMALL G
    '𝕙': 'h',  # MATHEMATICAL DOUBLE-STRUCK SMALL H
    '𝕚': 'i',  # MATHEMATICAL DOUBLE-STRUCK SMALL I
    '𝕟': 'n',  # MATHEMATICAL DOUBLE-STRUCK SMALL N
    '𝕡': 'p',  # MATHEMATICAL DOUBLE-STRUCK SMALL P
    '𝕣': 'r',  # MATHEMATICAL DOUBLE-STRUCK SMALL R
    '𝕤': 's',  # MATHEMATICAL DOUBLE-STRUCK SMALL S
    '𝕦': 'u',  # MATHEMATICAL DOUBLE-STRUCK SMALL U
    '𝖪': 'K',  # MATHEMATICAL SANS-SERIF CAPITAL K
    '𝖺': 'a',  # MATHEMATICAL SANS-SERIF SMALL A
    '𝖾': 'e',  # MATHEMATICAL SANS-SERIF SMALL E
    '𝗀': 'g',  # MATHEMATICAL SANS-SERIF SMALL G
    '𝗁': 'h',  # MATHEMATICAL SANS-SERIF SMALL H
    '𝗂': 'i',  # MATHEMATICAL SANS-SERIF SMALL I
    '𝗃': 'j',  # MATHEMATICAL SANS-SERIF SMALL J
    '𝗄': 'k',  # MATHEMATICAL SANS-SERIF SMALL K
    '𝗅': 'l',  # MATHEMATICAL SANS-SERIF SMALL L
    '𝗆': 'm',  # MATHEMATICAL SANS-SERIF SMALL M
    '𝗇': 'n',  # MATHEMATICAL SANS-SERIF SMALL N
    '𝗈': 'o',  # MATHEMATICAL SANS-SERIF SMALL O
    '𝗍': 't',  # MATHEMATICAL SANS-SERIF SMALL T
    '𝗎': 'u',  # MATHEMATICAL SANS-SERIF SMALL U
    '𝗒': 'y',  # MATHEMATICAL SANS-SERIF SMALL Y
    '𝗔': 'A',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL A
    '𝗕': 'B',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL B
    '𝗖': 'C',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL C
    '𝗗': 'D',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL D
    '𝗘': 'E',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL E
    '𝗚': 'G',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL G
    '𝗛': 'H',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL H
    '𝗜': 'I',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL I
    '𝗝': 'J',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL J
    '𝗞': 'K',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL K
    '𝗟': 'L',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL L
    '𝗠': 'M',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL M
    '𝗡': 'N',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL N
    '𝗢': 'O',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL O
    '𝗣': 'P',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL P
    '𝗥': 'R',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL R
    '𝗦': 'S',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL S
    '𝗧': 'T',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL T
    '𝗨': 'U',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL U
    '𝗪': 'W',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL W
    '𝗬': 'Y',  # MATHEMATICAL SANS-SERIF BOLD CAPITAL Y
    '𝗮': 'a',  # MATHEMATICAL SANS-SERIF BOLD SMALL A
    '𝗲': 'e',  # MATHEMATICAL SANS-SERIF BOLD SMALL E
    '𝗴': 'g',  # MATHEMATICAL SANS-SERIF BOLD SMALL G
    '𝗶': 'i',  # MATHEMATICAL SANS-SERIF BOLD SMALL I
    '𝗹': 'l',  # MATHEMATICAL SANS-SERIF BOLD SMALL L
    '𝗺': 'm',  # MATHEMATICAL SANS-SERIF BOLD SMALL M
    '𝗻': 'n',  # MATHEMATICAL SANS-SERIF BOLD SMALL N
    '𝗼': 'o',  # MATHEMATICAL SANS-SERIF BOLD SMALL O
    '𝗿': 'r',  # MATHEMATICAL SANS-SERIF BOLD SMALL R
    '𝘀': 's',  # MATHEMATICAL SANS-SERIF BOLD SMALL S
    '𝘂': 'u',  # MATHEMATICAL SANS-SERIF BOLD SMALL U
    '𝘄': 'w',  # MATHEMATICAL SANS-SERIF BOLD SMALL W
    '𝘆': 'y',  # MATHEMATICAL SANS-SERIF BOLD SMALL Y
    '𝘈': 'A',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL A
    '𝘋': 'D',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL D
    '𝘌': 'E',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL E
    '𝘎': 'G',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL G
    '𝘏': 'H',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL H
    '𝘓': 'L',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL L
    '𝘔': 'M',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL M
    '𝘖': 'O',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL O
    '𝘙': 'R',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL R
    '𝘚': 'S',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL S
    '𝘛': 'T',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL T
    '𝘜': 'U',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL U
    '𝘞': 'W',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL W
    '𝘟': 'X',  # MATHEMATICAL SANS-SERIF ITALIC CAPITAL X
    '𝘦': 'e',  # MATHEMATICAL SANS-SERIF ITALIC SMALL E
    '𝘬': 'k',  # MATHEMATICAL SANS-SERIF ITALIC SMALL K
    '𝘶': 'u',  # MATHEMATICAL SANS-SERIF ITALIC SMALL U
    '𝘼': 'A',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL A
    '𝘽': 'B',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL B
    '𝘿': 'D',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL D
    '𝙀': 'E',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL E
    '𝙁': 'F',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL F
    '𝙂': 'G',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL G
    '𝙃': 'H',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL H
    '𝙄': 'I',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL I
    '𝙆': 'K',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL K
    '𝙇': 'L',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL L
    '𝙈': 'M',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL M
    '𝙉': 'N',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL N
    '𝙊': 'O',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL O
    '𝙋': 'P',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL P
    '𝙍': 'R',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL R
    '𝙎': 'S',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL S
    '𝙏': 'T',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL T
    '𝙐': 'U',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL U
    '𝙒': 'W',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL W
    '𝙓': 'X',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL X
    '𝙔': 'Y',  # MATHEMATICAL SANS-SERIF BOLD ITALIC CAPITAL Y
    '𝙶': 'G',  # MATHEMATICAL MONOSPACE CAPITAL G
    '𝚊': 'a',  # MATHEMATICAL MONOSPACE SMALL A
    '𝚋': 'b',  # MATHEMATICAL MONOSPACE SMALL B
    '𝚝': 't',  # MATHEMATICAL MONOSPACE SMALL T
    '𝚞': 'u',  # MATHEMATICAL MONOSPACE SMALL U
    '𝟎': '0',  # MATHEMATICAL BOLD DIGIT ZERO
    '𝟏': '1',  # MATHEMATICAL BOLD DIGIT ONE
    '𝟐': '2',  # MATHEMATICAL BOLD DIGIT TWO
    '𝟑': '3',  # MATHEMATICAL BOLD DIGIT THREE
    '𝟒': '4',  # MATHEMATICAL BOLD DIGIT FOUR
    '𝟕': '7',  # MATHEMATICAL BOLD DIGIT SEVEN
    '𝟖': '8',  # MATHEMATICAL BOLD DIGIT EIGHT
    '𝟩': '7',  # MATHEMATICAL SANS-SERIF DIGIT SEVEN
    '𝟪': '8',  # MATHEMATICAL SANS-SERIF DIGIT EIGHT
    '𝟮': '2',  # MATHEMATICAL SANS-SERIF BOLD DIGIT TWO
    '𝟳': '7',  # MATHEMATICAL SANS-SERIF BOLD DIGIT SEVEN
    '𝟴': '8',  # MATHEMATICAL SANS-SERIF BOLD DIGIT EIGHT
    '𝟵': '9',  # MATHEMATICAL SANS-SERIF BOLD DIGIT NINE
    '𝟽': '7',  # MATHEMATICAL MONOSPACE DIGIT SEVEN
    '𝟾': '8',  # MATHEMATICAL MONOSPACE DIGIT EIGHT
}

def normalize_homoglyph(text):
    """Normalize homoglyph characters to normal ASCII"""
    for homo, normal in HOMOGLYPH_MAP.items():
        text = text.replace(homo, normal)
    return text


In [9]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import re
import unicodedata
from gensim.models import Word2Vec, FastText
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings('ignore')

# ============================================================
# PREPROCESSING & FEATURE ENGINEERING
# ============================================================

class TextPreprocessor:
    """Preprocessing untuk menangani homoglyph dan variasi Unicode"""
    def normalize_homoglyph(self, text):
        """Konversi homoglyph Unicode ke karakter normal"""
        for homo, normal in HOMOGLYPH_MAP.items():
            text = text.replace(homo, normal)
        return text

    def normalize_unicode(self, text):
        """Normalisasi Unicode menggunakan NFKD"""
        return unicodedata.normalize('NFKD', text)

    def remove_extra_spaces(self, text):
        """Hapus spasi berlebih"""
        return re.sub(r'\s+', ' ', text).strip()

    def preprocess(self, text):
        """Pipeline preprocessing lengkap"""
        text = str(text).lower()
        text = self.normalize_homoglyph(text)
        text = self.normalize_unicode(text)
        text = self.remove_extra_spaces(text)
        return text


class AdditionalFeatures:
    """Ekstraksi fitur tambahan untuk deteksi spam"""

    def count_emoji(self, text):
        """Hitung jumlah emoji"""
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        return len(emoji_pattern.findall(text))

    def capital_ratio(self, text):
        """Rasio huruf kapital"""
        if len(text) == 0:
            return 0
        return sum(1 for c in text if c.isupper()) / len(text)

    def has_numbers_in_word(self, text):
        """Deteksi angka dalam kata (SLOT88, PLUTO88)"""
        pattern = r'[a-zA-Z]+\d+|\d+[a-zA-Z]+'
        return len(re.findall(pattern, text))

    def excessive_spacing(self, text):
        """Deteksi spasi berlebih antar karakter"""
        pattern = r'(\w\s){3,}'
        return len(re.findall(pattern, text))

    def extract_features(self, texts):
        """Ekstraksi semua fitur"""
        features = []
        for text in texts:
            features.append([
                self.count_emoji(text),
                self.capital_ratio(text),
                self.has_numbers_in_word(text),
                self.excessive_spacing(text)
            ])
        return np.array(features)


class AdditionalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Transformer untuk fitur tambahan"""

    def __init__(self):
        self.feature_extractor = AdditionalFeatures()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return self.feature_extractor.extract_features(X)


# ============================================================
# WORD2VEC & FASTTEXT TRANSFORMERS
# ============================================================

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Transformer untuk Word2Vec embedding"""

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, sg=0):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg  # 0=CBOW, 1=Skip-gram
        self.model = None
        self.preprocessor = TextPreprocessor()

    def fit(self, X, y=None):
        """Train Word2Vec model"""
        # Tokenize sentences
        sentences = [self.preprocessor.preprocess(text).split() for text in X]

        # Train Word2Vec
        self.model = Word2Vec(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg
        )
        return self

    def transform(self, X):
        """Transform texts to averaged word vectors"""
        vectors = []
        for text in X:
            text = self.preprocessor.preprocess(text)
            words = text.split()

            # Get vectors for words in vocabulary
            word_vectors = [
                self.model.wv[word] for word in words
                if word in self.model.wv
            ]

            # Average word vectors
            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
            else:
                vectors.append(np.zeros(self.vector_size))

        return np.array(vectors)


class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Transformer untuk FastText embedding"""

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, sg=0):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg  # 0=CBOW, 1=Skip-gram
        self.model = None
        self.preprocessor = TextPreprocessor()

    def fit(self, X, y=None):
        """Train FastText model"""
        sentences = [self.preprocessor.preprocess(text).split() for text in X]

        self.model = FastText(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg
        )
        return self

    def transform(self, X):
        """Transform texts to averaged word vectors"""
        vectors = []
        for text in X:
            text = self.preprocessor.preprocess(text)
            words = text.split()

            # FastText can handle OOV words
            word_vectors = [self.model.wv[word] for word in words if words]

            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
            else:
                vectors.append(np.zeros(self.vector_size))

        return np.array(vectors)


# ============================================================
# MODEL CONFIGURATIONS
# ============================================================

def get_classifiers():
    """
    Konfigurasi semua classifier yang tersedia
    Returns dict: {nama_model: instance_model}
    """
    classifiers = {
        # Linear Models
        'logistic_regression': LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=42,
            solver='lbfgs'
        ),

        # Tree-based Models
        'random_forest': RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ),

        'decision_tree': DecisionTreeClassifier(
            max_depth=20,
            class_weight='balanced',
            random_state=42
        ),

        'gradient_boosting': GradientBoostingClassifier(
            n_estimators=100,
            max_depth=5,
            random_state=42
        ),

        'xgboost': XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            eval_metric='logloss'
        ),

        'lightgbm': LGBMClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            verbose=-1
        ),

        # SVM
        'svm_linear': SVC(
            kernel='linear',
            class_weight='balanced',
            random_state=42,
            probability=True
        ),

        'svm_rbf': SVC(
            kernel='rbf',
            class_weight='balanced',
            random_state=42,
            probability=True
        ),

        # Naive Bayes
        'naive_bayes': MultinomialNB(alpha=1.0),

        # KNN
        'knn': KNeighborsClassifier(
            n_neighbors=5,
            weights='distance',
            n_jobs=-1
        )
    }

    return classifiers


def get_vectorizers():
    """
    Konfigurasi semua vectorizer yang tersedia
    Returns dict: {nama_vectorizer: config}
    """
    preprocessor = TextPreprocessor()

    vectorizers = {
        # TF-IDF variants
        'tfidf_char': TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        'tfidf_word': TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 2),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        'tfidf_char_wb': TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=(2, 5),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        # Count Vectorizer
        'count_char': CountVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        'count_word': CountVectorizer(
            analyzer='word',
            ngram_range=(1, 2),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        # Hashing Vectorizer (memory efficient)
        'hashing_char': HashingVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            n_features=2**16,
            preprocessor=preprocessor.preprocess
        ),

        # Word2Vec
        'word2vec_cbow': Word2VecTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=0  # CBOW
        ),

        'word2vec_skipgram': Word2VecTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=1  # Skip-gram
        ),

        # FastText
        'fasttext_cbow': FastTextTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=0  # CBOW
        ),

        'fasttext_skipgram': FastTextTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=1  # Skip-gram
        ),

        # Hybrid combinations
        'hybrid_word_char': FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                analyzer='word',
                ngram_range=(1, 2),
                max_features=3000,
                preprocessor=preprocessor.preprocess
            )),
            ('char_tfidf', TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                max_features=3000,
                preprocessor=preprocessor.preprocess
            ))
        ]),

        'hybrid_all_features': FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                analyzer='word',
                ngram_range=(1, 2),
                max_features=2000,
                preprocessor=preprocessor.preprocess
            )),
            ('char_tfidf', TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                max_features=2000,
                preprocessor=preprocessor.preprocess
            )),
            ('additional', AdditionalFeaturesTransformer())
        ])
    }

    return vectorizers


# ============================================================
# PIPELINE BUILDER
# ============================================================

def create_custom_pipeline(vectorizer_name, classifier_name):
    """
    Buat pipeline custom dengan kombinasi vectorizer dan classifier

    Parameters:
    -----------
    vectorizer_name : str
        Nama vectorizer dari get_vectorizers()
    classifier_name : str
        Nama classifier dari get_classifiers()

    Returns:
    --------
    Pipeline object
    """
    vectorizers = get_vectorizers()
    classifiers = get_classifiers()

    if vectorizer_name not in vectorizers:
        raise ValueError(f"Vectorizer '{vectorizer_name}' tidak tersedia. "
                        f"Pilihan: {list(vectorizers.keys())}")

    if classifier_name not in classifiers:
        raise ValueError(f"Classifier '{classifier_name}' tidak tersedia. "
                        f"Pilihan: {list(classifiers.keys())}")

    pipeline = Pipeline([
        ('vectorizer', vectorizers[vectorizer_name]),
        ('classifier', classifiers[classifier_name])
    ])

    return pipeline


# ============================================================
# TRAINING & EVALUATION
# ============================================================

def train_and_evaluate(X, y, pipeline, pipeline_name):
    """Training dan evaluasi model"""
    print(f"\n{'='*60}")
    print(f"EVALUASI: {pipeline_name}")
    print(f"{'='*60}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Training
    print("Training model...")
    pipeline.fit(X_train, y_train)

    # Prediction
    y_pred = pipeline.predict(X_test)

    # Evaluation
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred,
                                target_names=['Non-Judi', 'Judi']))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Cross-validation
    print("\nPerforming cross-validation...")
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
    print(f"Cross-Validation F1-Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    return pipeline, {
        'accuracy': accuracy,
        'f1_score': f1,
        'cv_f1_mean': cv_scores.mean(),
        'cv_f1_std': cv_scores.std()
    }


def compare_multiple_models(X, y, vectorizer_configs, classifier_configs):
    """
    Bandingkan multiple kombinasi vectorizer dan classifier

    Parameters:
    -----------
    X : array-like
        Text data
    y : array-like
        Labels
    vectorizer_configs : list of str
        List nama vectorizer yang ingin dicoba
    classifier_configs : list of str
        List nama classifier yang ingin dicoba

    Returns:
    --------
    DataFrame dengan hasil perbandingan
    """
    results = []

    total_experiments = len(vectorizer_configs) * len(classifier_configs)
    experiment_num = 0

    print(f"\n{'='*60}")
    print(f"MEMULAI PERBANDINGAN {total_experiments} KOMBINASI MODEL")
    print(f"{'='*60}\n")

    for vec_name in vectorizer_configs:
        for clf_name in classifier_configs:
            experiment_num += 1
            print(f"\n[{experiment_num}/{total_experiments}] Testing: {vec_name} + {clf_name}")

            try:
                # Create pipeline
                pipeline = create_custom_pipeline(vec_name, clf_name)

                # Train and evaluate
                model, metrics = train_and_evaluate(
                    X, y, pipeline,
                    f"{vec_name} + {clf_name}"
                )

                # Store results
                results.append({
                    'vectorizer': vec_name,
                    'classifier': clf_name,
                    'accuracy': metrics['accuracy'],
                    'f1_score': metrics['f1_score'],
                    'cv_f1_mean': metrics['cv_f1_mean'],
                    'cv_f1_std': metrics['cv_f1_std']
                })

            except Exception as e:
                print(f"❌ Error: {str(e)}")
                results.append({
                    'vectorizer': vec_name,
                    'classifier': clf_name,
                    'accuracy': 0,
                    'f1_score': 0,
                    'cv_f1_mean': 0,
                    'cv_f1_std': 0,
                    'error': str(e)
                })

    # Create results dataframe
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('f1_score', ascending=False)

    return results_df


# ============================================================
# QUICK PRESETS
# ============================================================

def create_pipeline_char_tfidf():
    """Pipeline 1: Character-Level TF-IDF (REKOMENDASI UTAMA)"""
    return create_custom_pipeline('tfidf_char', 'logistic_regression')


def create_pipeline_hybrid():
    """Pipeline 2: Hybrid (Word + Char)"""
    return create_custom_pipeline('hybrid_word_char', 'logistic_regression')


def create_pipeline_advanced():
    """Pipeline 3: Advanced (All Features)"""
    return create_custom_pipeline('hybrid_all_features', 'random_forest')



In [7]:
X = df_all['comment'].values
y = df_all['label'].values

print("Dataset shape:", X.shape)
print("Label distribution:", np.bincount(y))

Dataset shape: (10506,)
Label distribution: [9580  926]


In [11]:

# ============================================================
# OPSI 1: Test Pipeline Quick Presets
# ============================================================
print("\n" + "="*60)
print("OPSI 1: QUICK PRESETS")
print("="*60)

pipeline1 = create_pipeline_char_tfidf()
model1, metrics1 = train_and_evaluate(X, y, pipeline1, "Character TF-IDF")

# ============================================================
# OPSI 2: Test Single Custom Pipeline
# ============================================================
print("\n" + "="*60)
print("OPSI 2: CUSTOM PIPELINE")
print("="*60)

# Contoh: FastText + Random Forest
custom_pipeline = create_custom_pipeline('fasttext_cbow', 'random_forest')
model_custom, metrics_custom = train_and_evaluate(
    X, y, custom_pipeline,
    "FastText CBOW + Random Forest"
)

# ============================================================
# OPSI 3: Compare Multiple Models
# ============================================================
print("\n" + "="*60)
print("OPSI 3: PERBANDINGAN MULTIPLE MODELS")
print("="*60)

# Pilih vectorizer dan classifier yang ingin dibandingkan
vectorizers_to_test = [
    'tfidf_char',
    'tfidf_word',
    'hybrid_word_char',
    'fasttext_cbow',
    'word2vec_cbow'
]

classifiers_to_test = [
    'logistic_regression',
    'random_forest',
    'xgboost',
    'svm_linear',
    'svm_rbf',
    'naive_bayes',
    'knn',
    'lightgbm',
    'gradient_boosting'
]

# Jalankan perbandingan
comparison_results = compare_multiple_models(
    X, y,
    vectorizers_to_test,
    classifiers_to_test
)

    # Tampilkan hasil
print("\n" + "="*80)
print("HASIL PERBANDINGAN MODEL (Sorted by F1-Score)")
print("="*80)
print(comparison_results.to_string(index=False))

    # Top 5 models
print("\n" + "="*80)
print("TOP 5 BEST MODELS")
print("="*80)
print(comparison_results.head().to_string(index=False))

    # ============================================================
    # OPSI 4: List Available Configurations
    # ============================================================
print("\n" + "="*60)
print("AVAILABLE CONFIGURATIONS")
print("="*60)

print("\n📊 Available Vectorizers:")
for i, vec in enumerate(get_vectorizers().keys(), 1):
    print(f"  {i}. {vec}")

print("\n🤖 Available Classifiers:")
for i, clf in enumerate(get_classifiers().keys(), 1):
    print(f"  {i}. {clf}")


OPSI 1: QUICK PRESETS

EVALUASI: Character TF-IDF
Training model...

Classification Report:
              precision    recall  f1-score   support

    Non-Judi       0.99      0.99      0.99      2105
        Judi       0.94      0.93      0.94       230

    accuracy                           0.99      2335
   macro avg       0.97      0.96      0.96      2335
weighted avg       0.99      0.99      0.99      2335


Confusion Matrix:
[[2091   14]
 [  15  215]]

Accuracy: 0.9876
F1-Score: 0.9368

Performing cross-validation...
Cross-Validation F1-Score: 0.8888 (+/- 0.1115)

OPSI 2: CUSTOM PIPELINE

EVALUASI: FastText CBOW + Random Forest
Training model...

Classification Report:
              precision    recall  f1-score   support

    Non-Judi       0.96      0.99      0.97      2105
        Judi       0.85      0.62      0.71       230

    accuracy                           0.95      2335
   macro avg       0.90      0.80      0.84      2335
weighted avg       0.95      0.95      0

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
import re
import unicodedata
from gensim.models import Word2Vec, FastText
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
import time
warnings.filterwarnings('ignore')

# ============================================================
# PREPROCESSING & FEATURE ENGINEERING
# ============================================================

class TextPreprocessor:
    """Preprocessing untuk menangani homoglyph dan variasi Unicode"""
    def normalize_homoglyph(self, text):
        """Konversi homoglyph Unicode ke karakter normal"""
        for homo, normal in HOMOGLYPH_MAP.items():
            text = text.replace(homo, normal)
        return text

    def normalize_unicode(self, text):
        """Normalisasi Unicode menggunakan NFKD"""
        return unicodedata.normalize('NFKD', text)

    def remove_extra_spaces(self, text):
        """Hapus spasi berlebih"""
        return re.sub(r'\s+', ' ', text).strip()

    def preprocess(self, text):
        """Pipeline preprocessing lengkap"""
        text = str(text).lower()
        text = self.normalize_homoglyph(text)
        text = self.normalize_unicode(text)
        text = self.remove_extra_spaces(text)
        return text


class AdditionalFeatures:
    """Ekstraksi fitur tambahan untuk deteksi spam"""

    def count_emoji(self, text):
        """Hitung jumlah emoji"""
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
        return len(emoji_pattern.findall(text))

    def capital_ratio(self, text):
        """Rasio huruf kapital"""
        if len(text) == 0:
            return 0
        return sum(1 for c in text if c.isupper()) / len(text)

    def has_numbers_in_word(self, text):
        """Deteksi angka dalam kata (SLOT88, PLUTO88)"""
        pattern = r'[a-zA-Z]+\d+|\d+[a-zA-Z]+'
        return len(re.findall(pattern, text))

    def excessive_spacing(self, text):
        """Deteksi spasi berlebih antar karakter"""
        pattern = r'(\w\s){3,}'
        return len(re.findall(pattern, text))

    def extract_features(self, texts):
        """Ekstraksi semua fitur"""
        features = []
        for text in texts:
            features.append([
                self.count_emoji(text),
                self.capital_ratio(text),
                self.has_numbers_in_word(text),
                self.excessive_spacing(text)
            ])
        return np.array(features)


class AdditionalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Transformer untuk fitur tambahan"""

    def __init__(self):
        self.feature_extractor = AdditionalFeatures()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return self.feature_extractor.extract_features(X)


# ============================================================
# WORD2VEC & FASTTEXT TRANSFORMERS
# ============================================================

class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Transformer untuk Word2Vec embedding"""

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, sg=0):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg  # 0=CBOW, 1=Skip-gram
        self.model = None
        self.preprocessor = TextPreprocessor()

    def fit(self, X, y=None):
        """Train Word2Vec model"""
        # Tokenize sentences
        sentences = [self.preprocessor.preprocess(text).split() for text in X]

        # Train Word2Vec
        self.model = Word2Vec(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg
        )
        return self

    def transform(self, X):
        """Transform texts to averaged word vectors"""
        vectors = []
        for text in X:
            text = self.preprocessor.preprocess(text)
            words = text.split()

            # Get vectors for words in vocabulary
            word_vectors = [
                self.model.wv[word] for word in words
                if word in self.model.wv
            ]

            # Average word vectors
            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
            else:
                vectors.append(np.zeros(self.vector_size))

        return np.array(vectors)


class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Transformer untuk FastText embedding"""

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, sg=0):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg  # 0=CBOW, 1=Skip-gram
        self.model = None
        self.preprocessor = TextPreprocessor()

    def fit(self, X, y=None):
        """Train FastText model"""
        sentences = [self.preprocessor.preprocess(text).split() for text in X]

        self.model = FastText(
            sentences=sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg
        )
        return self

    def transform(self, X):
        """Transform texts to averaged word vectors"""
        vectors = []
        for text in X:
            text = self.preprocessor.preprocess(text)
            words = text.split()

            # FastText can handle OOV words
            word_vectors = [self.model.wv[word] for word in words if words]

            if word_vectors:
                vectors.append(np.mean(word_vectors, axis=0))
            else:
                vectors.append(np.zeros(self.vector_size))

        return np.array(vectors)


# ============================================================
# MODEL CONFIGURATIONS
# ============================================================

def get_classifiers():
    """
    Konfigurasi semua classifier yang tersedia
    Returns dict: {nama_model: instance_model}
    """
    classifiers = {
        # Linear Models
        'logistic_regression': LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            random_state=42,
            solver='lbfgs'
        ),

        # Tree-based Models
        'random_forest': RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ),

        'decision_tree': DecisionTreeClassifier(
            max_depth=20,
            class_weight='balanced',
            random_state=42
        ),

        'gradient_boosting': GradientBoostingClassifier(
            n_estimators=100,
            max_depth=5,
            random_state=42
        ),

        'xgboost': XGBClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            eval_metric='logloss'
        ),

        'lightgbm': LGBMClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            verbose=-1
        ),

        # SVM
        'svm_linear': SVC(
            kernel='linear',
            class_weight='balanced',
            random_state=42,
            probability=True
        ),

        'svm_rbf': SVC(
            kernel='rbf',
            class_weight='balanced',
            random_state=42,
            probability=True
        ),

        # Naive Bayes
        'naive_bayes': MultinomialNB(alpha=1.0),

        # KNN
        'knn': KNeighborsClassifier(
            n_neighbors=5,
            weights='distance',
            n_jobs=-1
        )
    }

    return classifiers


def get_vectorizers():
    """
    Konfigurasi semua vectorizer yang tersedia
    Returns dict: {nama_vectorizer: config}
    """
    preprocessor = TextPreprocessor()

    vectorizers = {
        # TF-IDF variants
        'tfidf_char': TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        'tfidf_word': TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 2),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        'tfidf_char_wb': TfidfVectorizer(
            analyzer='char_wb',
            ngram_range=(2, 5),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        # Count Vectorizer
        'count_char': CountVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        'count_word': CountVectorizer(
            analyzer='word',
            ngram_range=(1, 2),
            max_features=5000,
            min_df=2,
            preprocessor=preprocessor.preprocess
        ),

        # Hashing Vectorizer (memory efficient)
        'hashing_char': HashingVectorizer(
            analyzer='char',
            ngram_range=(2, 5),
            n_features=2**16,
            preprocessor=preprocessor.preprocess
        ),

        # Word2Vec
        'word2vec_cbow': Word2VecTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=0  # CBOW
        ),

        'word2vec_skipgram': Word2VecTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=1  # Skip-gram
        ),

        # FastText
        'fasttext_cbow': FastTextTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=0  # CBOW
        ),

        'fasttext_skipgram': FastTextTransformer(
            vector_size=100,
            window=5,
            min_count=1,
            sg=1  # Skip-gram
        ),

        # Hybrid combinations
        'hybrid_word_char': FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                analyzer='word',
                ngram_range=(1, 2),
                max_features=3000,
                preprocessor=preprocessor.preprocess
            )),
            ('char_tfidf', TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                max_features=3000,
                preprocessor=preprocessor.preprocess
            ))
        ]),

        'hybrid_all_features': FeatureUnion([
            ('word_tfidf', TfidfVectorizer(
                analyzer='word',
                ngram_range=(1, 2),
                max_features=2000,
                preprocessor=preprocessor.preprocess
            )),
            ('char_tfidf', TfidfVectorizer(
                analyzer='char',
                ngram_range=(2, 5),
                max_features=2000,
                preprocessor=preprocessor.preprocess
            )),
            ('additional', AdditionalFeaturesTransformer())
        ])
    }

    return vectorizers


# ============================================================
# PIPELINE BUILDER
# ============================================================

def create_custom_pipeline(vectorizer_name, classifier_name):
    """
    Buat pipeline custom dengan kombinasi vectorizer dan classifier

    Parameters:
    -----------
    vectorizer_name : str
        Nama vectorizer dari get_vectorizers()
    classifier_name : str
        Nama classifier dari get_classifiers()

    Returns:
    --------
    Pipeline object
    """
    vectorizers = get_vectorizers()
    classifiers = get_classifiers()

    if vectorizer_name not in vectorizers:
        raise ValueError(f"Vectorizer '{vectorizer_name}' tidak tersedia. "
                        f"Pilihan: {list(vectorizers.keys())}")

    if classifier_name not in classifiers:
        raise ValueError(f"Classifier '{classifier_name}' tidak tersedia. "
                        f"Pilihan: {list(classifiers.keys())}")

    pipeline = Pipeline([
        ('vectorizer', vectorizers[vectorizer_name]),
        ('classifier', classifiers[classifier_name])
    ])

    return pipeline


# ============================================================
# TIMING UTILITIES
# ============================================================

def measure_single_prediction_time(pipeline, sample_text, num_iterations=100):
    """
    Ukur waktu rata-rata untuk memprediksi satu teks

    Parameters:
    -----------
    pipeline : fitted Pipeline
        Model yang sudah ditraining
    sample_text : str
        Contoh teks untuk prediksi
    num_iterations : int
        Jumlah iterasi untuk mengambil rata-rata

    Returns:
    --------
    dict : {
        'avg_time_ms': waktu rata-rata dalam milidetik,
        'avg_time_sec': waktu rata-rata dalam detik,
        'total_time_sec': total waktu untuk semua iterasi
    }
    """
    times = []

    for _ in range(num_iterations):
        start_time = time.time()
        pipeline.predict([sample_text])
        end_time = time.time()
        times.append(end_time - start_time)

    avg_time = np.mean(times)

    return {
        'avg_time_ms': avg_time * 1000,
        'avg_time_sec': avg_time,
        'total_time_sec': sum(times),
        'min_time_ms': min(times) * 1000,
        'max_time_ms': max(times) * 1000
    }


# ============================================================
# TRAINING & EVALUATION WITH TIMING
# ============================================================

def train_and_evaluate(X, y, pipeline, pipeline_name):
    """Training dan evaluasi model dengan pengukuran waktu"""
    print(f"\n{'='*60}")
    print(f"EVALUASI: {pipeline_name}")
    print(f"{'='*60}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Training with timing
    print("Training model...")
    train_start = time.time()
    pipeline.fit(X_train, y_train)
    train_time = time.time() - train_start
    print(f"⏱️  Training Time: {train_time:.4f} seconds")

    # Prediction with timing
    print("\nTesting prediction speed...")
    pred_start = time.time()
    y_pred = pipeline.predict(X_test)
    pred_time = time.time() - pred_start
    avg_pred_time_per_sample = (pred_time / len(X_test)) * 1000  # in milliseconds
    print(f"⏱️  Batch Prediction Time: {pred_time:.4f} seconds")
    print(f"⏱️  Average Time per Sample: {avg_pred_time_per_sample:.4f} ms")

    # Single prediction timing (more accurate for real-time use)
    print("\nMeasuring single prediction latency (100 iterations)...")
    if len(X_test) > 0:
        single_pred_timing = measure_single_prediction_time(
            pipeline, X_test.iloc[0] if hasattr(X_test, 'iloc') else X_test[0],
            num_iterations=100
        )
        print(f"⏱️  Single Prediction Time (avg): {single_pred_timing['avg_time_ms']:.4f} ms")
        print(f"⏱️  Single Prediction Time (min): {single_pred_timing['min_time_ms']:.4f} ms")
        print(f"⏱️  Single Prediction Time (max): {single_pred_timing['max_time_ms']:.4f} ms")
    else:
        single_pred_timing = {'avg_time_ms': 0}

    # Evaluation
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred,
                                target_names=['Non-Judi', 'Judi']))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Cross-validation with timing
    print("\nPerforming cross-validation...")
    cv_start = time.time()
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1')
    cv_time = time.time() - cv_start
    print(f"⏱️  Cross-Validation Time: {cv_time:.4f} seconds ({cv_time/60:.2f} minutes)")
    print(f"Cross-Validation F1-Score: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    return pipeline, {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'cv_f1_mean': cv_scores.mean(),
        'cv_f1_std': cv_scores.std(),
        'train_time_sec': train_time,
        'batch_pred_time_sec': pred_time,
        'avg_pred_time_ms': avg_pred_time_per_sample,
        'single_pred_time_ms': single_pred_timing['avg_time_ms'],
        'single_pred_min_ms': single_pred_timing.get('min_time_ms', 0),
        'single_pred_max_ms': single_pred_timing.get('max_time_ms', 0),
        'cv_time_sec': cv_time,
        'cv_time_min': cv_time / 60
    }


def compare_multiple_models(X, y, vectorizer_configs, classifier_configs):
    """
    Bandingkan multiple kombinasi vectorizer dan classifier dengan timing

    Parameters:
    -----------
    X : array-like
        Text data
    y : array-like
        Labels
    vectorizer_configs : list of str
        List nama vectorizer yang ingin dicoba
    classifier_configs : list of str
        List nama classifier yang ingin dicoba

    Returns:
    --------
    DataFrame dengan hasil perbandingan termasuk timing
    """
    results = []

    total_experiments = len(vectorizer_configs) * len(classifier_configs)
    experiment_num = 0

    print(f"\n{'='*60}")
    print(f"MEMULAI PERBANDINGAN {total_experiments} KOMBINASI MODEL")
    print(f"{'='*60}\n")

    for vec_name in vectorizer_configs:
        for clf_name in classifier_configs:
            experiment_num += 1
            print(f"\n[{experiment_num}/{total_experiments}] Testing: {vec_name} + {clf_name}")

            try:
                # Create pipeline
                pipeline = create_custom_pipeline(vec_name, clf_name)

                # Train and evaluate
                model, metrics = train_and_evaluate(
                    X, y, pipeline,
                    f"{vec_name} + {clf_name}"
                )

                # Store results
                results.append({
                    'vectorizer': vec_name,
                    'classifier': clf_name,
                    'accuracy': metrics['accuracy'],
                    'precision': metrics['precision'],
                    'recall': metrics['recall'],
                    'f1_score': metrics['f1_score'],
                    'cv_f1_mean': metrics['cv_f1_mean'],
                    'cv_f1_std': metrics['cv_f1_std'],
                    'train_time_sec': metrics['train_time_sec'],
                    'single_pred_ms': metrics['single_pred_time_ms'],
                    'single_pred_min_ms': metrics['single_pred_min_ms'],
                    'single_pred_max_ms': metrics['single_pred_max_ms'],
                    'cv_time_min': metrics['cv_time_min']
                })

            except Exception as e:
                print(f"❌ Error: {str(e)}")
                results.append({
                    'vectorizer': vec_name,
                    'classifier': clf_name,
                    'accuracy': 0,
                    'precision': 0,
                    'recall': 0,
                    'f1_score': 0,
                    'cv_f1_mean': 0,
                    'cv_f1_std': 0,
                    'train_time_sec': 0,
                    'single_pred_ms': 0,
                    'single_pred_min_ms': 0,
                    'single_pred_max_ms': 0,
                    'cv_time_min': 0,
                    'error': str(e)
                })

    # Create results dataframe
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('f1_score', ascending=False)

    # Display summary
    print(f"\n{'='*80}")
    print("RANKING BERDASARKAN F1-SCORE")
    print(f"{'='*80}")
    print(results_df[['vectorizer', 'classifier', 'precision', 'recall', 'f1_score', 'single_pred_ms']].head(10))

    print(f"\n{'='*80}")
    print("RANKING BERDASARKAN KECEPATAN PREDIKSI (FASTEST)")
    print(f"{'='*80}")
    print(results_df.sort_values('single_pred_ms')[['vectorizer', 'classifier', 'precision', 'recall', 'f1_score', 'single_pred_ms']].head(10))

    return results_df


# ============================================================
# QUICK PRESETS
# ============================================================

def create_pipeline_char_tfidf():
    """Pipeline 1: Character-Level TF-IDF (REKOMENDASI UTAMA)"""
    return create_custom_pipeline('tfidf_char', 'logistic_regression')


def create_pipeline_hybrid():
    """Pipeline 2: Hybrid (Word + Char)"""
    return create_custom_pipeline('hybrid_word_char', 'logistic_regression')


def create_pipeline_advanced():
    """Pipeline 3: Advanced (All Features)"""
    return create_custom_pipeline('hybrid_all_features', 'random_forest')


# ============================================================
# VISUALIZATION HELPER
# ============================================================

def create_comparison_table(results_df):
    """
    Buat tabel perbandingan yang lebih mudah dibaca

    Parameters:
    -----------
    results_df : DataFrame
        Hasil dari compare_multiple_models()

    Returns:
    --------
    DataFrame dengan format yang lebih readable
    """
    comparison_df = results_df.copy()

    # Format kolom
    comparison_df['model_name'] = comparison_df['vectorizer'] + ' + ' + comparison_df['classifier']
    comparison_df['f1_score_pct'] = (comparison_df['f1_score'] * 100).round(2)
    comparison_df['accuracy_pct'] = (comparison_df['accuracy'] * 100).round(2)
    comparison_df['precision_pct'] = (comparison_df['precision'] * 100).round(2)
    comparison_df['recall_pct'] = (comparison_df['recall'] * 100).round(2)
    comparison_df['cv_f1_pct'] = (comparison_df['cv_f1_mean'] * 100).round(2)
    comparison_df['pred_speed'] = comparison_df['single_pred_ms'].round(4)
    comparison_df['cv_time'] = comparison_df['cv_time_min'].round(2)

    # Select dan rename kolom
    final_df = comparison_df[[
        'model_name',
        'accuracy_pct',
        'precision_pct',
        'recall_pct',
        'f1_score_pct',
        'cv_f1_pct',
        'pred_speed',
        'train_time_sec',
        'cv_time'
    ]].copy()

    final_df.columns = [
        'Model',
        'Accuracy (%)',
        'Precision (%)',
        'Recall (%)',
        'F1-Score (%)',
        'CV F1-Score (%)',
        'Pred Time (ms)',
        'Train Time (s)',
        'CV Time (min)'
    ]

    return final_df

In [13]:

# ============================================================
# OPSI 1: Test Pipeline Quick Presets
# ============================================================
print("\n" + "="*60)
print("OPSI 1: QUICK PRESETS")
print("="*60)

pipeline1 = create_pipeline_char_tfidf()
model1, metrics1 = train_and_evaluate(X, y, pipeline1, "Character TF-IDF")

# ============================================================
# OPSI 2: Test Single Custom Pipeline
# ============================================================
print("\n" + "="*60)
print("OPSI 2: CUSTOM PIPELINE")
print("="*60)

# Contoh: FastText + Random Forest
custom_pipeline = create_custom_pipeline('fasttext_cbow', 'random_forest')
model_custom, metrics_custom = train_and_evaluate(
    X, y, custom_pipeline,
    "FastText CBOW + Random Forest"
)

# ============================================================
# OPSI 3: Compare Multiple Models
# ============================================================
print("\n" + "="*60)
print("OPSI 3: PERBANDINGAN MULTIPLE MODELS")
print("="*60)

# Pilih vectorizer dan classifier yang ingin dibandingkan
vectorizers_to_test = [
    'tfidf_char',
    'hybrid_word_char'
]

classifiers_to_test = [
    'logistic_regression',
    'random_forest',
    'xgboost',
    'svm_linear',
    'svm_rbf',
    'naive_bayes',
    'knn',
    'lightgbm',
    'gradient_boosting'
]

# Jalankan perbandingan
comparison_results = compare_multiple_models(
    X, y,
    vectorizers_to_test,
    classifiers_to_test
)

    # Tampilkan hasil
print("\n" + "="*80)
print("HASIL PERBANDINGAN MODEL (Sorted by F1-Score)")
print("="*80)
print(comparison_results.to_string(index=False))

    # Top 5 models
print("\n" + "="*80)
print("TOP 5 BEST MODELS")
print("="*80)
print(comparison_results.head().to_string(index=False))

    # ============================================================
    # OPSI 4: List Available Configurations
    # ============================================================
print("\n" + "="*60)
print("AVAILABLE CONFIGURATIONS")
print("="*60)

print("\n📊 Available Vectorizers:")
for i, vec in enumerate(get_vectorizers().keys(), 1):
    print(f"  {i}. {vec}")

print("\n🤖 Available Classifiers:")
for i, clf in enumerate(get_classifiers().keys(), 1):
    print(f"  {i}. {clf}")


OPSI 1: QUICK PRESETS

EVALUASI: Character TF-IDF
Training model...
⏱️  Training Time: 3.2607 seconds

Testing prediction speed...
⏱️  Batch Prediction Time: 0.6717 seconds
⏱️  Average Time per Sample: 0.2877 ms

Measuring single prediction latency (100 iterations)...
⏱️  Single Prediction Time (avg): 1.4892 ms
⏱️  Single Prediction Time (min): 1.2794 ms
⏱️  Single Prediction Time (max): 3.4258 ms

Classification Report:
              precision    recall  f1-score   support

    Non-Judi       0.99      0.99      0.99      2105
        Judi       0.94      0.93      0.94       230

    accuracy                           0.99      2335
   macro avg       0.97      0.96      0.96      2335
weighted avg       0.99      0.99      0.99      2335


Confusion Matrix:
[[2091   14]
 [  15  215]]

Accuracy: 0.9876
Precision: 0.9389
Recall: 0.9348
F1-Score: 0.9368

Performing cross-validation...
⏱️  Cross-Validation Time: 17.2031 seconds (0.29 minutes)
Cross-Validation F1-Score: 0.8888 (+/- 0.11

In [14]:

# ============================================================
# OPSI 1: Test Pipeline Quick Presets
# ============================================================
print("\n" + "="*60)
print("OPSI 1: QUICK PRESETS")
print("="*60)

pipeline1 = create_pipeline_char_tfidf()
model1, metrics1 = train_and_evaluate(X, y, pipeline1, "Character TF-IDF")

# ============================================================
# OPSI 2: Test Single Custom Pipeline
# ============================================================
print("\n" + "="*60)
print("OPSI 2: CUSTOM PIPELINE")
print("="*60)

# Contoh: FastText + Random Forest
custom_pipeline = create_custom_pipeline('fasttext_cbow', 'random_forest')
model_custom, metrics_custom = train_and_evaluate(
    X, y, custom_pipeline,
    "FastText CBOW + Random Forest"
)

# ============================================================
# OPSI 3: Compare Multiple Models
# ============================================================
print("\n" + "="*60)
print("OPSI 3: PERBANDINGAN MULTIPLE MODELS")
print("="*60)

# Pilih vectorizer dan classifier yang ingin dibandingkan
vectorizers_to_test = [
    'tfidf_char',
    'hybrid_word_char',
    'hybrid_all_features'
]

classifiers_to_test = [
    'logistic_regression',
    'svm_rbf',
]

# Jalankan perbandingan
comparison_results = compare_multiple_models(
    X, y,
    vectorizers_to_test,
    classifiers_to_test
)

    # Tampilkan hasil
print("\n" + "="*80)
print("HASIL PERBANDINGAN MODEL (Sorted by F1-Score)")
print("="*80)
print(comparison_results.to_string(index=False))

    # Top 5 models
print("\n" + "="*80)
print("TOP 5 BEST MODELS")
print("="*80)
print(comparison_results.head().to_string(index=False))

    # ============================================================
    # OPSI 4: List Available Configurations
    # ============================================================
print("\n" + "="*60)
print("AVAILABLE CONFIGURATIONS")
print("="*60)

print("\n📊 Available Vectorizers:")
for i, vec in enumerate(get_vectorizers().keys(), 1):
    print(f"  {i}. {vec}")

print("\n🤖 Available Classifiers:")
for i, clf in enumerate(get_classifiers().keys(), 1):
    print(f"  {i}. {clf}")


OPSI 1: QUICK PRESETS

EVALUASI: Character TF-IDF
Training model...
⏱️  Training Time: 2.8122 seconds

Testing prediction speed...
⏱️  Batch Prediction Time: 0.6696 seconds
⏱️  Average Time per Sample: 0.2867 ms

Measuring single prediction latency (100 iterations)...
⏱️  Single Prediction Time (avg): 1.3891 ms
⏱️  Single Prediction Time (min): 1.2758 ms
⏱️  Single Prediction Time (max): 4.1847 ms

Classification Report:
              precision    recall  f1-score   support

    Non-Judi       0.99      0.99      0.99      2105
        Judi       0.94      0.93      0.94       230

    accuracy                           0.99      2335
   macro avg       0.97      0.96      0.96      2335
weighted avg       0.99      0.99      0.99      2335


Confusion Matrix:
[[2091   14]
 [  15  215]]

Accuracy: 0.9876
Precision: 0.9389
Recall: 0.9348
F1-Score: 0.9368

Performing cross-validation...
⏱️  Cross-Validation Time: 17.4073 seconds (0.29 minutes)
Cross-Validation F1-Score: 0.8888 (+/- 0.11

In [10]:
import pandas as pd

df_train = pd.read_csv('https://raw.githubusercontent.com/nafhanugm/data-mining2/refs/heads/master/dataset/train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/nafhanugm/data-mining2/refs/heads/master/dataset/test.csv')
df_holdout = pd.read_csv('https://raw.githubusercontent.com/nafhanugm/data-mining2/refs/heads/master/dataset/holdout.csv', delimiter=';')

df_all = pd.concat([df_train, df_test], ignore_index=True)
X = df_all['comment'].values
y = df_all['label'].values

print("Dataset shape:", X.shape)
print("Label distribution:", np.bincount(y))

Dataset shape: (10506,)
Label distribution: [9580  926]


In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grids for each pipeline
param_grid_hybrid_all_lr = {
    'vectorizer__word_tfidf__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__char_tfidf__ngram_range': [(2, 4), (2, 5)],
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['liblinear', 'lbfgs']
}

param_grid_hybrid_word_char_lr = {
    'vectorizer__word_tfidf__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__char_tfidf__ngram_range': [(2, 4), (2, 5)],
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['liblinear', 'lbfgs']
}

# Create the pipelines
pipeline_hybrid_all_lr = create_custom_pipeline('hybrid_all_features', 'logistic_regression')
pipeline_hybrid_word_char_lr = create_custom_pipeline('hybrid_word_char', 'logistic_regression')

# Perform GridSearchCV for hybrid_all_features + logistic_regression
print("Performing GridSearchCV for hybrid_all_features + logistic_regression...")
grid_search_hybrid_all_lr = GridSearchCV(
    pipeline_hybrid_all_lr,
    param_grid_hybrid_all_lr,
    cv=3,  # Using 3-fold cross-validation for faster tuning
    scoring='f1',
    n_jobs=-1,
    verbose=2
)
grid_search_hybrid_all_lr.fit(X, y)

print("\nBest parameters for hybrid_all_features + logistic_regression:")
print(grid_search_hybrid_all_lr.best_params_)
print("Best F1-Score:", grid_search_hybrid_all_lr.best_score_)

# Perform GridSearchCV for hybrid_word_char + logistic_regression
print("\nPerforming GridSearchCV for hybrid_word_char + logistic_regression...")
grid_search_hybrid_word_char_lr = GridSearchCV(
    pipeline_hybrid_word_char_lr,
    param_grid_hybrid_word_char_lr,
    cv=3,  # Using 3-fold cross-validation for faster tuning
    scoring='f1',
    n_jobs=-1,
    verbose=2
)
grid_search_hybrid_word_char_lr.fit(X, y)

print("\nBest parameters for hybrid_word_char + logistic_regression:")
print(grid_search_hybrid_word_char_lr.best_params_)
print("Best F1-Score:", grid_search_hybrid_word_char_lr.best_score_)

Performing GridSearchCV for hybrid_all_features + logistic_regression...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best parameters for hybrid_all_features + logistic_regression:
{'classifier__C': 10, 'classifier__solver': 'liblinear', 'vectorizer__char_tfidf__ngram_range': (2, 4), 'vectorizer__word_tfidf__ngram_range': (1, 1)}
Best F1-Score: 0.9712223923669906

Performing GridSearchCV for hybrid_word_char + logistic_regression...
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Best parameters for hybrid_word_char + logistic_regression:
{'classifier__C': 10, 'classifier__solver': 'liblinear', 'vectorizer__char_tfidf__ngram_range': (2, 4), 'vectorizer__word_tfidf__ngram_range': (1, 1)}
Best F1-Score: 0.9615746948911467


In [12]:
# ============================================================
# OPSI 1: Test Pipeline Quick Presets
# ============================================================
print("\n" + "="*60)
print("OPSI 1: QUICK PRESETS")
print("="*60)

pipeline1 = create_pipeline_char_tfidf()
model1, metrics1 = train_and_evaluate(X, y, pipeline1, "Character TF-IDF")

# ============================================================
# OPSI 2: Test Single Custom Pipeline
# ============================================================
print("\n" + "="*60)
print("OPSI 2: CUSTOM PIPELINE")
print("="*60)

# Contoh: FastText + Random Forest
custom_pipeline = create_custom_pipeline('fasttext_cbow', 'random_forest')
model_custom, metrics_custom = train_and_evaluate(
    X, y, custom_pipeline,
    "FastText CBOW + Random Forest"
)

# ============================================================
# OPSI 3: Compare Multiple Models
# ============================================================
print("\n" + "="*60)
print("OPSI 3: PERBANDINGAN MULTIPLE MODELS")
print("="*60)

# Pilih vectorizer dan classifier yang ingin dibandingkan
vectorizers_to_test = [
    'tfidf_char',
    'hybrid_word_char',
    'hybrid_all_features'
]

classifiers_to_test = [
    'logistic_regression'
]

# Jalankan perbandingan
comparison_results = compare_multiple_models(
    X, y,
    vectorizers_to_test,
    classifiers_to_test
)

# ============================================================
# OPSI 4: Add Tuned Models to Comparison
# ============================================================
print("\n" + "="*60)
print("OPSI 4: MENAMBAHKAN MODEL TUNED KE PERBANDINGAN")
print("="*60)

# Get the best estimators from GridSearchCV (assuming they were run in a previous cell)
try:
    tuned_hybrid_all_lr = grid_search_hybrid_all_lr.best_estimator_
    tuned_hybrid_word_char_lr = grid_search_hybrid_word_char_lr.best_estimator_

    # Evaluate tuned models
    print("\nEvaluating Tuned Hybrid All Features + Logistic Regression...")
    model_tuned_hybrid_all_lr, metrics_tuned_hybrid_all_lr = train_and_evaluate(
        X, y, tuned_hybrid_all_lr,
        "TUNED Hybrid All Features + Logistic Regression"
    )

    print("\nEvaluating Tuned Hybrid Word Char + Logistic Regression...")
    model_tuned_hybrid_word_char_lr, metrics_tuned_hybrid_word_char_lr = train_and_evaluate(
        X, y, tuned_hybrid_word_char_lr,
        "TUNED Hybrid Word Char + Logistic Regression"
    )

    # Add tuned model results to the comparison DataFrame
    tuned_results = []
    tuned_results.append({
        'vectorizer': 'hybrid_all_features (TUNED)',
        'classifier': 'logistic_regression (TUNED)',
        'accuracy': metrics_tuned_hybrid_all_lr['accuracy'],
        'precision': metrics_tuned_hybrid_all_lr['precision'],
        'recall': metrics_tuned_hybrid_all_lr['recall'],
        'f1_score': metrics_tuned_hybrid_all_lr['f1_score'],
        'cv_f1_mean': metrics_tuned_hybrid_all_lr['cv_f1_mean'],
        'cv_f1_std': metrics_tuned_hybrid_all_lr['cv_f1_std'],
        'train_time_sec': metrics_tuned_hybrid_all_lr['train_time_sec'],
        'single_pred_ms': metrics_tuned_hybrid_all_lr['single_pred_time_ms'],
        'single_pred_min_ms': metrics_tuned_hybrid_all_lr['single_pred_min_ms'],
        'single_pred_max_ms': metrics_tuned_hybrid_all_lr['single_pred_max_ms'],
        'cv_time_min': metrics_tuned_hybrid_all_lr['cv_time_min']
    })
    tuned_results.append({
        'vectorizer': 'hybrid_word_char (TUNED)',
        'classifier': 'logistic_regression (TUNED)',
        'accuracy': metrics_tuned_hybrid_word_char_lr['accuracy'],
        'precision': metrics_tuned_hybrid_word_char_lr['precision'],
        'recall': metrics_tuned_hybrid_word_char_lr['recall'],
        'f1_score': metrics_tuned_hybrid_word_char_lr['f1_score'],
        'cv_f1_mean': metrics_tuned_hybrid_word_char_lr['cv_f1_mean'],
        'cv_f1_std': metrics_tuned_hybrid_word_char_lr['cv_f1_std'],
        'train_time_sec': metrics_tuned_hybrid_word_char_lr['train_time_sec'],
        'single_pred_ms': metrics_tuned_hybrid_word_char_lr['single_pred_time_ms'],
        'single_pred_min_ms': metrics_tuned_hybrid_word_char_lr['single_pred_min_ms'],
        'single_pred_max_ms': metrics_tuned_hybrid_word_char_lr['single_pred_max_ms'],
        'cv_time_min': metrics_tuned_hybrid_word_char_lr['cv_time_min']
    })

    comparison_results = pd.concat([comparison_results, pd.DataFrame(tuned_results)], ignore_index=True)

except NameError:
    print("\nSkipping tuned model comparison: GridSearchCV results not found. Please run the tuning cell first.")
except Exception as e:
    print(f"\nAn error occurred while adding tuned models: {str(e)}")


    # Tampilkan hasil
print("\n" + "="*80)
print("HASIL PERBANDINGAN MODEL (Sorted by F1-Score)")
print("="*80)
print(comparison_results.sort_values('f1_score', ascending=False).to_string(index=False))

    # Top 5 models (including tuned if added)
print("\n" + "="*80)
print("TOP 5 BEST MODELS (Sorted by F1-Score)")
print("="*80)
print(comparison_results.sort_values('f1_score', ascending=False).head().to_string(index=False))

    # Display summary sorted by Cross-Validation F1-Score
print(f"\n{'='*80}")
print("RANKING BERDASARKAN CROSS-VALIDATION F1-SCORE")
print(f"{'='*80}")
print(comparison_results.sort_values('cv_f1_mean', ascending=False)[['vectorizer', 'classifier', 'precision', 'recall', 'f1_score', 'cv_f1_mean', 'single_pred_ms']].head(10))

    # Display summary sorted by prediction speed
print(f"\n{'='*80}")
print("RANKING BERDASARKAN KECEPATAN PREDIKSI (FASTEST)")
print(f"{'='*80}")
print(comparison_results.sort_values('single_pred_ms')[['vectorizer', 'classifier', 'precision', 'recall', 'f1_score', 'single_pred_ms']].head(10))


    # ============================================================
    # OPSI 5: List Available Configurations (Moved to end for clarity)
    # ============================================================
print("\n" + "="*60)
print("AVAILABLE CONFIGURATIONS")
print("="*60)

print("\n📊 Available Vectorizers:")
for i, vec in enumerate(get_vectorizers().keys(), 1):
    print(f"  {i}. {vec}")

print("\n🤖 Available Classifiers:")
for i, clf in enumerate(get_classifiers().keys(), 1):
    print(f"  {i}. {clf}")


OPSI 1: QUICK PRESETS

EVALUASI: Character TF-IDF
Training model...
⏱️  Training Time: 4.2647 seconds

Testing prediction speed...
⏱️  Batch Prediction Time: 0.4601 seconds
⏱️  Average Time per Sample: 0.2189 ms

Measuring single prediction latency (100 iterations)...
⏱️  Single Prediction Time (avg): 2.7903 ms
⏱️  Single Prediction Time (min): 2.0623 ms
⏱️  Single Prediction Time (max): 4.9741 ms

Classification Report:
              precision    recall  f1-score   support

    Non-Judi       0.99      1.00      1.00      1917
        Judi       0.96      0.94      0.95       185

    accuracy                           0.99      2102
   macro avg       0.98      0.97      0.97      2102
weighted avg       0.99      0.99      0.99      2102


Confusion Matrix:
[[1909    8]
 [  11  174]]

Accuracy: 0.9910
Precision: 0.9560
Recall: 0.9405
F1-Score: 0.9482

Performing cross-validation...
⏱️  Cross-Validation Time: 20.2270 seconds (0.34 minutes)
Cross-Validation F1-Score: 0.9448 (+/- 0.01

In [13]:
X_val = df_holdout['comment'].values
y_val = df_holdout['label'].values

In [16]:
# ============================================================
# OPSI 6: Evaluate Best Tuned Model on Holdout Data
# ============================================================
print("\n" + "="*60)
print("OPSI 6: EVALUASI MODEL TUNED PADA DATA HOLDOUT")
print("="*60)

try:
    # Assuming grid_search_hybrid_all_lr and grid_search_hybrid_word_char_lr are available from previous tuning
    tuned_hybrid_all_lr = grid_search_hybrid_all_lr.best_estimator_
    tuned_hybrid_word_char_lr = grid_search_hybrid_word_char_lr.best_estimator_

    print("\nEvaluating TUNED Hybrid All Features + Logistic Regression on Holdout Data...")
    model_holdout_all, metrics_holdout_all = train_and_evaluate(
        X_val, y_val, tuned_hybrid_all_lr,
        "TUNED Hybrid All Features + Logistic Regression (Holdout)"
    )

    print("\nHasil Evaluasi TUNED Hybrid All Features + Logistic Regression pada Data Holdout:")
    print(f"Accuracy: {metrics_holdout_all['accuracy']:.4f}")
    print(f"Precision: {metrics_holdout_all['precision']:.4f}")
    print(f"Recall: {metrics_holdout_all['recall']:.4f}")
    print(f"F1-Score: {metrics_holdout_all['f1_score']:.4f}")
    print(f"Single Prediction Time (avg): {metrics_holdout_all['single_pred_time_ms']:.4f} ms")


    print("\n" + "-"*60)
    print("Evaluating TUNED Hybrid Word Char + Logistic Regression on Holdout Data...")
    model_holdout_word_char, metrics_holdout_word_char = train_and_evaluate(
        X_val, y_val, tuned_hybrid_word_char_lr,
        "TUNED Hybrid Word Char + Logistic Regression (Holdout)"
    )

    print("\nHasil Evaluasi TUNED Hybrid Word Char + Logistic Regression pada Data Holdout:")
    print(f"Accuracy: {metrics_holdout_word_char['accuracy']:.4f}")
    print(f"Precision: {metrics_holdout_word_char['precision']:.4f}")
    print(f"Recall: {metrics_holdout_word_char['recall']:.4f}")
    print(f"F1-Score: {metrics_holdout_word_char['f1_score']:.4f}")
    print(f"Single Prediction Time (avg): {metrics_holdout_word_char['single_pred_time_ms']:.4f} ms")


except NameError:
    print("\nSkipping holdout evaluation: Tuned models ('grid_search_hybrid_all_lr' or 'grid_search_hybrid_word_char_lr') not found. Please run the tuning cell first.")
except Exception as e:
    print(f"\nAn error occurred during holdout evaluation: {str(e)}")


OPSI 6: EVALUASI MODEL TUNED PADA DATA HOLDOUT

Evaluating TUNED Hybrid All Features + Logistic Regression on Holdout Data...

EVALUASI: TUNED Hybrid All Features + Logistic Regression (Holdout)
Training model...
⏱️  Training Time: 0.3134 seconds

Testing prediction speed...
⏱️  Batch Prediction Time: 0.0489 seconds
⏱️  Average Time per Sample: 0.2089 ms

Measuring single prediction latency (100 iterations)...
⏱️  Single Prediction Time (avg): 2.8740 ms
⏱️  Single Prediction Time (min): 2.4090 ms
⏱️  Single Prediction Time (max): 8.1997 ms

Classification Report:
              precision    recall  f1-score   support

    Non-Judi       0.98      0.99      0.98       189
        Judi       0.95      0.91      0.93        45

    accuracy                           0.97       234
   macro avg       0.97      0.95      0.96       234
weighted avg       0.97      0.97      0.97       234


Confusion Matrix:
[[187   2]
 [  4  41]]

Accuracy: 0.9744
Precision: 0.9535
Recall: 0.9111
F1-Score:

In [17]:
# Get predictions from the best tuned model on the holdout data
# Assuming tuned_hybrid_all_lr is the best model from previous tuning
try:
    y_pred_holdout = tuned_hybrid_all_lr.predict(X_val)

    # Find misclassified indices
    misclassified_indices = np.where(y_val != y_pred_holdout)[0]

    print(f"\n{'='*60}")
    print(f"MISCLASSIFIED SAMPLES ON HOLDOUT DATA ({len(misclassified_indices)} total)")
    print(f"{'='*60}")

    # Display misclassified samples
    misclassified_data = df_holdout.iloc[misclassified_indices].copy()
    misclassified_data['predicted_label'] = y_pred_holdout[misclassified_indices]

    display(misclassified_data)

except NameError:
    print("\nSkipping misspredict data display: Tuned model ('tuned_hybrid_all_lr') not found. Please run the tuning cell first.")
except Exception as e:
    print(f"\nAn error occurred while displaying misspredict data: {str(e)}")


MISCLASSIFIED SAMPLES ON HOLDOUT DATA (6 total)


Unnamed: 0,comment,label,predicted_label
497,beneran gampang hasilnya nyata banget,1,0
604,nggak gacor,0,1
665,ubur ubur ikan lele 𝙁o𝙍𝙏𝙐𝙉𝙀𝟮z𝟴 mantap lee,1,0
668,gicir murah jeppe lsg mendarat,1,0
930,banget ya muka lu macing² unboxing 3 btw nanam...,0,1
1137,𝑪𝑵𝑫88 bosan ser,1,0
