In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import re
import emoji
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Load datasets
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')
df_holdout = pd.read_csv('dataset/holdout.csv', delimiter=';')

# Combine all datasets
df_all = pd.concat([df_train, df_test, df_holdout], ignore_index=True)

print("Dataset loaded successfully!")
print(f"Total samples: {len(df_all)}")
print(f"Columns: {list(df_all.columns)}")
print(f"Data types:\n{df_all.dtypes}")
print(f"\nNull values:\n{df_all.isnull().sum()}")


Dataset loaded successfully!
Total samples: 11673
Columns: ['comment', 'label']
Data types:
comment    object
label       int64
dtype: object

Null values:
comment    0
label      0
dtype: int64


In [6]:
df_holdout

Unnamed: 0,comment,label
0,modal kaya coba aja 𝑲𝑰𝑵𝑮𝟑𝟐𝟖 🔥 💰💎,1
1,herantapidunia serius bg idup lu,0
2,coba unboxing sukuna ajah gojo mah dikalahkan ...,0
3,seru banget eps iklan tetep w pantengin wkwkwk...,0
4,noefvon negri menawarkan beda dr kompetitor ga,0
...,...,...
1162,gue lo es kiko,0
1163,mbg audit korupsi terbesar 😢,0
1164,𝕂𝕖𝕟𝕒𝕡𝕒 𝕙𝕒𝕣𝕦𝕤 𝕙𝕦𝕤𝕓𝕦 𝕘𝕦𝕖 𝕓𝕒𝕟𝕘 😢,0
1165,syarifairlangga4608 masuk surga,0


In [7]:
# 1. BASIC OVERVIEW
print("=== 1. BASIC OVERVIEW ===")
print("\nFirst 5 samples:")
print(df_all.head())

print(f"\nDataset shape: {df_all.shape}")
print(f"Memory usage: {df_all.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Label distribution
label_counts = df_all['label'].value_counts()
print(f"\nLabel distribution:")
print(label_counts)
print(f"\nLabel percentages:")
print(df_all['label'].value_counts(normalize=True) * 100)

# Balance ratio
balance_ratio = label_counts.min() / label_counts.max()
print(f"\nDataset balance ratio: {balance_ratio:.3f} (1.0 = perfectly balanced)")

# Visualize label distribution
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "bar"}]],
    subplot_titles=("Label Distribution (Pie)", "Label Distribution (Bar)")
)

# Pie chart
fig.add_trace(
    go.Pie(labels=label_counts.index, values=label_counts.values, name="Labels"),
    row=1, col=1
)

# Bar chart
fig.add_trace(
    go.Bar(x=label_counts.index, y=label_counts.values, name="Count", 
           text=label_counts.values, textposition='auto'),
    row=1, col=2
)

fig.update_layout(
    title="Dataset Label Distribution",
    showlegend=True,
    height=400
)

fig.show()


=== 1. BASIC OVERVIEW ===

First 5 samples:
                                             comment  label
0                                     aamiin ya rabb      0
1  terima kasih mengajak jalan2 virtual raja ampa...      0
2                                        bener prabu      0
3                               tonton video ya hehe      0
4  coach nova plis suruh pda bljr sepak penalti a...      0

Dataset shape: (11673, 2)
Memory usage: 2.12 MB

Label distribution:
label
0    10522
1     1151
Name: count, dtype: int64

Label percentages:
label
0    90.139638
1     9.860362
Name: proportion, dtype: float64

Dataset balance ratio: 0.109 (1.0 = perfectly balanced)


In [9]:
# 2. TEXT LENGTH & WORD COUNT ANALYSIS
print("=== 2. TEXT LENGTH & WORD COUNT ANALYSIS ===")

# Add text length features
df_all['char_count'] = df_all['comment'].str.len()
df_all['word_count'] = df_all['comment'].str.split().str.len()

print(f"\nText length statistics:")
print(df_all[['char_count', 'word_count']].describe())

# Statistics by label
print(f"\nText length by label:")
print(df_all.groupby('label')[['char_count', 'word_count']].describe())

# Visualize character count distribution
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Character Count Distribution", "Word Count Distribution",
                   "Character Count by Label", "Word Count by Label")
)

# Character count histogram
fig.add_trace(
    go.Histogram(x=df_all['char_count'], nbinsx=50, name="All Comments"),
    row=1, col=1
)

# Word count histogram
fig.add_trace(
    go.Histogram(x=df_all['word_count'], nbinsx=50, name="All Comments"),
    row=1, col=2
)

# Character count by label
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['char_count']
    fig.add_trace(
        go.Histogram(x=data, nbinsx=30, name=f"Char Count - {label}", opacity=0.7),
        row=2, col=1
    )

# Word count by label
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['word_count']
    fig.add_trace(
        go.Histogram(x=data, nbinsx=30, name=f"Word Count - {label}", opacity=0.7),
        row=2, col=2
    )

fig.update_layout(
    title="Text Length Analysis",
    height=600,
    showlegend=True
)

fig.show()

# Box plots for better comparison
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Character Count by Label", "Word Count by Label")
)

for label in df_all['label'].unique():
    char_data = df_all[df_all['label'] == label]['char_count']
    word_data = df_all[df_all['label'] == label]['word_count']
    
    fig.add_trace(
        go.Box(y=char_data, name=f"Char Count - {label}"),
        row=1, col=1
    )
    
    fig.add_trace(
        go.Box(y=word_data, name=f"Word Count - {label}"),
        row=1, col=2
    )

fig.update_layout(
    title="Text Length Distribution Comparison",
    height=400
)

fig.show()


=== 2. TEXT LENGTH & WORD COUNT ANALYSIS ===

Text length statistics:
         char_count    word_count
count  11673.000000  11673.000000
mean      63.790799     10.405209
std      122.069147     18.648340
min        1.000000      1.000000
25%       20.000000      4.000000
50%       35.000000      6.000000
75%       65.000000     11.000000
max     5258.000000    851.000000

Text length by label:
      char_count                                                        \
           count       mean         std  min   25%   50%   75%     max   
label                                                                    
0        10522.0  65.872553  128.291660  1.0  18.0  33.0  69.0  5258.0   
1         1151.0  44.760209   16.099522  3.0  34.0  44.0  54.0   245.0   

      word_count                                                    
           count       mean        std  min  25%  50%   75%    max  
label                                                               
0        10522.0  10.53

In [10]:
# 3. CHARACTER RATIO ANALYSIS
print("=== 3. CHARACTER RATIO ANALYSIS ===")

# Helper functions
MATH_ALNUM_RANGES = [
    (0x1D400, 0x1D7FF),  # Mathematical Alphanumeric Symbols
]

def is_math_alnum_char(ch: str) -> bool:
    """Check if character is mathematical alphanumeric"""
    cp = ord(ch)
    return any(start <= cp <= end for start, end in MATH_ALNUM_RANGES)

def compute_ratios(original):
    """Compute character ratios for text analysis"""
    total_chars = max(len(original), 1)
    num_digits = sum(ch.isdigit() for ch in original)
    num_special = sum((not ch.isalnum()) and not ch.isspace() for ch in original)
    stylized_count = sum(is_math_alnum_char(ch) for ch in original)
    
    return {
        'special_char_ratio': num_special / total_chars,
        'number_ratio': num_digits / total_chars,
        'stylized_char_ratio': stylized_count / total_chars,
        'char_count': len(original),
        'word_count': len(original.split())
    }

# Apply ratio calculations
ratio_features = df_all['comment'].apply(compute_ratios)
ratio_df = pd.DataFrame(ratio_features.tolist())
df_all = pd.concat([df_all, ratio_df], axis=1)

print("Character ratio statistics:")
print(df_all[['special_char_ratio', 'number_ratio', 'stylized_char_ratio']].describe())

print("\nCharacter ratios by label:")
print(df_all.groupby('label')[['special_char_ratio', 'number_ratio', 'stylized_char_ratio']].describe())

# Visualize character ratios
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=("Special Char Ratio", "Number Ratio", "Stylized Char Ratio",
                   "Special Char Ratio by Label", "Number Ratio by Label", "Stylized Char Ratio by Label")
)

# Overall distributions
fig.add_trace(
    go.Histogram(x=df_all['special_char_ratio'], nbinsx=50, name="Special Char Ratio"),
    row=1, col=1
)

fig.add_trace(
    go.Histogram(x=df_all['number_ratio'], nbinsx=50, name="Number Ratio"),
    row=1, col=2
)

fig.add_trace(
    go.Histogram(x=df_all['stylized_char_ratio'], nbinsx=50, name="Stylized Char Ratio"),
    row=1, col=3
)

# By label distributions
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]
    
    fig.add_trace(
        go.Histogram(x=data['special_char_ratio'], nbinsx=30, name=f"Special - {label}", opacity=0.7),
        row=2, col=1
    )
    
    fig.add_trace(
        go.Histogram(x=data['number_ratio'], nbinsx=30, name=f"Number - {label}", opacity=0.7),
        row=2, col=2
    )
    
    fig.add_trace(
        go.Histogram(x=data['stylized_char_ratio'], nbinsx=30, name=f"Stylized - {label}", opacity=0.7),
        row=2, col=3
    )

fig.update_layout(
    title="Character Ratio Analysis",
    height=600,
    showlegend=True
)

fig.show()

# Box plots for character ratios
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Special Char Ratio by Label", "Number Ratio by Label", "Stylized Char Ratio by Label")
)

for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]
    
    fig.add_trace(
        go.Box(y=data['special_char_ratio'], name=f"Special - {label}"),
        row=1, col=1
    )
    
    fig.add_trace(
        go.Box(y=data['number_ratio'], name=f"Number - {label}"),
        row=1, col=2
    )
    
    fig.add_trace(
        go.Box(y=data['stylized_char_ratio'], name=f"Stylized - {label}"),
        row=1, col=3
    )

fig.update_layout(
    title="Character Ratio Distribution Comparison",
    height=400
)

fig.show()

# Correlation analysis
correlation_matrix = df_all[['special_char_ratio', 'number_ratio', 'stylized_char_ratio', 'char_count', 'word_count']].corr()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    zmid=0,
    text=np.round(correlation_matrix.values, 3),
    texttemplate="%{text}",
    textfont={"size": 10}
))

fig.update_layout(
    title="Correlation Matrix of Text Features",
    height=500
)

fig.show()


=== 3. CHARACTER RATIO ANALYSIS ===
Character ratio statistics:
       special_char_ratio  number_ratio  stylized_char_ratio
count        11673.000000  11673.000000         11673.000000
mean             0.027355      0.023359             0.011820
std              0.095675      0.078260             0.050045
min              0.000000      0.000000             0.000000
25%              0.000000      0.000000             0.000000
50%              0.000000      0.000000             0.000000
75%              0.021739      0.015038             0.000000
max              1.000000      1.000000             1.000000

Character ratios by label:
      special_char_ratio                                                    \
                   count      mean       std  min  25%       50%       75%   
label                                                                        
0                10522.0  0.027721  0.100362  0.0  0.0  0.000000  0.017857   
1                 1151.0  0.024011  0.027286  0

In [11]:
# 4. EMOJI ANALYSIS
print("=== 4. EMOJI ANALYSIS ===")

def extract_emoji(text):
    """Extract emojis from text using emoji library"""
    return [c for c in text if c in emoji.EMOJI_DATA]

def count_emojis(text):
    """Count number of emojis in text"""
    return len(extract_emoji(text))

# Add emoji features
df_all['emoji_list'] = df_all['comment'].apply(extract_emoji)
df_all['emoji_count'] = df_all['comment'].apply(count_emojis)
df_all['has_emoji'] = df_all['emoji_count'] > 0

print(f"Emoji statistics:")
print(f"Comments with emojis: {df_all['has_emoji'].sum()} ({df_all['has_emoji'].mean()*100:.1f}%)")
print(f"Average emojis per comment: {df_all['emoji_count'].mean():.2f}")
print(f"Max emojis in a comment: {df_all['emoji_count'].max()}")

print(f"\nEmoji usage by label:")
emoji_by_label = df_all.groupby('label').agg({
    'has_emoji': ['sum', 'mean'],
    'emoji_count': ['mean', 'std']
}).round(3)
print(emoji_by_label)

# Get all emojis and count frequency
all_emojis = []
for emoji_list in df_all['emoji_list']:
    all_emojis.extend(emoji_list)

emoji_counter = Counter(all_emojis)
top_emojis = emoji_counter.most_common(20)

print(f"\nTop 20 most frequent emojis:")
for emoji_char, count in top_emojis:
    print(f"{emoji_char}: {count}")

# Visualize emoji analysis
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Emoji Count Distribution", "Emoji Usage by Label",
                   "Top 20 Most Frequent Emojis", "Emoji Count by Label")
)

# Emoji count distribution
fig.add_trace(
    go.Histogram(x=df_all['emoji_count'], nbinsx=20, name="Emoji Count"),
    row=1, col=1
)

# Emoji usage by label (percentage)
emoji_usage = df_all.groupby('label')['has_emoji'].mean() * 100
fig.add_trace(
    go.Bar(x=emoji_usage.index, y=emoji_usage.values, name="Emoji Usage %"),
    row=1, col=2
)

# Top emojis
top_20_emojis = [item[0] for item in top_emojis]
top_20_counts = [item[1] for item in top_emojis]

fig.add_trace(
    go.Bar(x=top_20_emojis, y=top_20_counts, name="Emoji Frequency"),
    row=2, col=1
)

# Emoji count by label (box plot)
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['emoji_count']
    fig.add_trace(
        go.Box(y=data, name=f"Emoji Count - {label}"),
        row=2, col=2
    )

fig.update_layout(
    title="Emoji Analysis",
    height=600,
    showlegend=True
)

fig.show()

# Emoji frequency comparison between labels
print(f"\nEmoji frequency comparison by label:")
for label in df_all['label'].unique():
    label_emojis = []
    for emoji_list in df_all[df_all['label'] == label]['emoji_list']:
        label_emojis.extend(emoji_list)
    
    label_emoji_counter = Counter(label_emojis)
    top_10_label = label_emoji_counter.most_common(10)
    
    print(f"\nTop 10 emojis for '{label}':")
    for emoji_char, count in top_10_label:
        print(f"  {emoji_char}: {count}")

# Create comparison chart for top emojis by label
fig = go.Figure()

for label in df_all['label'].unique():
    label_emojis = []
    for emoji_list in df_all[df_all['label'] == label]['emoji_list']:
        label_emojis.extend(emoji_list)
    
    label_emoji_counter = Counter(label_emojis)
    top_10_label = label_emoji_counter.most_common(10)
    
    emojis = [item[0] for item in top_10_label]
    counts = [item[1] for item in top_10_label]
    
    fig.add_trace(
        go.Bar(x=emojis, y=counts, name=f"Top 10 - {label}")
    )

fig.update_layout(
    title="Top 10 Emojis by Label",
    xaxis_title="Emoji",
    yaxis_title="Count",
    height=500
)

fig.show()


=== 4. EMOJI ANALYSIS ===
Emoji statistics:
Comments with emojis: 4021 (34.4%)
Average emojis per comment: 0.75
Max emojis in a comment: 1023

Emoji usage by label:
      has_emoji        emoji_count        
            sum   mean        mean     std
label                                     
0          3221  0.306       0.729  10.256
1           800  0.695       0.992   0.882

Top 20 most frequent emojis:
😂: 1604
🚗: 660
❤: 583
😅: 492
🔥: 482
🚝: 363
😊: 336
🤣: 333
🎉: 332
😭: 261
😢: 253
🗿: 240
🙏: 205
👍: 197
😁: 135
😮: 98
🏻: 91
⚡: 74
🚩: 62
💥: 57



Emoji frequency comparison by label:

Top 10 emojis for '0':
  😂: 1598
  🚗: 660
  ❤: 557
  😅: 492
  🚝: 363
  🤣: 332
  😊: 327
  🎉: 309
  🔥: 284
  😭: 260

Top 10 emojis for '1':
  🔥: 198
  ⚡: 74
  🚩: 61
  🔍: 34
  ✌: 34
  💥: 28
  ❤: 26
  🎉: 23
  ✨: 22
  🏆: 20


In [15]:
# 5. WORD FREQUENCY & N-GRAM ANALYSIS
print("=== 5. WORD FREQUENCY & N-GRAM ANALYSIS ===")

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Get Indonesian stopwords
try:
    indonesian_stopwords = set(stopwords.words('indonesian'))
except:
    # Fallback to English if Indonesian not available
    indonesian_stopwords = set(stopwords.words('english'))

# Add common Indonesian stopwords
indonesian_stopwords.update(['yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'dengan', 'untuk', 'pada', 'adalah', 'atau', 'akan', 'telah', 'sudah', 'bisa', 'dapat', 'jika', 'karena', 'jadi', 'juga', 'saja', 'tidak', 'bukan', 'tapi', 'namun', 'oleh', 'dalam', 'atas', 'bawah', 'antara', 'sampai', 'hingga', 'sejak', 'selama', 'ketika', 'saat', 'waktu', 'tempat', 'dari', 'orang', 'yg', 'untuk', 'ke', 'pada', 'saat', 'oleh'])

def clean_text(text):
    """Clean text for word frequency analysis"""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters but keep spaces
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

def get_word_frequency(texts, top_n=20):
    """Get word frequency from list of texts"""
    all_words = []
    for text in texts:
        cleaned = clean_text(text)
        words = word_tokenize(cleaned)
        # Remove stopwords and short words
        words = [word for word in words if word not in indonesian_stopwords and len(word) > 2]
        all_words.extend(words)
    
    word_counter = Counter(all_words)
    return word_counter.most_common(top_n)

def get_ngrams(texts, n=2, top_n=20):
    """Get n-grams from list of texts"""
    all_ngrams = []
    for text in texts:
        cleaned = clean_text(text)
        words = word_tokenize(cleaned)
        words = [word for word in words if word not in indonesian_stopwords and len(word) > 2]
        ngram_list = list(ngrams(words, n))
        all_ngrams.extend(ngram_list)
    
    ngram_counter = Counter(all_ngrams)
    return ngram_counter.most_common(top_n)

# Analyze word frequency by label
print("Word frequency analysis by label:")

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Top 20 Words - All", "Top 20 Words by Label", 
                   "Top 20 Bigrams - All", "Top 20 Trigrams - All")
)

# All words
all_words = get_word_frequency(df_all['comment'], 20)
words, counts = zip(*all_words)
fig.add_trace(
    go.Bar(x=words, y=counts, name="All Words"),
    row=1, col=1
)

# Words by label
for label in df_all['label'].unique():
    label_texts = df_all[df_all['label'] == label]['comment']
    label_words = get_word_frequency(label_texts, 20)
    words, counts = zip(*label_words)
    fig.add_trace(
        go.Bar(x=words, y=counts, name=f"Words - {label}"),
        row=1, col=2
    )

# Bigrams
all_bigrams = get_ngrams(df_all['comment'], n=2, top_n=20)
bigram_texts = [' '.join(bigram) for bigram, _ in all_bigrams]
bigram_counts = [count for _, count in all_bigrams]
fig.add_trace(
    go.Bar(x=bigram_texts, y=bigram_counts, name="Bigrams"),
    row=2, col=1
)

# Trigrams
all_trigrams = get_ngrams(df_all['comment'], n=3, top_n=20)
trigram_texts = [' '.join(trigram) for trigram, _ in all_trigrams]
trigram_counts = [count for _, count in all_trigrams]
fig.add_trace(
    go.Bar(x=trigram_texts, y=trigram_counts, name="Trigrams"),
    row=2, col=2
)

fig.update_layout(
    title="Word Frequency and N-gram Analysis",
    height=800,
    showlegend=True
)

fig.show()

# Print detailed word frequency results
print("\nTop 20 words overall:")
for word, count in all_words:
    print(f"{word}: {count}")

print(f"\nTop 20 words by label:")
for label in df_all['label'].unique():
    label_texts = df_all[df_all['label'] == label]['comment']
    label_words = get_word_frequency(label_texts, 20)
    print(f"\n{label}:")
    for word, count in label_words:
        print(f"  {word}: {count}")

print(f"\nTop 20 bigrams:")
for bigram, count in all_bigrams:
    print(f"{' '.join(bigram)}: {count}")

print(f"\nTop 20 trigrams:")
for trigram, count in all_trigrams:
    print(f"{' '.join(trigram)}: {count}")

# Word cloud analysis (we'll create a simple frequency visualization instead)
print(f"\nWord frequency comparison between labels:")

# Create comparison for top words
fig = go.Figure()

for label in df_all['label'].unique():
    label_texts = df_all[df_all['label'] == label]['comment']
    label_words = get_word_frequency(label_texts, 15)
    words, counts = zip(*label_words)
    
    fig.add_trace(
        go.Bar(x=words, y=counts, name=f"Top 15 - {label}")
    )

fig.update_layout(
    title="Top 15 Words by Label Comparison",
    xaxis_title="Words",
    yaxis_title="Frequency",
    height=500
)

fig.show()


=== 5. WORD FREQUENCY & N-GRAM ANALYSIS ===


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>


LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/user/nltk_data'
    - '/Users/user/code/penambangan-data/.venv/nltk_data'
    - '/Users/user/code/penambangan-data/.venv/share/nltk_data'
    - '/Users/user/code/penambangan-data/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# 6. CHARACTER INSPECTION
print("=== 6. CHARACTER INSPECTION ===")

def analyze_unicode_chars(text):
    """Analyze unicode characters in text"""
    chars = []
    for char in text:
        if ord(char) > 127:  # Non-ASCII characters
            chars.append(char)
    return chars

def get_stylized_chars(text):
    """Get stylized unicode characters"""
    stylized = []
    for char in text:
        if is_math_alnum_char(char):
            stylized.append(char)
    return stylized

# Analyze unicode characters
df_all['unicode_chars'] = df_all['comment'].apply(analyze_unicode_chars)
df_all['unicode_count'] = df_all['unicode_chars'].str.len()
df_all['stylized_chars'] = df_all['comment'].apply(get_stylized_chars)
df_all['stylized_count'] = df_all['stylized_chars'].str.len()

print(f"Unicode character analysis:")
print(f"Comments with unicode chars: {(df_all['unicode_count'] > 0).sum()} ({(df_all['unicode_count'] > 0).mean()*100:.1f}%)")
print(f"Comments with stylized chars: {(df_all['stylized_count'] > 0).sum()} ({(df_all['stylized_count'] > 0).mean()*100:.1f}%)")

print(f"\nUnicode usage by label:")
unicode_by_label = df_all.groupby('label').agg({
    'unicode_count': ['mean', 'std', 'max'],
    'stylized_count': ['mean', 'std', 'max']
}).round(3)
print(unicode_by_label)

# Get all unicode characters and their frequency
all_unicode = []
for char_list in df_all['unicode_chars']:
    all_unicode.extend(char_list)

unicode_counter = Counter(all_unicode)
top_unicode = unicode_counter.most_common(20)

print(f"\nTop 20 most frequent unicode characters:")
for char, count in top_unicode:
    print(f"'{char}' (U+{ord(char):04X}): {count}")

# Get stylized characters frequency
all_stylized = []
for char_list in df_all['stylized_chars']:
    all_stylized.extend(char_list)

stylized_counter = Counter(all_stylized)
top_stylized = stylized_counter.most_common(20)

print(f"\nTop 20 most frequent stylized characters:")
for char, count in top_stylized:
    print(f"'{char}' (U+{ord(char):04X}): {count}")

# Visualize unicode analysis
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Unicode Count Distribution", "Stylized Count Distribution",
                   "Unicode Usage by Label", "Stylized Usage by Label")
)

# Unicode count distribution
fig.add_trace(
    go.Histogram(x=df_all['unicode_count'], nbinsx=20, name="Unicode Count"),
    row=1, col=1
)

# Stylized count distribution
fig.add_trace(
    go.Histogram(x=df_all['stylized_count'], nbinsx=20, name="Stylized Count"),
    row=1, col=2
)

# Unicode usage by label
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['unicode_count']
    fig.add_trace(
        go.Box(y=data, name=f"Unicode - {label}"),
        row=2, col=1
    )

# Stylized usage by label
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['stylized_count']
    fig.add_trace(
        go.Box(y=data, name=f"Stylized - {label}"),
        row=2, col=2
    )

fig.update_layout(
    title="Unicode Character Analysis",
    height=600,
    showlegend=True
)

fig.show()

# Analyze unicode characters by label
print(f"\nUnicode character analysis by label:")

for label in df_all['label'].unique():
    label_unicode = []
    for char_list in df_all[df_all['label'] == label]['unicode_chars']:
        label_unicode.extend(char_list)
    
    label_unicode_counter = Counter(label_unicode)
    top_10_unicode = label_unicode_counter.most_common(10)
    
    print(f"\nTop 10 unicode characters for '{label}':")
    for char, count in top_10_unicode:
        print(f"  '{char}' (U+{ord(char):04X}): {count}")

# Create comparison chart for unicode characters
fig = go.Figure()

for label in df_all['label'].unique():
    label_unicode = []
    for char_list in df_all[df_all['label'] == label]['unicode_chars']:
        label_unicode.extend(char_list)
    
    label_unicode_counter = Counter(label_unicode)
    top_10_unicode = label_unicode_counter.most_common(10)
    
    chars = [item[0] for item in top_10_unicode]
    counts = [item[1] for item in top_10_unicode]
    
    fig.add_trace(
        go.Bar(x=chars, y=counts, name=f"Top 10 Unicode - {label}")
    )

fig.update_layout(
    title="Top 10 Unicode Characters by Label",
    xaxis_title="Unicode Character",
    yaxis_title="Count",
    height=500
)

fig.show()

# Special character analysis
print(f"\nSpecial character analysis:")
special_chars = df_all['comment'].str.findall(r'[^\w\s]').str.len()
df_all['special_char_count'] = special_chars

print(f"Special character statistics:")
print(df_all['special_char_count'].describe())

print(f"\nSpecial character usage by label:")
print(df_all.groupby('label')['special_char_count'].describe())

# Visualize special character analysis
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Special Character Count Distribution", "Special Character Count by Label")
)

# Special character distribution
fig.add_trace(
    go.Histogram(x=df_all['special_char_count'], nbinsx=30, name="Special Char Count"),
    row=1, col=1
)

# Special character by label
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['special_char_count']
    fig.add_trace(
        go.Box(y=data, name=f"Special Char - {label}"),
        row=1, col=2
    )

fig.update_layout(
    title="Special Character Analysis",
    height=400
)

fig.show()


In [None]:
# 7. SUMMARY & INSIGHTS
print("=== 7. SUMMARY & INSIGHTS ===")

# Create comprehensive summary
print("📊 COMPREHENSIVE EDA SUMMARY")
print("=" * 50)

# 1. Dataset Overview
print(f"\n1. DATASET OVERVIEW:")
print(f"   • Total samples: {len(df_all):,}")
print(f"   • Features: {len(df_all.columns)}")
print(f"   • Memory usage: {df_all.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. Label Distribution
label_counts = df_all['label'].value_counts()
balance_ratio = label_counts.min() / label_counts.max()
print(f"\n2. LABEL DISTRIBUTION:")
print(f"   • {label_counts.index[0]}: {label_counts.iloc[0]:,} ({label_counts.iloc[0]/len(df_all)*100:.1f}%)")
print(f"   • {label_counts.index[1]}: {label_counts.iloc[1]:,} ({label_counts.iloc[1]/len(df_all)*100:.1f}%)")
print(f"   • Balance ratio: {balance_ratio:.3f} {'✅ Balanced' if balance_ratio > 0.8 else '⚠️ Imbalanced'}")

# 3. Text Length Analysis
print(f"\n3. TEXT LENGTH ANALYSIS:")
char_stats = df_all['char_count'].describe()
word_stats = df_all['word_count'].describe()
print(f"   • Character count: {char_stats['mean']:.1f} ± {char_stats['std']:.1f} (range: {char_stats['min']:.0f}-{char_stats['max']:.0f})")
print(f"   • Word count: {word_stats['mean']:.1f} ± {word_stats['std']:.1f} (range: {word_stats['min']:.0f}-{word_stats['max']:.0f})")

# By label
for label in df_all['label'].unique():
    label_data = df_all[df_all['label'] == label]
    char_mean = label_data['char_count'].mean()
    word_mean = label_data['word_count'].mean()
    print(f"   • {label}: {char_mean:.1f} chars, {word_mean:.1f} words avg")

# 4. Character Ratio Analysis
print(f"\n4. CHARACTER RATIO ANALYSIS:")
print(f"   • Special char ratio: {df_all['special_char_ratio'].mean():.3f} ± {df_all['special_char_ratio'].std():.3f}")
print(f"   • Number ratio: {df_all['number_ratio'].mean():.3f} ± {df_all['number_ratio'].std():.3f}")
print(f"   • Stylized char ratio: {df_all['stylized_char_ratio'].mean():.3f} ± {df_all['stylized_char_ratio'].std():.3f}")

# 5. Emoji Analysis
emoji_usage = df_all['has_emoji'].mean() * 100
avg_emojis = df_all['emoji_count'].mean()
print(f"\n5. EMOJI ANALYSIS:")
print(f"   • Comments with emojis: {emoji_usage:.1f}%")
print(f"   • Average emojis per comment: {avg_emojis:.2f}")

# 6. Unicode Analysis
unicode_usage = (df_all['unicode_count'] > 0).mean() * 100
stylized_usage = (df_all['stylized_count'] > 0).mean() * 100
print(f"\n6. UNICODE ANALYSIS:")
print(f"   • Comments with unicode chars: {unicode_usage:.1f}%")
print(f"   • Comments with stylized chars: {stylized_usage:.1f}%")

# Key Insights
print(f"\n🔍 KEY INSIGHTS:")
print("=" * 50)

# Dataset balance insight
if balance_ratio > 0.8:
    print("✅ Dataset is well-balanced, suitable for training without additional balancing")
else:
    print("⚠️ Dataset is imbalanced, consider using class weights or resampling techniques")

# Text length insights
promo_data = df_all[df_all['label'] == 'promosi_judul_online']
non_promo_data = df_all[df_all['label'] == 'bukan']

if promo_data['char_count'].mean() > non_promo_data['char_count'].mean():
    print("📝 Promotional comments tend to be longer than non-promotional ones")
else:
    print("📝 Non-promotional comments tend to be longer than promotional ones")

# Emoji insights
promo_emoji = promo_data['has_emoji'].mean()
non_promo_emoji = non_promo_data['has_emoji'].mean()

if promo_emoji > non_promo_emoji:
    print(f"😀 Promotional comments use emojis more frequently ({promo_emoji*100:.1f}% vs {non_promo_emoji*100:.1f}%)")
else:
    print(f"😀 Non-promotional comments use emojis more frequently ({non_promo_emoji*100:.1f}% vs {promo_emoji*100:.1f}%)")

# Special character insights
promo_special = promo_data['special_char_ratio'].mean()
non_promo_special = non_promo_data['special_char_ratio'].mean()

if promo_special > non_promo_special:
    print(f"🔤 Promotional comments have higher special character ratio ({promo_special:.3f} vs {non_promo_special:.3f})")
else:
    print(f"🔤 Non-promotional comments have higher special character ratio ({non_promo_special:.3f} vs {promo_special:.3f})")

# Stylized character insights
promo_stylized = promo_data['stylized_char_ratio'].mean()
non_promo_stylized = non_promo_data['stylized_char_ratio'].mean()

if promo_stylized > non_promo_stylized:
    print(f"✨ Promotional comments use more stylized characters ({promo_stylized:.3f} vs {non_promo_stylized:.3f})")
else:
    print(f"✨ Non-promotional comments use more stylized characters ({non_promo_stylized:.3f} vs {promo_stylized:.3f})")

# Create final comprehensive visualization
print(f"\n📈 CREATING COMPREHENSIVE DASHBOARD...")

# Create a comprehensive dashboard
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=("Label Distribution", "Text Length by Label", "Character Ratios by Label",
                   "Emoji Usage by Label", "Unicode Usage by Label", "Special Char Usage by Label",
                   "Text Length Distribution", "Feature Correlation", "Dataset Balance"),
    specs=[[{"type": "pie"}, {"type": "box"}, {"type": "box"}],
           [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
           [{"type": "histogram"}, {"type": "heatmap"}, {"type": "bar"}]]
)

# 1. Label Distribution (Pie)
fig.add_trace(
    go.Pie(labels=label_counts.index, values=label_counts.values, name="Labels"),
    row=1, col=1
)

# 2. Text Length by Label (Box)
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['char_count']
    fig.add_trace(
        go.Box(y=data, name=f"Char Count - {label}"),
        row=1, col=2
    )

# 3. Character Ratios by Label (Box)
for label in df_all['label'].unique():
    data = df_all[df_all['label'] == label]['special_char_ratio']
    fig.add_trace(
        go.Box(y=data, name=f"Special Ratio - {label}"),
        row=1, col=3
    )

# 4. Emoji Usage by Label (Bar)
emoji_usage_by_label = df_all.groupby('label')['has_emoji'].mean() * 100
fig.add_trace(
    go.Bar(x=emoji_usage_by_label.index, y=emoji_usage_by_label.values, name="Emoji Usage %"),
    row=2, col=1
)

# 5. Unicode Usage by Label (Bar)
unicode_usage_by_label = df_all.groupby('label')['unicode_count'].mean()
fig.add_trace(
    go.Bar(x=unicode_usage_by_label.index, y=unicode_usage_by_label.values, name="Unicode Count"),
    row=2, col=2
)

# 6. Special Char Usage by Label (Bar)
special_usage_by_label = df_all.groupby('label')['special_char_count'].mean()
fig.add_trace(
    go.Bar(x=special_usage_by_label.index, y=special_usage_by_label.values, name="Special Char Count"),
    row=2, col=3
)

# 7. Text Length Distribution (Histogram)
fig.add_trace(
    go.Histogram(x=df_all['char_count'], nbinsx=50, name="Char Count Dist"),
    row=3, col=1
)

# 8. Feature Correlation (Heatmap)
correlation_features = ['char_count', 'word_count', 'special_char_ratio', 'number_ratio', 
                       'stylized_char_ratio', 'emoji_count', 'unicode_count', 'special_char_count']
corr_matrix = df_all[correlation_features].corr()

fig.add_trace(
    go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        colorscale='RdBu',
        zmid=0,
        text=np.round(corr_matrix.values, 2),
        texttemplate="%{text}",
        textfont={"size": 8}
    ),
    row=3, col=2
)

# 9. Dataset Balance (Bar)
balance_data = [balance_ratio, 1-balance_ratio]
balance_labels = ['Minority Class', 'Majority Class']
fig.add_trace(
    go.Bar(x=balance_labels, y=balance_data, name="Balance Ratio"),
    row=3, col=3
)

fig.update_layout(
    title="📊 Comprehensive EDA Dashboard - YouTube Comments Analysis",
    height=1000,
    showlegend=True
)

fig.show()

print(f"\n✅ EDA COMPLETED SUCCESSFULLY!")
print("=" * 50)
print("📋 SUMMARY:")
print(f"   • Analyzed {len(df_all):,} YouTube comments")
print(f"   • Identified key patterns between promotional and non-promotional content")
print(f"   • Generated {len(fig.data)} interactive visualizations")
print(f"   • Ready for feature engineering and model training")

print(f"\n🎯 NEXT STEPS:")
print("   1. Use character ratios as features for classification")
print("   2. Consider emoji and unicode patterns as discriminative features")
print("   3. Apply text preprocessing based on identified patterns")
print("   4. Implement feature engineering pipeline")
print("   5. Train classification models with identified features")
