# In [None]:
# SCAM ALERT: Analisis & Visualisasi
# Jupyter Notebook untuk Tugas Besar Kecerdasan Buatan

# ## 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from main import ScamDetector
import warnings
warnings.filterwarnings('ignore')

# Set style: matplotlib's seaborn-like dark grid plus seaborn's "husl" palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# ## 2. Load and Explore Dataset
# The emoji in the messages below were stored as mojibake (UTF-8 bytes shown
# as Mac Roman); they are restored to the intended characters here.
print("🔄 Loading dataset...")
detector = ScamDetector()
# create_dataset() is defined in main.ScamDetector; the rest of this script
# reads the 'label' and 'message' columns of the frame it returns.
df = detector.create_dataset()

print("\n📊 Dataset Overview:")
print(f"Total pesan: {len(df)}")
print("\n📈 Distribusi Label:")
print(df['label'].value_counts())
print("\n📋 Sample Data:")
print(df.head(10))

# ## 3. Visualisasi Distribusi Data

# ### 3.1 Label Distribution
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# Count once and reuse for both charts so they always agree.
label_counts = df['label'].value_counts()

# value_counts() orders labels by frequency, so a positional color list would
# paint 'legitimate' red whenever it outnumbers 'scam'. Look the color up by
# label instead (red = scam, green = legitimate); unknown labels get gray.
label_colors = {'scam': '#ff6b6b', 'legitimate': '#51cf66'}
colors = [label_colors.get(label, '#868e96') for label in label_counts.index]

# Bar chart (rot=0 keeps the category names horizontal)
label_counts.plot(kind='bar', ax=ax[0], color=colors, rot=0)
ax[0].set_title('Distribusi Label Pesan', fontsize=14, fontweight='bold')
ax[0].set_xlabel('Kategori')
ax[0].set_ylabel('Jumlah Pesan')

# Pie chart
label_counts.plot(kind='pie', ax=ax[1], autopct='%1.1f%%',
                  colors=colors, startangle=90)
ax[1].set_title('Proporsi Data', fontsize=14, fontweight='bold')
ax[1].set_ylabel('')  # drop the default y-label pandas puts on pie charts

plt.tight_layout()
plt.show()

# ## 4. Analisis Text Length

# Compute message length in characters and in whitespace-separated words.
# The .str accessor yields NaN for a missing message instead of raising
# TypeError the way apply(len) does.
df['message_length'] = df['message'].str.len()
df['word_count'] = df['message'].str.split().str.len()

# The emoji below was stored as mojibake; restored to the intended character.
print("\n📏 Statistik Panjang Pesan:")
print(df.groupby('label')[['message_length', 'word_count']].describe())

# ### 4.1 Message Length Visualization
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# Box plot - Character length
df.boxplot(column='message_length', by='label', ax=ax[0])
ax[0].set_title('Distribusi Panjang Karakter per Kategori')
ax[0].set_xlabel('Kategori')
ax[0].set_ylabel('Jumlah Karakter')

# Box plot - Word count
df.boxplot(column='word_count', by='label', ax=ax[1])
ax[1].set_title('Distribusi Jumlah Kata per Kategori')
ax[1].set_xlabel('Kategori')
ax[1].set_ylabel('Jumlah Kata')

# DataFrame.boxplot(by=...) adds its own figure-level title
# ("Boxplot grouped by label"), which clashes with the per-axes titles above.
fig.suptitle('')

plt.tight_layout()
plt.show()

# ## 5. Word Cloud Analysis

# ### 5.1 Word Cloud untuk SCAM Messages
# Concatenate every message of each class into one long document.
scam_mask = df['label'] == 'scam'
legit_mask = df['label'] == 'legitimate'
scam_text = ' '.join(df.loc[scam_mask, 'message'])
legitimate_text = ' '.join(df.loc[legit_mask, 'message'])

fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Both clouds share the same canvas settings; only the colormap differs
# (red for scam, green for legitimate).
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud_scam = WordCloud(colormap='Reds', **cloud_options).generate(scam_text)
wordcloud_legit = WordCloud(colormap='Greens', **cloud_options).generate(legitimate_text)

cloud_panels = (
    (ax[0], wordcloud_scam, 'Word Cloud - SCAM Messages'),
    (ax[1], wordcloud_legit, 'Word Cloud - LEGITIMATE Messages'),
)
for panel, cloud, heading in cloud_panels:
    panel.imshow(cloud, interpolation='bilinear')
    panel.set_title(heading, fontsize=14, fontweight='bold')
    panel.axis('off')  # a word cloud is an image; hide ticks and frame

plt.tight_layout()
plt.show()

# ## 6. Top Keywords Analysis

def get_top_keywords(text, n=15, *, stop_words='english'):
    """Return the most frequent terms in ``text`` together with their counts.

    Args:
        text: The document to analyse, as a single string.
        n: Maximum number of terms to keep (passed to ``CountVectorizer`` as
            ``max_features``).
        stop_words: Passed to ``CountVectorizer`` unchanged. Defaults to
            ``'english'`` to match the previous behavior; pass a list of
            words (e.g. Indonesian stop words) or ``None`` to filter
            differently.

    Returns:
        A dict mapping each kept term to its occurrence count in ``text``.
        Empty or whitespace-only input yields an empty dict.

    Raises:
        ValueError: Raised by ``CountVectorizer`` when no term survives
            tokenization and stop-word removal.
    """
    # CountVectorizer cannot build a vocabulary from nothing, so handle blank
    # input here rather than letting it fail.
    if not text or not text.strip():
        return {}

    vectorizer = CountVectorizer(max_features=n, stop_words=stop_words)
    term_matrix = vectorizer.fit_transform([text])
    keywords = vectorizer.get_feature_names_out()
    # Only one document was fitted, so row 0 holds all the counts.
    counts = term_matrix.toarray()[0]
    return dict(zip(keywords, counts))

# Extract the 20 most frequent terms for each class.
scam_keywords = get_top_keywords(scam_text, 20)
legit_keywords = get_top_keywords(legitimate_text, 20)

# ### 6.1 Top Keywords Visualization
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

keyword_panels = (
    (ax[0], scam_keywords, '#ff6b6b', 'Top 15 Keywords - SCAM Messages'),
    (ax[1], legit_keywords, '#51cf66', 'Top 15 Keywords - LEGITIMATE Messages'),
)
for panel, keyword_counts, bar_color, heading in keyword_panels:
    # Sort ascending and keep the last 15, so the most frequent term ends up
    # at the top of the horizontal bar chart.
    ranked = pd.Series(keyword_counts).sort_values(ascending=True)
    ranked.tail(15).plot(kind='barh', ax=panel, color=bar_color)
    panel.set_title(heading, fontsize=14, fontweight='bold')
    panel.set_xlabel('Frequency')

plt.tight_layout()
plt.show()

# ## 7. Training Models

# The emoji below was stored as mojibake; restored to the intended character.
print("\n🤖 Training models...")
# train_models() is defined in main.ScamDetector; afterwards the script reads
# per-model results from detector.models.
X_test_vec, y_test = detector.train_models(df)

# ## 8. Model Evaluation & Comparison

# ### 8.1 Accuracy Comparison
accuracies = {name: result['accuracy']
              for name, result in detector.models.items()}

fig, ax = plt.subplots(figsize=(10, 6))
# Pass real lists rather than dict views so every matplotlib version can
# convert them.
bars = ax.bar(list(accuracies.keys()), list(accuracies.values()),
              color=['#4c6ef5', '#51cf66', '#ff6b6b'])
ax.set_title('Model Accuracy Comparison', fontsize=16, fontweight='bold')
ax.set_ylabel('Accuracy')

# Zoom in on the 85-100% band when every model is above it. If any model is
# at or below 85%, lower the floor so its bar and label stay visible.
lowest_accuracy = min(accuracies.values(), default=1.0)
y_floor = 0.85 if lowest_accuracy > 0.85 else max(0.0, lowest_accuracy - 0.05)
ax.set_ylim([y_floor, 1.0])

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., height,
            f'{height:.2%}',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# ### 8.2 Detailed Performance Metrics
from sklearn.metrics import classification_report

# Print scikit-learn's classification report for every trained model, using
# the true labels and predictions stored with each entry of detector.models.
# The emoji below were stored as mojibake; restored to the intended characters.
print("\n📊 DETAILED PERFORMANCE METRICS:")
print("=" * 70)

for name, result in detector.models.items():
    print(f"\n🤖 {name}")
    print("-" * 70)
    print(classification_report(result['y_test'], result['predictions']))

# ### 8.3 Confusion Matrices
# evaluate_models() is defined in main.ScamDetector; presumably it renders
# the confusion matrices named in the heading above (confirm in main.py).
detector.evaluate_models()

# ## 9. Feature Importance (Random Forest)

# Get feature names and importances
# Fetch the fitted Random Forest and the vocabulary of the shared vectorizer.
rf_model = detector.models['Random Forest']['model']
feature_names = detector.vectorizer.get_feature_names_out()
importances = rf_model.feature_importances_

# argsort is ascending, so the last 20 positions are the 20 largest
# importances, ordered from least to most important.
indices = np.argsort(importances)[-20:]
features = tuple(feature_names[i] for i in indices)
scores = tuple(importances[i] for i in indices)
top_features = list(zip(features, scores))

fig, ax = plt.subplots(figsize=(12, 8))
y_pos = np.arange(len(features))

# Horizontal bars: the ascending order puts the most important feature on top.
ax.barh(y_pos, scores, color='#ff6b6b')
ax.set_yticks(y_pos)
ax.set_yticklabels(features)
ax.set_xlabel('Importance Score')
ax.set_title(
    'Top 20 Most Important Features (Random Forest)',
    fontsize=14,
    fontweight='bold',
)

plt.tight_layout()
plt.show()

# ## 10. Testing dengan Contoh Real

# Hand-written sample messages paired with the class we expect for each.
test_messages = [
    ("SELAMAT! Anda menang undian 100 juta! Transfer admin 500rb", "Scam Example"),
    ("Meeting besok jam 2 siang. Jangan lupa bawa dokumen", "Legitimate Example"),
    ("URGENT! Akun bank akan diblokir. Klik link ini sekarang!", "Scam Example"),
    ("Invoice sudah dikirim ke email. Mohon dicek ya", "Legitimate Example"),
]

# The emoji below were stored as mojibake; restored to the intended characters.
print("\n🧪 TESTING DENGAN CONTOH REAL:")
print("=" * 70)

results_data = []
for msg, label in test_messages:
    # predict() is defined in main.ScamDetector; this script reads the
    # 'is_scam' and 'confidence' keys of its result.
    result = detector.predict(msg)

    # Format once so the summary table and the per-message printout agree.
    verdict = 'SCAM' if result['is_scam'] else 'SAFE'
    confidence = f"{result['confidence']:.2f}%"

    results_data.append({
        # Truncate long messages so the summary table stays readable.
        'Message': (msg[:50] + '...') if len(msg) > 50 else msg,
        'Expected': label,
        'Prediction': verdict,
        'Confidence': confidence,
    })

    print(f"\n📩 Message: {msg}")
    print(f"   Expected: {label}")
    print(f"   Predicted: {verdict} (Confidence: {confidence})")

# Create results dataframe
results_df = pd.DataFrame(results_data)
print("\n📊 Summary:")
print(results_df.to_string(index=False))

# ## 11. Save Model
# Persist the trained detector. The path is named once so the summary below
# cannot drift from the file actually written.
model_path = 'scam_detector_model.pkl'
detector.save_model(model_path)

# Final recap. The emoji and bullet characters were stored as mojibake;
# restored to the intended characters.
print("\n" + "=" * 70)
print("✅ ANALISIS SELESAI!")
print("=" * 70)
print("\n📝 Summary:")
print(f"   • Total pesan dianalisis: {len(df)}")
print(f"   • Best model accuracy: {max(accuracies.values()):.2%}")
print(f"   • Model disimpan: {model_path}")
print("\n🎓 Project ini siap untuk dipresentasikan!")
print("=" * 70)