In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
# 1. Load the dataset (expected layout: column v1 = label, column v2 = message text).
# Change file_path if the CSV has a different name or location.
file_path = "spam.csv"
try:
    df = pd.read_csv(file_path, encoding='latin-1')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook, exit() asks the kernel
    # to shut down, which discards all state instead of reporting the problem
    # in the cell output. Re-raising keeps the kernel alive and still stops
    # Run All at this cell.
    raise FileNotFoundError(
        f"ERROR: File '{file_path}' tidak ditemukan. Mohon pastikan file sudah diunggah."
    ) from err

# Keep only v1 (label) and v2 (message). The explicit copy makes the frame
# independent of the raw read, so adding a column below never writes to a view.
df = df[['v1', 'v2']].copy()
df.columns = ['label', 'message']

print("--- Informasi Dataset Awal ---")
print(f"Jumlah data: {len(df)}")
print("Distribusi kelas:\n", df['label'].value_counts())
print("-" * 40)

# Encode the string labels as integers. LabelEncoder orders classes by sorted
# value, so with the labels shown above this yields ham=0, spam=1.
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

X = df['message']
y = df['label_encoded']

# Split the data: 80% training, 20% testing. stratify=y keeps the ham/spam
# ratio the same in both splits; random_state fixes the shuffle for re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Data Training: {len(X_train)} baris")
print(f"Data Testing: {len(X_test)} baris")

--- Informasi Dataset Awal ---
Jumlah data: 5572
Distribusi kelas:
 label
ham     4825
spam     747
Name: count, dtype: int64
----------------------------------------
Data Training: 4457 baris
Data Testing: 1115 baris


In [3]:
# 2. CountVectorizer features with stop words removed (Question 1)

# Use the built-in 'english' stop-word list, because common spam datasets
# are written in English.
count_vectorizer = CountVectorizer(stop_words='english')

# Fit the vocabulary on the training split only, then apply that same
# vocabulary to the test split.
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# Create and train the Multinomial Naive Bayes model in one step
# (fit returns the fitted estimator itself).
mnb_count = MultinomialNB().fit(X_train_counts, y_train)

# Predict on the test split and evaluate.
y_pred_count = mnb_count.predict(X_test_counts)
accuracy_count = accuracy_score(y_test, y_pred_count)
report_count = classification_report(
    y_test, y_pred_count, target_names=le.classes_
)

print("--- Evaluasi Model MNB dengan CountVectorizer ---")
print(f"Akurasi: {accuracy_count:.4f}")
print("\nClassification Report:\n", report_count)
print("-" * 50)

# Start the accuracy table used later for the method comparison.
results = {'CountVectorizer': accuracy_count}

--- Evaluasi Model MNB dengan CountVectorizer ---
Akurasi: 0.9839

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

--------------------------------------------------


In [4]:
# 3. TF-IDF features with stop words removed (Question 2)

# TfidfVectorizer configured with the built-in 'english' stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit vocabulary and IDF weights on the training split only, then apply
# them unchanged to the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Create and train the Multinomial Naive Bayes model in one step
# (fit returns the fitted estimator itself).
mnb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict on the test split and evaluate.
y_pred_tfidf = mnb_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
report_tfidf = classification_report(
    y_test, y_pred_tfidf, target_names=le.classes_
)

print("--- Evaluasi Model MNB dengan TF-IDF Vectorizer ---")
print(f"Akurasi: {accuracy_tfidf:.4f}")
print("\nClassification Report:\n", report_tfidf)
print("-" * 50)

# Record the accuracy for the method comparison.
results['TF-IDF'] = accuracy_tfidf

--- Evaluasi Model MNB dengan TF-IDF Vectorizer ---
Akurasi: 0.9686

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

--------------------------------------------------


In [5]:
# 4. Compare the results and draw a conclusion (Questions 3 & 4)

print("--- Perbandingan Akurasi Model ---")
# One row per vectorization method, best accuracy first.
comparison_df = (
    pd.DataFrame(list(results.items()), columns=['Vectorization Method', 'Accuracy'])
    .sort_values(by='Accuracy', ascending=False)
    .reset_index(drop=True)
)
print(comparison_df)

# After the descending sort, row 0 holds the best-scoring method.
best_method = comparison_df.iloc[0]['Vectorization Method']
best_accuracy = comparison_df.iloc[0]['Accuracy']

# NOTE(review): the ranking uses accuracy only. The class distribution printed
# when the data was loaded is imbalanced (far more ham than spam), so the
# per-class recall in the classification reports should be read alongside it.
print("\n--- Kesimpulan (Jawaban Soal No. 4) ---")
# Report the file that was actually loaded rather than a hard-coded name.
print(f"Fitur terbaik pada kasus data '{file_path}' adalah: {best_method}")
print(f"Dengan Akurasi Tertinggi: {best_accuracy:.4f}")

# General rationale per method, keyed by the names stored in `results`, so an
# unrecognised method name never receives another method's explanation.
# The TF-IDF text is corrected: TF-IDF is computed without class labels, so it
# cannot weight words by whether they occur in spam or ham documents.
explanations = {
    'CountVectorizer': "\nAlasan: CountVectorizer lebih unggul karena dalam kasus Naive Bayes pada klasifikasi teks pendek (seperti SMS), frekuensi mentah kata (Count) seringkali menjadi indikator yang lebih kuat. Naive Bayes cenderung bekerja sangat baik dengan data Count.",
    'TF-IDF': "\nAlasan: TF-IDF lebih unggul karena ia menurunkan bobot kata-kata yang muncul di banyak dokumen (kurang informatif) dan menaikkan bobot kata-kata yang jarang muncul di seluruh korpus. TF-IDF tidak menggunakan label kelas, sehingga pembobotan ini membantu model fokus pada kata-kata yang lebih khas tanpa mengetahui mana spam dan mana ham.",
}
explanation = explanations.get(best_method)
if explanation is not None:
    print(explanation)

--- Perbandingan Akurasi Model ---
  Vectorization Method  Accuracy
0      CountVectorizer  0.983857
1               TF-IDF  0.968610

--- Kesimpulan (Jawaban Soal No. 4) ---
Fitur terbaik pada kasus data 'spam.csv' adalah: CountVectorizer
Dengan Akurasi Tertinggi: 0.9839

Alasan: CountVectorizer lebih unggul karena dalam kasus Naive Bayes pada klasifikasi teks pendek (seperti SMS), frekuensi mentah kata (Count) seringkali menjadi indikator yang lebih kuat. Naive Bayes cenderung bekerja sangat baik dengan data Count.
