In [2]:
# Mount Google Drive at /content/drive; the dataset is read from, and the result
# CSVs are written to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import os

# Fetch the NLTK stopword corpus; clean_text() reads it via stopwords.words("english").
nltk.download('stopwords')

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loading it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(raw_review):
    """Normalise one raw message into a space-separated string of lowercase words.

    Steps, in order: strip HTML markup with BeautifulSoup's "html.parser",
    replace every character that is not an ASCII letter with a space,
    lowercase and split on whitespace, drop NLTK English stopwords, and join
    the remaining words with single spaces.

    Args:
        raw_review: The raw message text. A missing value (None/NaN) is treated
            as an empty message; any other non-string value is converted with
            str() first.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    if not isinstance(raw_review, str):
        # Missing cells arrive from pandas as None/NaN and would make
        # BeautifulSoup raise; treat them as empty messages instead.
        raw_review = "" if pd.isna(raw_review) else str(raw_review)

    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    words = letters_only.lower().split()

    # The stopword set is cached: this function is applied once per row, and
    # rebuilding the set on every call is pure repeated work.
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)

file_path = '/content/drive/MyDrive/Spams.csv'

try:
    data = pd.read_csv(file_path, encoding='latin-1')
    # Keep only the label column (v1) and the message column (v2).
    data = data[['v1', 'v2']]
    data.columns = ['label', 'text']
except FileNotFoundError:
    print("File tidak ditemukan. Pastikan path file benar dan file ada di Google Drive Anda.")
    print(f"Path yang dicoba: {file_path}")
    # Release the Drive mount before propagating the error.
    drive.flush_and_unmount()
    raise

# Encode the labels before the cleaning pass so a malformed label column fails
# immediately: Series.map() turns any value other than 'ham'/'spam' into NaN,
# which would otherwise only surface later as an error inside scikit-learn.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
unknown_labels = data.loc[label_codes.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(
        f"Unexpected label values in {file_path}: {list(unknown_labels)} "
        "(expected only 'ham' or 'spam')"
    )
y = label_codes.astype(int).values

print("Memulai proses cleaning teks...")
data['cleaned_text'] = data['text'].apply(clean_text)
print("Proses cleaning teks selesai.")

# Bag-of-words features limited to the 5000 most frequent terms.
# stop_words="english" applies scikit-learn's own stopword list in addition to
# the NLTK list already removed by clean_text().
vectorizer = CountVectorizer(analyzer="word",
                           tokenizer=None,
                           preprocessor=None,
                           stop_words="english",
                           max_features=5000)

# NOTE(review): the vocabulary is fitted on the full corpus before
# run_experiments() performs any train/test split, so test messages influence
# which terms are kept -- confirm this is acceptable for the reported metrics.
# NOTE(review): .toarray() materialises a dense (n_messages, <=5000) matrix;
# the classifiers used below also accept the sparse matrix directly -- consider
# dropping the conversion if memory or runtime becomes a problem.
X = vectorizer.fit_transform(data['cleaned_text']).toarray()

def run_experiments(X, y, test_sizes=[0.2, 0.3], n_experiments=5):
    results = {}

    algorithms = {
        'Naive Bayes': MultinomialNB(),
        'SVM': SVC(kernel='linear', random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Decision Tree': DecisionTreeClassifier(random_state=42)
    }

    for algo_name, algo in algorithms.items():
        for test_size in test_sizes:
            train_size = 1 - test_size
            key = f"{algo_name} {int(train_size*100)}%:{int(test_size*100)}%"
            results[key] = {
                'precision': [],
                'recall': [],
                'f1': [],
                'accuracy': [],
                'cm': []
            }

            print(f"\nMenjalankan eksperimen untuk {key}")

            for experiment in range(n_experiments):
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, random_state=experiment)

                model = algo
                model.fit(X_train, y_train)

                y_pred = model.predict(X_test)

                results[key]['precision'].append(precision_score(y_test, y_pred))
                results[key]['recall'].append(recall_score(y_test, y_pred))
                results[key]['f1'].append(f1_score(y_test, y_pred))
                results[key]['accuracy'].append(accuracy_score(y_test, y_pred))
                results[key]['cm'].append(confusion_matrix(y_test, y_pred))

                print(f"Eksperimen {experiment+1}/{n_experiments} selesai")

    return results

# Run every algorithm / split-ratio combination with the default settings of
# run_experiments() and keep the per-experiment metrics in all_results.
print("\nMemulai semua eksperimen...")
all_results = run_experiments(X, y)
print("\nSemua eksperimen selesai!")


def save_and_display_all_results(all_results):
    all_experiments_list = []

    for key in all_results:
        algo, ratio = key.split(" ", 1)
        train_size, test_size = ratio.split(":")

        for exp_num in range(len(all_results[key]['accuracy'])):
            all_experiments_list.append({
                'Algoritma': algo,
                'Data Latih': train_size,
                'Data Uji': test_size,
                'Eksperimen': exp_num + 1,
                'Precision': all_results[key]['precision'][exp_num],
                'Recall': all_results[key]['recall'][exp_num],
                'F1-Score': all_results[key]['f1'][exp_num],
                'Accuracy': all_results[key]['accuracy'][exp_num],
                'TN': all_results[key]['cm'][exp_num][0][0],
                'FP': all_results[key]['cm'][exp_num][0][1],
                'FN': all_results[key]['cm'][exp_num][1][0],
                'TP': all_results[key]['cm'][exp_num][1][1]
            })

    all_experiments_df = pd.DataFrame(all_experiments_list)

    save_path = '/content/drive/MyDrive/semua_hasil_eksperimen.csv'
    all_experiments_df.to_csv(save_path, index=False)
    print(f"\nSemua hasil eksperimen telah disimpan di: {save_path}")

    return all_experiments_df

# Write the per-experiment table to Drive and keep it as a DataFrame.
all_experiments_df = save_and_display_all_results(all_results)

def display_detailed_results(all_results):
    print("\nDetail Hasil Setiap Eksperimen:")
    print("="*80)

    for algo in ['Naive Bayes', 'SVM', 'KNN', 'Decision Tree']:
        for size in ['80%:20%', '70%:30%']:
            key = f"{algo} {size}"

            print(f"\n{algo} - Rasio {size}")
            print("-"*60)

            for exp_num in range(5):
                print(f"\nEksperimen {exp_num+1}:")
                print(f"Precision: {all_results[key]['precision'][exp_num]:.4f}")
                print(f"Recall:    {all_results[key]['recall'][exp_num]:.4f}")
                print(f"F1-Score:  {all_results[key]['f1'][exp_num]:.4f}")
                print(f"Accuracy:  {all_results[key]['accuracy'][exp_num]:.4f}")
                print("Confusion Matrix:")
                print(all_results[key]['cm'][exp_num])

# Print every individual experiment's metrics and confusion matrix.
display_detailed_results(all_results)

def calculate_and_display_averages(all_results):
    print("\n\nRangkuman Rata-rata Hasil Eksperimen:")
    print("="*80)

    avg_results = {}
    for key in all_results:
        avg_results[key] = {
            'precision': np.mean(all_results[key]['precision']),
            'recall': np.mean(all_results[key]['recall']),
            'f1': np.mean(all_results[key]['f1']),
            'accuracy': np.mean(all_results[key]['accuracy']),
            'cm': np.mean(all_results[key]['cm'], axis=0)
        }

    for algo in ['Naive Bayes', 'SVM', 'KNN', 'Decision Tree']:
        for size in ['80%:20%', '70%:30%']:
            key = f"{algo} {size}"

            print(f"\n{algo} - Rasio {size}")
            print("-"*60)
            print(f"Rata-rata Precision: {avg_results[key]['precision']:.4f}")
            print(f"Rata-rata Recall:    {avg_results[key]['recall']:.4f}")
            print(f"Rata-rata F1-Score:  {avg_results[key]['f1']:.4f}")
            print(f"Rata-rata Accuracy:  {avg_results[key]['accuracy']:.4f}")
            print("Rata-rata Confusion Matrix:")
            print(avg_results[key]['cm'])

    return avg_results

# Print the per-configuration averages and keep them in avg_results.
avg_results = calculate_and_display_averages(all_results)

def calculate_averages(results):
    """Collapse each configuration's per-experiment metrics into their means.

    Args:
        results: dict whose values hold the lists ``'precision'``,
            ``'recall'``, ``'f1'``, ``'accuracy'`` and ``'cm'``.

    Returns:
        dict with the same keys as ``results``; every value maps the four
        scalar metric names to their mean and ``'cm'`` to the element-wise
        mean of the confusion matrices.
    """
    scalar_metrics = ('precision', 'recall', 'f1', 'accuracy')
    summary = {}

    for name, runs in results.items():
        entry = {metric: np.mean(runs[metric]) for metric in scalar_metrics}
        # Average across experiments (axis 0) so the matrix shape is kept.
        entry['cm'] = np.mean(runs['cm'], axis=0)
        summary[name] = entry

    return summary

# Rebinds avg_results with the same per-configuration means that
# calculate_and_display_averages() already returned above, this time without printing.
avg_results = calculate_averages(all_results)

def print_results(avg_results):
    print("\nHasil Rata-rata Eksperimen:")
    print("="*70)

    for algo in ['Naive Bayes', 'SVM', 'KNN', 'Decision Tree']:
        for size in ['80%:20%', '70%:30%']:
            key = f"{algo} {size}"
            print(f"\n{algo} - Rasio {size}")
            print("-"*50)
            print(f"Precision: {avg_results[key]['precision']:.4f}")
            print(f"Recall:    {avg_results[key]['recall']:.4f}")
            print(f"F1-Score:  {avg_results[key]['f1']:.4f}")
            print(f"Accuracy:  {avg_results[key]['accuracy']:.4f}")
            print("\nConfusion Matrix (rata-rata):")
            print(avg_results[key]['cm'])

# Print the averaged metrics held in avg_results.
print_results(avg_results)

def save_results_to_drive(avg_results, filename='hasil_eksperimen.csv'):
    results_list = []

    for key in avg_results:
        algo, ratio = key.split(" ", 1)
        train_size, test_size = ratio.split(":")

        results_list.append({
            'Algoritma': algo,
            'Data Latih': train_size,
            'Data Uji': test_size,
            'Precision': avg_results[key]['precision'],
            'Recall': avg_results[key]['recall'],
            'F1-Score': avg_results[key]['f1'],
            'Accuracy': avg_results[key]['accuracy']
        })

    results_df = pd.DataFrame(results_list)

    save_path = f'/content/drive/MyDrive/{filename}'
    results_df.to_csv(save_path, index=False)
    print(f"\nHasil eksperimen telah disimpan di: {save_path}")

    return results_df

# Persist the averaged metrics, then echo the summary table.
results_df = save_results_to_drive(avg_results)

print("\nTabel Ringkasan Hasil:")
print(results_df)

# Select the configuration with the highest mean F1 score; on a tie the first
# one in iteration order wins. best_model stays a (key, metrics) pair.
best_key = max(avg_results, key=lambda name: avg_results[name]['f1'])
best_model = (best_key, avg_results[best_key])
best_name, best_scores = best_model

print("\nModel Terbaik:")
print(f"Algoritma: {best_name}")
print(f"F1-Score: {best_scores['f1']:.4f}")
print(f"Accuracy: {best_scores['accuracy']:.4f}")

# Flush pending writes and release the Drive mount now that all files are saved.
drive.flush_and_unmount()
print("\nGoogle Drive telah di-unmount.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Memulai proses cleaning teks...
Proses cleaning teks selesai.

Memulai semua eksperimen...

Menjalankan eksperimen untuk Naive Bayes 80%:20%
Eksperimen 1/5 selesai
Eksperimen 2/5 selesai
Eksperimen 3/5 selesai
Eksperimen 4/5 selesai
Eksperimen 5/5 selesai

Menjalankan eksperimen untuk Naive Bayes 70%:30%
Eksperimen 1/5 selesai
Eksperimen 2/5 selesai
Eksperimen 3/5 selesai
Eksperimen 4/5 selesai
Eksperimen 5/5 selesai

Menjalankan eksperimen untuk SVM 80%:20%
Eksperimen 1/5 selesai
Eksperimen 2/5 selesai
Eksperimen 3/5 selesai
Eksperimen 4/5 selesai
Eksperimen 5/5 selesai

Menjalankan eksperimen untuk SVM 70%:30%
Eksperimen 1/5 selesai
Eksperimen 2/5 selesai
Eksperimen 3/5 selesai
Eksperimen 4/5 selesai
Eksperimen 5/5 selesai

Menjalankan eksperimen untuk KNN 80%:20%
Eksperimen 1/5 selesai
Eksperimen 2/5 selesai
Eksperimen 3/5 selesai
Eksperimen 4/5 selesai
Eksperimen 5/5 selesai

Menjalankan eksperimen untuk KNN 70%:30%
Eksperimen 1/5 selesai
Eksperimen 2/5 selesai
Eksperimen 3/5 seles

In [None]:
# Utility cell: rewrite Spams.csv in place so that it contains exactly the two
# columns 'v1' (label) and 'v2' (message text) expected by the loading code.
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Spams.csv'

try:
    data = pd.read_csv(file_path, encoding='latin-1')

    if {'label', 'text'}.issubset(data.columns):
        # Already-renamed file: map the names back to the raw v1/v2 schema.
        data = data.rename(columns={'label': 'v1', 'text': 'v2'})
    elif not {'v1', 'v2'}.issubset(data.columns):
        # Unrecognised header: treat the first two columns as label and text.
        data.columns = ['v1', 'v2'] + list(data.columns[2:])

    # Any additional columns are dropped from the rewritten file.
    data = data[['v1', 'v2']]

    # Write with the same encoding the file is read with. to_csv() defaults to
    # UTF-8, so a later read with encoding='latin-1' would turn every non-ASCII
    # character into mojibake, and each re-run of this cell would compound it.
    data.to_csv(file_path, index=False, encoding='latin-1')
    print("File telah disesuaikan dengan format 'v1' dan 'v2'")

except Exception as e:
    # Best-effort cell: report the problem instead of aborting the notebook.
    print(f"Terjadi error: {str(e)}")
    print("Pastikan file ada di path yang benar dan formatnya sesuai")