# Tugas 1

## 1. Buatlah model klasifikasi dengan menggunakan SVM untuk data suara, `voice.csv`.

## Import library

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV and image files read below are reachable (Colab runtime).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Load



In [46]:
# Read the voice dataset from the mounted Drive folder into a DataFrame.
voice_csv_path = '/content/drive/MyDrive/Machine Learning/voice.csv'
data = pd.read_csv(voice_csv_path)

# Preprocessing

In [47]:
# Separate the feature columns from the target; the 'label' column is assumed to be the class.
# `columns=` already names the axis to drop from, so no additional `axis` argument is passed.
X = data.drop(columns='label')
y = data['label']

## Splitting data

In [48]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Model

In [49]:
# Fit a linear-kernel support vector classifier on the standardized training features.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Evaluasi

In [50]:
# Predict the held-out split, then report overall accuracy and the per-class metrics.
y_pred = svm_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, y_pred)
svm_report = classification_report(y_test, y_pred)

print(f"Akurasi : {svm_accuracy * 100:.2f}%")
print("Classification Report:\n", svm_report)

Akurasi : 97.63%
Classification Report:
               precision    recall  f1-score   support

      female       0.96      0.99      0.98       297
        male       0.99      0.97      0.98       337

    accuracy                           0.98       634
   macro avg       0.98      0.98      0.98       634
weighted avg       0.98      0.98      0.98       634



## 2. Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan:

  1. Menggunakan data `spam.csv`
  2. Fitur `CountVectorizer` dengan mengaktifkan `stop_words`
  3. Evaluasi hasilnya

## Data Load

In [51]:
# Load the spam dataset, keep only columns v1 and v2, and rename them to label / text.
spam_csv_path = '/content/drive/MyDrive/Machine Learning/spam.csv'
data = (
    pd.read_csv(spam_csv_path, encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# The text column is the model input; the label column is the target.
X, y = data['text'], data['label']

## Splitting data & Stop Words

In [52]:
# Hold out 20% of the rows for testing (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Token-count features with English stop words removed. The vocabulary is
# fitted on the training texts and then reused unchanged for the test texts.
vectorizer = CountVectorizer(stop_words='english')
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)


## Model

In [53]:
# Train Multinomial Naive Bayes on the CountVectorizer features of the training split.
nb_model = MultinomialNB()
nb_model.fit(X_train_cv, y_train)

## Evaluasi

In [54]:
# Predict the held-out split with the count-based model and report its metrics.
y_pred_cv = nb_model.predict(X_test_cv)

cv_accuracy = accuracy_score(y_test, y_pred_cv)
cv_report = classification_report(y_test, y_pred_cv)

print(f"Accuracy (Count Vectorizer): {cv_accuracy * 100:.2f}%")
print("Classification Report:\n", cv_report)

Accuracy (Count Vectorizer): 98.39%
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## 3. Buatlah model klasifikasi Multinomial Naive Bayes dengan ketentuan:

  1. Menggunakan data `spam.csv`

  2. Fitur `TF-IDF` dengan mengaktifkan `stop_words`

  3. Evaluasi hasilnya dan bandingkan dengan hasil pada Tugas no 2.

  4. Berikan kesimpulan fitur mana yang terbaik pada kasus data `spam.csv`

## Data Load

In [55]:
# Reload the spam dataset, keep only columns v1 and v2, and rename them to label / text.
spam_csv_path = '/content/drive/MyDrive/Machine Learning/spam.csv'
data = (
    pd.read_csv(spam_csv_path, encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

# The text column is the model input; the label column is the target.
X, y = data['text'], data['label']

## Splitting data & Stop Words

In [56]:
# Hold out 20% of the rows for testing (fixed seed for a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features with English stop words removed. The vocabulary and IDF
# weights are fitted on the training texts and reused for the test texts.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)



## Model

In [57]:
# Train a separate Multinomial Naive Bayes model on the TF-IDF features of the training split.
nb_model_tfidf = MultinomialNB()
nb_model_tfidf.fit(X_train_tfidf, y_train)

## Evaluasi

In [58]:
# Predict the held-out split with the TF-IDF model and report its metrics.
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)

tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
tfidf_report = classification_report(y_test, y_pred_tfidf)

print(f"Accuracy (TF-IDF): {tfidf_accuracy * 100:.2f}%")
print("Classification Report (TF-IDF):\n", tfidf_report)

Accuracy (TF-IDF): 96.68%
Classification Report (TF-IDF):
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



## Perbandingan dengan no 2

In [59]:
# Print both models' metrics one after the other, separated by a blank line.
# Both prediction arrays are scored against the current y_test; the count-based
# and TF-IDF cells split the same data with the same test_size and random_state.
comparison_rows = [
    ("Accuracy (Count Vectorizer)", "Classification Report:\n", y_pred_cv),
    ("Accuracy (TF-IDF)", "Classification Report (TF-IDF):\n", y_pred_tfidf),
]

for position, (accuracy_title, report_title, predictions) in enumerate(comparison_rows):
    if position:
        print()
    print(f"{accuracy_title}: {accuracy_score(y_test, predictions) * 100:.2f}%")
    print(report_title, classification_report(y_test, predictions))

Accuracy (Count Vectorizer): 98.39%
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Accuracy (TF-IDF): 96.68%
Classification Report (TF-IDF):
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



## Kesimpulan Hasil
Model `TF-IDF` menunjukkan `precision` yang sangat baik untuk kelas spam (1.00), tetapi memiliki `precision` yang lebih rendah untuk kelas ham (0.96). Di sisi lain, `Count Vectorizer` memiliki `precision` yang lebih seimbang antara kedua kelas. <br><br>
`Count Vectorizer` memiliki `recall` yang lebih baik untuk kelas spam (0.92) dibandingkan `TF-IDF` (0.75). Namun, `TF-IDF` memiliki `recall` yang sempurna untuk kelas ham (1.00). <br><br>
`F1-Score` untuk spam lebih tinggi pada `Count Vectorizer` (0.94) dibandingkan `TF-IDF` (0.86). Ini menunjukkan bahwa `Count Vectorizer` lebih baik dalam menangani trade-off antara `precision` dan `recall`. <br><br>
Model dengan `Count Vectorizer` memiliki akurasi yang lebih tinggi (98.39%) dibandingkan model dengan `TF-IDF` (96.68%). Ini menunjukkan bahwa Count Vectorizer lebih efektif dalam menangani data ini.



# Tugas 2

## Buatlah model klasifikasi dengan menggunakan SVM berdasarkan data pada Percobaan 5 dengan menggunakan fitur histogram.

## Import Library

In [60]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from pathlib import Path
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

## Load Data

In [61]:
# Root folders of the test and training images on the mounted Drive.
data_test = '/content/drive/MyDrive/Machine Learning/images/test/'
data_train = '/content/drive/MyDrive/Machine Learning/images/training/'

def load_dataset(img_dir):
    """Load every ``*.jpg`` file under ``img_dir`` together with its class label.

    ``img_dir`` is expected to contain one sub-directory per class. The name
    of a sub-directory becomes the label of every ``*.jpg`` file found
    directly inside it.

    Parameters
    ----------
    img_dir : str or path-like
        Root directory that holds the per-class sub-directories.

    Returns
    -------
    list of tuple
        ``(image, label)`` pairs, where ``image`` is the value returned by
        ``mpimg.imread`` for the file and ``label`` is the sub-directory name.
        Pairs are ordered by sub-directory path and then by file path. The
        list is empty when nothing matches.
    """
    img_list = []

    # Sort so the result does not depend on the file system's listing order.
    for class_dir in sorted(Path(img_dir).glob('*')):
        # Entries that are not directories carry no class label; skip them.
        if not class_dir.is_dir():
            continue

        # Path.name is the final path component regardless of the platform's
        # separator, unlike splitting the string form on '/'.
        label = class_dir.name

        for file in sorted(class_dir.glob('*.jpg')):
            img = mpimg.imread(file)

            if img is not None:
                img_list.append((img, label))

    return img_list

# Read both image folders into memory; each result is the list of (image, label) pairs built by load_dataset.
train_data = load_dataset(data_train)
test_data = load_dataset(data_test)


## Ekstraksi Fitur

In [62]:
def extract_histogram(image, bins=(8, 8, 8)):
    """Build a per-channel intensity histogram feature vector for one image.

    Each of the first three channels is histogrammed separately over the
    value range 0-256 and the three histograms are concatenated.

    Parameters
    ----------
    image : numpy.ndarray
        Either an ``(H, W, C)`` array with ``C >= 3`` (only the first three
        channels are used) or an ``(H, W)`` single-channel array. A 2-D array
        is treated as if the same plane were present in all three channels,
        so the feature length matches that of colour images.
        NOTE(review): the fixed 0-256 range assumes 8-bit pixel values -
        confirm if inputs other than JPEG are ever passed.
    bins : int or sequence of int, optional
        Number of bins per channel. A single int applies the same count to
        every channel. Defaults to ``(8, 8, 8)``.

    Returns
    -------
    numpy.ndarray
        1-D array of raw (unnormalized) counts with length
        ``bins[0] + bins[1] + bins[2]``.
    """
    # Accept a single bin count as shorthand for "same count on every channel".
    if isinstance(bins, int):
        bins = (bins,) * 3

    # A 2-D array has no channel axis, so reuse its only plane three times.
    if image.ndim == 2:
        planes = [image] * 3
    else:
        planes = [image[:, :, channel] for channel in range(3)]

    channel_counts = [
        np.histogram(plane, bins=channel_bins, range=(0, 256))[0]
        for plane, channel_bins in zip(planes, bins)
    ]

    # Concatenate the per-channel histograms into one flat feature vector.
    return np.concatenate(channel_counts)

## Data Splitting

In [63]:
# Turn every training image into its histogram feature vector and collect the labels.
X_train = [extract_histogram(picture) for picture, _ in train_data]
y_train = [tag for _, tag in train_data]

# Same for the test images.
X_test = [extract_histogram(picture) for picture, _ in test_data]
y_test = [tag for _, tag in test_data]

# Map the string labels to integers; the mapping is learned from the training labels only.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

##  Model

In [64]:
# RBF-kernel SVM with C=0.1, trained on the histogram feature vectors and the encoded labels.
# NOTE(review): scikit-learn documents SVC's random_state as affecting only probability
# estimates, which are not enabled here - confirm whether the argument is needed.
model = SVC(kernel='rbf', C=0.1, random_state=50)

model.fit(X_train, y_train_enc)


## Evaluasi

In [65]:
def _print_evaluation(y_true, y_pred, title_suffix="", accuracy_label="Accuracy"):
    """Print the confusion matrix, classification report and accuracy for one split."""
    print(f"Confusion Matrix{title_suffix}:")
    print(confusion_matrix(y_true, y_pred))
    print(f"\nClassification Report{title_suffix}:")
    print(classification_report(y_true, y_pred))
    print(f"\n{accuracy_label}: {accuracy_score(y_true, y_pred) * 100:.2f}%")


# Metrics on the test images.
y_pred_test = model.predict(X_test)
_print_evaluation(y_test_enc, y_pred_test)

# Metrics on the training images, for comparison against the test metrics.
y_train_pred = model.predict(X_train)
_print_evaluation(
    y_train_enc,
    y_train_pred,
    title_suffix=" (Training Data)",
    accuracy_label="Training Accuracy",
)


Confusion Matrix:
[[78  2]
 [ 7 73]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95        80
           1       0.97      0.91      0.94        80

    accuracy                           0.94       160
   macro avg       0.95      0.94      0.94       160
weighted avg       0.95      0.94      0.94       160


Accuracy: 94.38%
Confusion Matrix (Training Data):
[[120   0]
 [  2 118]]

Classification Report (Training Data):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       120
           1       1.00      0.98      0.99       120

    accuracy                           0.99       240
   macro avg       0.99      0.99      0.99       240
weighted avg       0.99      0.99      0.99       240


Training Accuracy: 99.17%
