# Klasifikasi Penerimaan Murid Prasekolah menggunakan Support Vector Machine (SVM) dan Neural Network (NN)

KELOMPOK 3
- 2210511046 Hanifah Az-Zahra
- 2210511054 Dinda Cantika Putri
- 2210511070 Choirunnisa Zalfaa Nabilah
- 2210511072 Edwina Martha Putri

# Preprocessing Data

In [None]:
pip install pandas tensorflow scikit-learn numpy

In [3]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [4]:
# Menambahkan header
headers = ["parents", "has_nurs", "form", "children", "housing", "finance", "social", "health", "class"]
df = pd.read_csv('nursery/nursery.data', names=headers)
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [5]:
df.describe()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
count,12960,12960,12960,12960,12960,12960,12960,12960,12960
unique,3,5,4,4,3,2,3,3,5
top,usual,proper,complete,1,convenient,convenient,nonprob,recommended,not_recom
freq,4320,2592,3240,3240,4320,6480,4320,4320,4320


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12960 entries, 0 to 12959
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   parents   12960 non-null  object
 1   has_nurs  12960 non-null  object
 2   form      12960 non-null  object
 3   children  12960 non-null  object
 4   housing   12960 non-null  object
 5   finance   12960 non-null  object
 6   social    12960 non-null  object
 7   health    12960 non-null  object
 8   class     12960 non-null  object
dtypes: object(9)
memory usage: 911.4+ KB


*missing value*

In [7]:
df.isnull().sum()

parents     0
has_nurs    0
form        0
children    0
housing     0
finance     0
social      0
health      0
class       0
dtype: int64

*duplicate*

In [8]:
df.duplicated().sum()

0

*outlier check*

In [9]:
for col in df.columns:
    print(col)
    print(df[col].value_counts())
    print("\n")

parents
parents
usual          4320
pretentious    4320
great_pret     4320
Name: count, dtype: int64


has_nurs
has_nurs
proper         2592
less_proper    2592
improper       2592
critical       2592
very_crit      2592
Name: count, dtype: int64


form
form
complete      3240
completed     3240
incomplete    3240
foster        3240
Name: count, dtype: int64


children
children
1       3240
2       3240
3       3240
more    3240
Name: count, dtype: int64


housing
housing
convenient    4320
less_conv     4320
critical      4320
Name: count, dtype: int64


finance
finance
convenient    6480
inconv        6480
Name: count, dtype: int64


social
social
nonprob          4320
slightly_prob    4320
problematic      4320
Name: count, dtype: int64


health
health
recommended    4320
priority       4320
not_recom      4320
Name: count, dtype: int64


class
class
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: count, dtype: int64




*pemilihan kelas dan kolom*

In [10]:
# Filter hanya baris dengan class 'not_recom', 'priority' dan 'spec prior'
df = df[df['class'].isin(['not_recom', 'priority', 'spec_prior'])]

# Tampilkan jumlah kelas 'not_recom' dan 'priority'
print(df['class'].value_counts())

class
not_recom     4320
priority      4266
spec_prior    4044
Name: count, dtype: int64


In [11]:
# Pilih kolom yang akan digunakan
kolom_pilihan = ['finance', 'social', 'health', 'class']
df_pilihan = df[kolom_pilihan].copy()

df_pilihan.head()

Unnamed: 0,finance,social,health,class
1,convenient,nonprob,priority,priority
2,convenient,nonprob,not_recom,not_recom
4,convenient,slightly_prob,priority,priority
5,convenient,slightly_prob,not_recom,not_recom
6,convenient,problematic,recommended,priority


*encoding*

In [12]:
le = LabelEncoder()

df_pilihan['social'] = le.fit_transform(df_pilihan['social']).astype(int)
df_pilihan['finance'] = le.fit_transform(df_pilihan['finance']).astype(int)
df_pilihan['health'] = le.fit_transform(df_pilihan['health']).astype(int)
df_pilihan['class'] = le.fit_transform(df_pilihan['class']).astype(int)

df_pilihan.head()

Unnamed: 0,finance,social,health,class
1,0,0,1,1
2,0,0,0,0
4,0,2,1,1
5,0,2,0,0
6,0,1,2,1


*splitting*

In [13]:
X = df_pilihan[['social', 'finance', 'health']]
y = df_pilihan['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data Training: ", X_train.shape)
print("Data Testing: ", X_test.shape)

Data Training:  (10104, 3)
Data Testing:  (2526, 3)


# Model SVM

*Modeling Evaluasi*

In [14]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', decision_function_shape='ovo')
svm_model.fit(X_train, y_train)
svm_predict = svm_model.predict(X_test)

In [15]:
df_hasil = X_test.copy()
df_hasil['Label asli'] = y_test.values
df_hasil['Label prediksi'] = svm_predict

In [16]:
df_hasil['Prediksi benar'] = df_hasil['Label asli'] == df_hasil['Label prediksi']
prediksi_benar = df_hasil['Prediksi benar'].sum()

print(df_hasil.head(20))
print(f'Jumlah hasil prediksi yang benar adalah {prediksi_benar}')

       social  finance  health  Label asli  Label prediksi  Prediksi benar
9234        0        0       2           1               1            True
11965       2        1       1           2               2            True
4530        2        1       2           1               1            True
11732       2        1       0           0               0            True
8441        1        1       0           0               0            True
11107       0        0       1           2               2            True
7272        0        0       2           2               1           False
12532       2        0       1           2               2            True
1057        2        1       1           1               2           False
1707        1        1       2           1               1            True
1817        1        1       0           0               0            True
8152        1        1       1           2               2            True
10081       0        0   

In [17]:
# df_hasil.to_csv('Hasil_prediksiSVM_Linear.csv', index='False')

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(f"Accuracy Score: {accuracy_score(y_test, svm_predict)}")
print(f"Classification Report:\n {classification_report(y_test, svm_predict)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, svm_predict)}")

Accuracy Score: 0.7268408551068883
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       896
           1       0.61      0.55      0.58       861
           2       0.55      0.60      0.57       769

    accuracy                           0.73      2526
   macro avg       0.72      0.72      0.72      2526
weighted avg       0.73      0.73      0.73      2526

Confusion Matrix:
 [[896   0   0]
 [  0 476 385]
 [  0 305 464]]


In [19]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

sk_fold = StratifiedKFold(n_splits=3)
scores = cross_val_score(svm_model, X, y, cv=sk_fold)

print(f"Cross Validation Score: ", scores)
print(f"Rata Rata Accuracy: {scores.mean():.2f}")
print(f"Standar Deviasi: {scores.std():.2f}")

Cross Validation Score:  [0.69239905 0.72066508 0.77173397]
Rata Rata Accuracy: 0.73
Standar Deviasi: 0.03


In [20]:
from sklearn.svm import SVC

svm_modelrbf = SVC(kernel='rbf', decision_function_shape='ovo')
svm_modelrbf.fit(X_train, y_train)
svm_predrbf = svm_modelrbf.predict(X_test)

In [21]:
df_hasilrbf = X_test.copy()
df_hasilrbf['Label asli'] = y_test.values
df_hasilrbf['Label prediksi'] = svm_predrbf

In [22]:
df_hasilrbf['Prediksi benar'] = df_hasilrbf['Label asli'] == df_hasilrbf['Label prediksi']
prediksi_benarrbf = df_hasilrbf['Prediksi benar'].sum()

print(df_hasilrbf.head(20))
print(f'Jumlah hasil prediksi yang benar adalah {prediksi_benarrbf}')

       social  finance  health  Label asli  Label prediksi  Prediksi benar
9234        0        0       2           1               1            True
11965       2        1       1           2               2            True
4530        2        1       2           1               1            True
11732       2        1       0           0               0            True
8441        1        1       0           0               0            True
11107       0        0       1           2               2            True
7272        0        0       2           2               1           False
12532       2        0       1           2               2            True
1057        2        1       1           1               2           False
1707        1        1       2           1               2           False
1817        1        1       0           0               0            True
8152        1        1       1           2               2            True
10081       0        0   

In [23]:
# df_hasilrbf.to_csv('Hasil_prediksiSVM_rbf.csv', index='False')

In [24]:
print(f"Accuracy Score: {accuracy_score(y_test, svm_predrbf)}")
print(f"Classification Report:\n {classification_report(y_test, svm_predrbf)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, svm_predrbf)}")

Accuracy Score: 0.7387173396674585
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       896
           1       0.70      0.41      0.51       861
           2       0.55      0.81      0.65       769

    accuracy                           0.74      2526
   macro avg       0.75      0.74      0.72      2526
weighted avg       0.76      0.74      0.73      2526

Confusion Matrix:
 [[896   0   0]
 [  0 349 512]
 [  0 148 621]]


In [25]:
sk_fold = StratifiedKFold(n_splits=3)
scores = cross_val_score(svm_modelrbf, X, y, cv=sk_fold)

print(f"Cross Validation Score: ", scores)
print(f"Rata Rata Accuracy: {scores.mean():.2f}")
print(f"Standar Deviasi: {scores.std():.2f}")

Cross Validation Score:  [0.69643705 0.73444181 0.8023753 ]
Rata Rata Accuracy: 0.74
Standar Deviasi: 0.04


# Modelling NN

In [26]:
from tensorflow.keras.utils import to_categorical
y_train_ann = y_train.copy()
y_train_ann = to_categorical(y_train_ann, num_classes=3)
y_test_ann = y_test.copy()
y_test_ann = to_categorical(y_test_ann, num_classes=3)

In [27]:
print("Dimensi data :\n")
print("X train \t X test \t Y train \t Y test")  
print("%s \t %s \t %s \t %s" % (X_train.shape, X_test.shape, y_train_ann.shape, y_test_ann.shape))

Dimensi data :

X train 	 X test 	 Y train 	 Y test
(10104, 3) 	 (2526, 3) 	 (10104, 3) 	 (2526, 3)


In [28]:
modelANN = Sequential()
modelANN.add(Dense(6, activation='relu', input_dim=X_train.shape[1]))
modelANN.add(Dense(3, activation='relu'))
modelANN.add(Dense(3, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [29]:
modelANN.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [30]:
history = modelANN.fit(X_train, y_train_ann, 
                    epochs=50, 
                    batch_size=32, 
                    validation_split=0.2)

Epoch 1/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.4840 - loss: 0.9716 - val_accuracy: 0.6823 - val_loss: 0.7314
Epoch 2/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.6875 - loss: 0.6438 - val_accuracy: 0.7249 - val_loss: 0.4868
Epoch 3/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7164 - loss: 0.4725 - val_accuracy: 0.7264 - val_loss: 0.4638
Epoch 4/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7306 - loss: 0.4547 - val_accuracy: 0.7264 - val_loss: 0.4599
Epoch 5/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7241 - loss: 0.4517 - val_accuracy: 0.7264 - val_loss: 0.4575
Epoch 6/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7390 - loss: 0.4436 - val_accuracy: 0.7264 - val_loss: 0.4566
Epoch 7/50
[1m253/253[0m 

In [31]:
loss, accuracy = modelANN.evaluate(X_test, y_test_ann)
print(f"Test Accuracy: {accuracy:.2f}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.7241 - loss: 0.4350
Test Accuracy: 0.73


In [32]:
ann_predict = modelANN.predict(X_test)

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [33]:
import numpy as np

y_predict_classes = np.argmax(ann_predict, axis=1)

In [34]:
print("Predictions: ", y_predict_classes[:20])  # Tampilkan 10 prediksi pertama

Predictions:  [1 2 1 0 0 2 1 2 2 1 0 2 2 1 0 0 0 2 1 1]


In [35]:
y_test_labels = np.argmax(y_test_ann, axis=1)
y_pred_labels = np.argmax(ann_predict, axis=1)

In [36]:
df_hasilnn = X_test.copy()
df_hasilnn['Label asli'] = y_test_labels
df_hasilnn['Label prediksi'] = y_pred_labels

In [37]:
df_hasilnn['Prediksi benar'] = df_hasilnn['Label asli'] == df_hasilnn['Label prediksi']
prediksi_benarnn = df_hasilnn['Prediksi benar'].sum()

print(df_hasilnn.head(20))
print(f'Jumlah hasil prediksi yang benar adalah {prediksi_benarnn}')

       social  finance  health  Label asli  Label prediksi  Prediksi benar
9234        0        0       2           1               1            True
11965       2        1       1           2               2            True
4530        2        1       2           1               1            True
11732       2        1       0           0               0            True
8441        1        1       0           0               0            True
11107       0        0       1           2               2            True
7272        0        0       2           2               1           False
12532       2        0       1           2               2            True
1057        2        1       1           1               2           False
1707        1        1       2           1               1            True
1817        1        1       0           0               0            True
8152        1        1       1           2               2            True
10081       0        0   

In [38]:
print(f"Accuracy Score: {accuracy_score(y_test_labels, y_pred_labels)}")
print(f"Classification Report:\n {classification_report(y_test_labels, y_pred_labels)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test_labels, y_pred_labels)}")

Accuracy Score: 0.7268408551068883
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       896
           1       0.61      0.55      0.58       861
           2       0.55      0.60      0.57       769

    accuracy                           0.73      2526
   macro avg       0.72      0.72      0.72      2526
weighted avg       0.73      0.73      0.73      2526

Confusion Matrix:
 [[896   0   0]
 [  0 476 385]
 [  0 305 464]]


# Preprocessing PCA

*encoding*

In [39]:
# Encode semua kolom
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])

df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
1,2,3,0,0,0,0,0,1,1
2,2,3,0,0,0,0,0,0,0
4,2,3,0,0,0,0,2,1,1
5,2,3,0,0,0,0,2,0,0
6,2,3,0,0,0,0,1,2,1


*data splitting*

In [40]:
X2 = df.drop(columns=['class'])  # Semua fitur kecuali 'class'
y2 = df['class']                 # Target

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

print("Data Training: ", X_train2.shape)
print("Data Testing: ", X_test2.shape)

Data Training:  (10104, 8)
Data Testing:  (2526, 8)


*menentukan atribut pca*

In [41]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
Xtrain_pca = pca.fit_transform(X_train2)
Xtrain_pca

array([[-2.01211225,  1.35549907, -0.71441935],
       [-0.99154764, -0.03901466, -0.70322708],
       [ 1.01458348,  1.44000872, -0.69603704],
       ...,
       [ 0.03824963, -1.41810161,  0.64729594],
       [ 0.03360286,  0.71860018, -1.44437807],
       [-1.96038509, -1.45330404, -0.73407172]])

In [42]:
# Melihat komponen utama yang dihasilkan oleh PCA
print("Komponen PCA:")
print(pca.components_)

# Jika Anda ingin melihat kontribusi masing-masing fitur pada komponen utama
# Kita anggap X_train2 memiliki nama kolom yang representatif
feature_names = X_train2.columns  # Nama fitur asli

# Menampilkan bobot komponen untuk setiap fitur
for i, component in enumerate(pca.components_):
    print(f"Komponen {i + 1}:")
    component_df = pd.DataFrame(component, index=feature_names, columns=[f"Komponen {i + 1}"])
    print(component_df)


Komponen PCA:
[[ 3.18786442e-03  9.99802096e-01  4.40523640e-03 -1.54061168e-02
   8.68910076e-03 -5.54472318e-03 -3.30802478e-03 -3.41535458e-03]
 [ 9.24090640e-04  1.38297082e-02 -7.06784244e-01  7.07016902e-01
   1.93611103e-02  5.97462409e-04 -3.93614777e-03  5.61935101e-04]
 [ 9.44856161e-03  7.75858173e-03  7.06933910e-01  7.06183208e-01
   1.32454665e-02 -3.31887679e-04  6.43368556e-03  3.43992445e-02]]
Komponen 1:
          Komponen 1
parents     0.003188
has_nurs    0.999802
form        0.004405
children   -0.015406
housing     0.008689
finance    -0.005545
social     -0.003308
health     -0.003415
Komponen 2:
          Komponen 2
parents     0.000924
has_nurs    0.013830
form       -0.706784
children    0.707017
housing     0.019361
finance     0.000597
social     -0.003936
health      0.000562
Komponen 3:
          Komponen 3
parents     0.009449
has_nurs    0.007759
form        0.706934
children    0.706183
housing     0.013245
finance    -0.000332
social      0.006434
heal

In [43]:
Xtest_pca = pca.transform(X_test2)

# Modelling SVM dengan PCA

*svm linear*

In [44]:
from sklearn.svm import SVC

pcalin_model = SVC(kernel='linear', decision_function_shape='ovo')
pcalin_model.fit(Xtrain_pca, y_train2)
pcalin_predict = pcalin_model.predict(Xtest_pca)

In [45]:
df_hasil_pcalin = pd.DataFrame(Xtest_pca, columns=[f'PC{i+1}' for i in range(Xtest_pca.shape[1])])
df_hasil_pcalin['Label asli'] = y_test2.values
df_hasil_pcalin['Label prediksi'] = pcalin_predict

In [46]:
df_hasil_pcalin['Prediksi benar'] = df_hasil_pcalin['Label asli'] == df_hasil_pcalin['Label prediksi']
prediksi_benar_pcalin = df_hasil_pcalin['Prediksi benar'].sum()

print(df_hasil_pcalin.head(20))
print(f'Jumlah hasil prediksi yang benar adalah {prediksi_benar_pcalin}')

         PC1       PC2       PC3  Label asli  Label prediksi  Prediksi benar
0   0.988532 -0.009399  2.113525           1               2           False
1  -1.975835 -0.727253 -0.024423           2               1           False
2   0.975032  2.123964  0.027953           1               2           False
3  -1.985392 -1.473321  0.621620           0               1           False
4   2.031313 -1.413375 -0.757448           0               1           False
5  -0.972561 -0.725509 -0.042446           2               1           False
6  -1.992402  0.675948 -0.007107           2               2            True
7   2.031352 -2.125055 -0.018798           2               1           False
8  -0.009477  2.129857  0.008489           1               2           False
9  -0.000774  0.720787  1.450323           1               2           False
10 -0.971744  0.705368 -1.452469           0               1           False
11  2.001369  0.727366 -0.004370           2               2            True

In [47]:
# df_hasil_pcalin.to_csv('Hasil_prediksiSVM_Linear_pca.csv', index='False')

In [48]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(f"Accuracy Score: {accuracy_score(y_test2, pcalin_predict)}")
print(f"Classification Report:\n {classification_report(y_test2, pcalin_predict)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test2, pcalin_predict)}")

Accuracy Score: 0.35233570863024544
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       896
           1       0.37      0.54      0.43       861
           2       0.34      0.56      0.42       769

    accuracy                           0.35      2526
   macro avg       0.23      0.36      0.29      2526
weighted avg       0.23      0.35      0.28      2526

Confusion Matrix:
 [[  0 463 433]
 [  0 463 398]
 [  0 342 427]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

sk_fold = StratifiedKFold(n_splits=3)
scores = cross_val_score(pcalin_model, X2, y2, cv=sk_fold)

print(f"Cross Validation Score: ", scores)
print(f"Rata Rata Accuracy: {scores.mean():.2f}")
print(f"Standar Deviasi: {scores.std():.2f}")

Cross Validation Score:  [0.67980998 0.78836105 0.66223278]
Rata Rata Accuracy: 0.71
Standar Deviasi: 0.06


*svm rbf*

In [50]:
from sklearn.svm import SVC

pcarbf_model = SVC(kernel='rbf', decision_function_shape='ovo')
pcarbf_model.fit(Xtrain_pca, y_train2)
pcarbf_predict = pcarbf_model.predict(Xtest_pca)

In [51]:
df_hasil_pcarbf = pd.DataFrame(Xtest_pca, columns=[f'PC{i+1}' for i in range(Xtest_pca.shape[1])])
df_hasil_pcarbf['Label asli'] = y_test2.values
df_hasil_pcarbf['Label prediksi'] = pcarbf_predict

In [52]:
df_hasil_pcarbf['Prediksi benar'] = df_hasil_pcarbf['Label asli'] == df_hasil_pcarbf['Label prediksi']
prediksi_benar_pcarbf = df_hasil_pcarbf['Prediksi benar'].sum()

print(df_hasil_pcarbf.head(20))
print(f'Jumlah hasil prediksi yang benar adalah {prediksi_benar_pcarbf}')

         PC1       PC2       PC3  Label asli  Label prediksi  Prediksi benar
0   0.988532 -0.009399  2.113525           1               1            True
1  -1.975835 -0.727253 -0.024423           2               2            True
2   0.975032  2.123964  0.027953           1               1            True
3  -1.985392 -1.473321  0.621620           0               2           False
4   2.031313 -1.413375 -0.757448           0               2           False
5  -0.972561 -0.725509 -0.042446           2               1           False
6  -1.992402  0.675948 -0.007107           2               2            True
7   2.031352 -2.125055 -0.018798           2               2            True
8  -0.009477  2.129857  0.008489           1               1            True
9  -0.000774  0.720787  1.450323           1               1            True
10 -0.971744  0.705368 -1.452469           0               1           False
11  2.001369  0.727366 -0.004370           2               2            True

In [53]:
# df_hasil_pcarbf.to_csv('Hasil_prediksiSVM_rbf_pca.csv', index='False')

In [54]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(f"Accuracy Score: {accuracy_score(y_test2, pcarbf_predict)}")
print(f"Classification Report:\n {classification_report(y_test2, pcarbf_predict)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test2, pcarbf_predict)}")

Accuracy Score: 0.49287410926365793
Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.01      0.02       896
           1       0.50      0.77      0.61       861
           2       0.49      0.74      0.59       769

    accuracy                           0.49      2526
   macro avg       0.43      0.51      0.41      2526
weighted avg       0.42      0.49      0.39      2526

Confusion Matrix:
 [[ 10 479 407]
 [ 14 664 183]
 [ 11 187 571]]


In [55]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

sk_fold = StratifiedKFold(n_splits=3)
scores = cross_val_score(pcarbf_model, X2, y2, cv=sk_fold)

print(f"Cross Validation Score: ", scores)
print(f"Rata Rata Accuracy: {scores.mean():.2f}")
print(f"Standar Deviasi: {scores.std():.2f}")

Cross Validation Score:  [0.79952494 0.83016627 0.69904988]
Rata Rata Accuracy: 0.78
Standar Deviasi: 0.06


# Modelling NN dengan PCA

In [56]:
from tensorflow.keras.utils import to_categorical
y_train2_ann = y_train2.copy()
y_train2_ann = to_categorical(y_train2_ann, num_classes=3)
y_test2_ann = y_test2.copy()
y_test2_ann = to_categorical(y_test2_ann, num_classes=3)

In [57]:
import numpy as np
Xtrain_pca = np.array(Xtrain_pca)
y_train2_ann = np.array(y_train2_ann)

In [58]:
print("Dimensi data :\n")
print("X train \t X test \t Y train \t Y test")  
print("%s \t %s \t %s \t %s" % (Xtrain_pca.shape, Xtest_pca.shape, y_train2_ann.shape, y_test2_ann.shape))

Dimensi data :

X train 	 X test 	 Y train 	 Y test
(10104, 3) 	 (2526, 3) 	 (10104, 3) 	 (2526, 3)


In [59]:
modelANN2 = Sequential()
modelANN2.add(Dense(32, activation='relu', input_dim=Xtrain_pca.shape[1]))
modelANN2.add(Dense(16, activation='relu'))
modelANN2.add(Dense(3, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [60]:
modelANN2.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [61]:
modelANN2.summary()

In [62]:
import keras
from keras.callbacks import EarlyStopping 

# Early stopping callback 
earlyStop = EarlyStopping(monitor='val_loss', 
                          mode='min', 
                          patience=10, 
                          restore_best_weights=True) 

# Train the model with early stopping 
history2 = modelANN2.fit(Xtrain_pca, y_train2_ann, 
                         callbacks=[earlyStop], 
                         epochs=50, 
                         batch_size=32, 
                         shuffle=True, 
                         validation_split=0.2, 
                         verbose=1)


Epoch 1/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.4022 - loss: 1.0850 - val_accuracy: 0.4760 - val_loss: 1.0101
Epoch 2/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.4888 - loss: 1.0042 - val_accuracy: 0.5012 - val_loss: 0.9817
Epoch 3/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4916 - loss: 0.9811 - val_accuracy: 0.5027 - val_loss: 0.9756
Epoch 4/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5132 - loss: 0.9714 - val_accuracy: 0.4963 - val_loss: 0.9686
Epoch 5/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4861 - loss: 0.9759 - val_accuracy: 0.5205 - val_loss: 0.9661
Epoch 6/50
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5157 - loss: 0.9619 - val_accuracy: 0.5151 - val_loss: 0.9619
Epoch 7/50
[1m253/253[0m 

In [63]:
loss, accuracy = modelANN2.evaluate(Xtest_pca, y_test2_ann)
print(f"Test Accuracy: {accuracy:.2f}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.4947 - loss: 0.9594
Test Accuracy: 0.50


In [64]:
ann2_predict = modelANN2.predict(Xtest_pca)

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [65]:
import numpy as np

y_predict2_classes = np.argmax(ann2_predict, axis=1)

In [66]:
print("Predictions: ", y_predict2_classes[:20])  # Tampilkan 10 prediksi pertama

Predictions:  [1 2 1 2 2 0 2 2 1 1 1 2 1 2 1 2 2 0 1 2]


In [67]:
y_test2_labels = np.argmax(y_test2_ann, axis=1)
y_pred2_labels = np.argmax(ann2_predict, axis=1)

In [68]:
import pandas as pd

y_test2_labels = pd.Series(y_test2_labels)
y_pred2_labels = pd.Series(y_pred2_labels)

Xtest_pca = pd.DataFrame(Xtest_pca)
df_hasilnn2 = Xtest_pca.copy()
df_hasilnn2['Label asli'] = y_test2_labels
df_hasilnn2['Label prediksi'] = y_pred2_labels


In [69]:
df_hasilnn2['Prediksi benar'] = df_hasilnn2['Label asli'] == df_hasilnn2['Label prediksi']
prediksi_benarnn2 = df_hasilnn2['Prediksi benar'].sum()

print(df_hasilnn2.head(20))
print(f'Jumlah hasil prediksi yang benar adalah {prediksi_benarnn2}')

           0         1         2  Label asli  Label prediksi  Prediksi benar
0   0.988532 -0.009399  2.113525           1               1            True
1  -1.975835 -0.727253 -0.024423           2               2            True
2   0.975032  2.123964  0.027953           1               1            True
3  -1.985392 -1.473321  0.621620           0               2           False
4   2.031313 -1.413375 -0.757448           0               2           False
5  -0.972561 -0.725509 -0.042446           2               0           False
6  -1.992402  0.675948 -0.007107           2               2            True
7   2.031352 -2.125055 -0.018798           2               2            True
8  -0.009477  2.129857  0.008489           1               1            True
9  -0.000774  0.720787  1.450323           1               1            True
10 -0.971744  0.705368 -1.452469           0               1           False
11  2.001369  0.727366 -0.004370           2               2            True

In [70]:
# df_hasilnn2.to_csv('Hasil_prediksiNN_pca.csv', index='False')

In [71]:
print(f"Accuracy Score: {accuracy_score(y_test2_labels, y_pred2_labels)}")
print(f"Classification Report:\n {classification_report(y_test2_labels, y_pred2_labels)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test2_labels, y_pred2_labels)}")

Accuracy Score: 0.5007917656373714
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.07      0.11       896
           1       0.51      0.75      0.61       861
           2       0.50      0.72      0.59       769

    accuracy                           0.50      2526
   macro avg       0.47      0.51      0.44      2526
weighted avg       0.47      0.50      0.43      2526

Confusion Matrix:
 [[ 60 456 380]
 [ 43 649 169]
 [ 49 164 556]]


# Import Model Terbaik

Dari keempat algoritma yang dicobakan, yaitu:
1. SVM Tanpa PCA
2. SVM dengan PCA
3. Neural Network Tanpa PCA
4. Neural Network dengan PCA

Model yang menghasilkan akurasi terbaik adalah **SVM RBF tanpa PCA**

In [72]:
import pickle

pickle.dump(svm_modelrbf, open('rbf_model', 'wb'))