In [110]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [111]:
import numpy as np

data = np.loadtxt("/content/drive/MyDrive/ML_Lab/report/6/lab6_spambase.csv", delimiter = ',')
data

array([[0.00e+00, 0.00e+00, 0.00e+00, ..., 1.80e+01, 1.82e+02, 0.00e+00],
       [0.00e+00, 7.80e-01, 1.56e+00, ..., 9.00e+00, 3.90e+01, 0.00e+00],
       [0.00e+00, 0.00e+00, 0.00e+00, ..., 4.70e+01, 1.16e+02, 0.00e+00],
       ...,
       [4.90e-01, 0.00e+00, 9.90e-01, ..., 8.00e+00, 9.40e+01, 1.00e+00],
       [4.60e-01, 3.00e-01, 4.60e-01, ..., 1.93e+02, 3.04e+03, 1.00e+00],
       [4.60e-01, 4.60e-01, 2.60e-01, ..., 4.00e+01, 8.98e+02, 1.00e+00]])

In [112]:
from sklearn.model_selection import train_test_split

trn, tst = train_test_split(data, test_size = 0.3, random_state = 0)

x_train = trn[:, :-1]
y_train = trn[:, -1]

x_test = tst[:, :-1]
y_test = tst[:, -1]

In [113]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

clf = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print("F1... \t{: .3f}".format(f1_score(y_pred, y_test)))
print("ACC... \t{: .3f}".format(np.mean(y_pred == y_test)))

F1... 	 0.839
ACC... 	 0.967


In [142]:
# label별로 개수 파악

unique_classes, class_count = np.unique(y_train, return_counts = True)
positive, negative = class_count

In [143]:
# OverSampling

import random

random.seed(42)

x_train_positive = x_train[y_train == unique_classes[0]]
x_train_negative = x_train[y_train == unique_classes[1]]


if positive > negative :
  for i in range(positive - negative) : # 같은 비율로 설정
    random_int = random.randint(0, len(x_train_negative)-1)
    random_sample = x_train_negative[random_int, :]
    x_train_negative = np.concatenate((x_train_negative, random_sample.reshape(1, len(x_train_negative[0]))), axis = 0)

else :
  for i in range(negative - positive) :
    random_int = random.randint(0, len(x_train_positive)-1)
    random_sample = x_train_positive[random_int, :]
    x_train_positive = np.concatenate((x_train_positive, random_sample.reshape(1, len(x_train_positive[0]))), axis = 0)

In [150]:
# x_train_positive, x_train_negative 합치기

x_train_over = np.concatenate((x_train_positive, x_train_negative), axis = 0)
y_train_over = np.array([unique_classes[0]] * len(x_train_positive) + [unique_classes[1]] * len(x_train_negative))


In [155]:
clf_over = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
clf_over.fit(x_train_over, y_train_over)
y_pred_over = clf_over.predict(x_test)

print("F1... \t{: .3f}".format(f1_score(y_pred_over, y_test)))
print("ACC... \t{: .3f}".format(np.mean(y_pred_over == y_test)))

F1... 	 0.743
ACC... 	 0.940


In [162]:
# UnderSampling

import random

random.seed(42)

x_train_positive = x_train[y_train == unique_classes[0]]
x_train_negative = x_train[y_train == unique_classes[1]]


if positive > negative :
  for i in range(positive - negative) : # 같은 비율로 설정
    random_int = random.randint(0, len(x_train_positive)-1)
    x_train_positive = np.delete(x_train_positive, random_int, axis = 0)

else :
  for i in range(negative - positive) :
    random_int = random.randint(0, len(x_train_negative)-1)
    x_train_negative = np.delete(x_train_negative, random_int, axis = 0)


In [164]:
# x_train_positive, x_train_negative 합치기

x_train_under = np.concatenate((x_train_positive, x_train_negative), axis = 0)
y_train_under = np.array([unique_classes[0]] * len(x_train_positive) + [unique_classes[1]] * len(x_train_negative))

In [167]:
clf_under = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
clf_under.fit(x_train_under, y_train_under)
y_pred_under = clf_under.predict(x_test)

print("F1... \t{: .3f}".format(f1_score(y_pred_under, y_test)))
print("ACC... \t{: .3f}".format(np.mean(y_pred_under == y_test)))

F1... 	 0.536
ACC... 	 0.827
