In [1]:
import numpy as np
import pandas as pd
from statistics import mean
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [3]:
dataset = pd.read_csv("./spambase.data").values[:, ]
print(dataset)
imputer = SimpleImputer(missing_values = 0, strategy ="mean")
imputer = imputer.fit(dataset[:, :-1])
dataset[:, :-1] = imputer.transform(dataset[:, :-1])
print(dataset)


[[2.100e-01 2.800e-01 5.000e-01 ... 1.010e+02 1.028e+03 1.000e+00]
 [6.000e-02 0.000e+00 7.100e-01 ... 4.850e+02 2.259e+03 1.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 4.000e+01 1.910e+02 1.000e+00]
 ...
 [3.000e-01 0.000e+00 3.000e-01 ... 6.000e+00 1.180e+02 0.000e+00]
 [9.600e-01 0.000e+00 0.000e+00 ... 5.000e+00 7.800e+01 0.000e+00]
 [0.000e+00 0.000e+00 6.500e-01 ... 5.000e+00 4.000e+01 0.000e+00]]
[[2.10000000e-01 2.80000000e-01 5.00000000e-01 ... 1.01000000e+02
  1.02800000e+03 1.00000000e+00]
 [6.00000000e-02 1.09190635e+00 7.10000000e-01 ... 4.85000000e+02
  2.25900000e+03 1.00000000e+00]
 [4.56837607e-01 1.09190635e+00 6.83974563e-01 ... 4.00000000e+01
  1.91000000e+02 1.00000000e+00]
 ...
 [3.00000000e-01 1.09190635e+00 3.00000000e-01 ... 6.00000000e+00
  1.18000000e+02 0.00000000e+00]
 [9.60000000e-01 1.09190635e+00 6.83974563e-01 ... 5.00000000e+00
  7.80000000e+01 0.00000000e+00]
 [4.56837607e-01 1.09190635e+00 6.50000000e-01 ... 5.00000000e+00
  4.00000000e+01 0.0000000

In [4]:
sc = StandardScaler()
dataset[:, :-1] = sc.fit_transform(dataset[:, :-1])
print(dataset)

[[-1.03934865 -0.66774627 -0.49003683 ...  0.2505455   1.22818869
   1.        ]
 [-1.67094729  0.          0.06932166 ...  2.22087495  3.25837649
   1.        ]
 [ 0.          0.          0.         ... -0.06244954 -0.15220708
   1.        ]
 ...
 [-0.66038946  0.         -1.02275921 ... -0.23690579 -0.27260002
   0.        ]
 [ 2.11864456  0.          0.         ... -0.24203686 -0.33856875
   0.        ]
 [ 0.          0.         -0.09049505 ... -0.24203686 -0.40123905
   0.        ]]


In [8]:
# split dataset into test set, train set and unlabel pool
def split(dataset, train_size, test_size):
	x = dataset[:, :-1]
	y = dataset[:, -1]
	x_train, x_pool, y_train, y_pool = train_test_split(
		x, y, train_size = train_size)
	unlabel, x_test, label, y_test = train_test_split(
		x_pool, y_pool, test_size = test_size)
	return x_train, y_train, x_test, y_test, unlabel, label

In [9]:
ac1, ac2 = [], [] # arrays to store accuracy of different models

In [16]:
# split dataset into train(5 %), test(25 %), unlabel(70 %)
x_train, y_train, x_test, y_test, unlabel, label = split(
	dataset, 0.05, 0.25)
# train model by active learning
for i in range(5):
	classifier1 = LogisticRegression()
	classifier1.fit(x_train, y_train)
	y_probab = classifier1.predict_proba(unlabel)[:, 0]
	p = 0.47 # range of uncertanity 0.47 to 0.53
	uncrt_pt_ind = []
	print(unlabel.shape[0])
	for i in range(unlabel.shape[0]):
		if(y_probab[i] >= p and y_probab[i] <= 1-p):
			print(y_probab[i])
			uncrt_pt_ind.append(i)
			break
	# print(uncrt_pt_ind)
	x_train = np.append(unlabel[uncrt_pt_ind, :], x_train, axis = 0)
	y_train = np.append(label[uncrt_pt_ind], y_train)
	unlabel = np.delete(unlabel, uncrt_pt_ind, axis = 0)
	label = np.delete(label, uncrt_pt_ind)

classifier2 = LogisticRegression()
classifier2.fit(x_train, y_train)
ac1.append(classifier2.score(x_test, y_test))


''' split dataset into train(same as generated by our model),
test(25 %), unlabel(rest) '''
train_size = x_train.shape[0]/dataset.shape[0]
x_train, y_train, x_test, y_test, unlabel, label = split(
	dataset, train_size, 0.25)
# train model without active learning
classifier3 = LogisticRegression()
classifier3.fit(x_train, y_train)
ac2.append(classifier3.score(x_test, y_test))

3277
0.5049463260084431
3276
0.47298371383178595
3275
0.5293022724948719
3274
0.5198400496723155
3273
0.5186072093702851


In [None]:
x_train_1, y_train_1, x_test_1, y_test_1, unlabel_1, label_1 = split(
	dataset, 0.05, 0.25)

In [11]:
for i in range(100):
	# split dataset into train(5 %), test(25 %), unlabel(70 %)
	x_train, y_train, x_test, y_test, unlabel, label = split(
		dataset, 0.05, 0.25)
	# train model by active learning
	for i in range(5):
		classifier1 = LogisticRegression()
		classifier1.fit(x_train, y_train)
		y_probab = classifier1.predict_proba(unlabel)[:, 0]
		p = 0.47 # range of uncertanity 0.47 to 0.53
		uncrt_pt_ind = []
		for i in range(unlabel.shape[0]):
			if(y_probab[i] >= p and y_probab[i] <= 1-p):
				uncrt_pt_ind.append(i)
		print(un)
		x_train = np.append(unlabel[uncrt_pt_ind, :], x_train, axis = 0)
		y_train = np.append(label[uncrt_pt_ind], y_train)
		unlabel = np.delete(unlabel, uncrt_pt_ind, axis = 0)
		label = np.delete(label, uncrt_pt_ind)
	classifier2 = LogisticRegression()
	classifier2.fit(x_train, y_train)
	ac1.append(classifier2.score(x_test, y_test))
	''' split dataset into train(same as generated by our model),
	test(25 %), unlabel(rest) '''
	train_size = x_train.shape[0]/dataset.shape[0]
	x_train, y_train, x_test, y_test, unlabel, label = split(
		dataset, train_size, 0.25)
	# train model without active learning
	classifier3 = LogisticRegression()
	classifier3.fit(x_train, y_train)
	ac2.append(classifier3.score(x_test, y_test))

print("Accuracy by active model :", mean(ac1)*100)
print("Accuracy by random sampling :", mean(ac2)*100)


[18, 36, 40, 55, 96, 110, 199, 266, 350, 391, 437, 439, 530, 578, 676, 685, 695, 697, 706, 723, 789, 795, 842, 911, 912, 931, 938, 960, 973, 979, 982, 1089, 1111, 1132, 1219, 1221, 1231, 1272, 1278, 1316, 1319, 1338, 1373, 1424, 1429, 1455, 1483, 1506, 1514, 1544, 1576, 1590, 1608, 1678, 1748, 1817, 1847, 1869, 1873, 1897, 1936, 1990, 1992, 2018, 2033, 2041, 2042, 2055, 2064, 2065, 2085, 2101, 2137, 2189, 2249, 2305, 2318, 2327, 2342, 2506, 2522, 2533, 2579, 2617, 2618, 2627, 2646, 2664, 2691, 2760, 2772, 2834, 2863, 2975, 2977, 2984, 3001, 3012, 3080, 3122, 3136, 3160, 3208, 3251, 3257, 3261]
[4, 110, 153, 195, 223, 265, 273, 297, 308, 338, 360, 366, 432, 433, 504, 534, 615, 652, 712, 792, 838, 849, 851, 858, 877, 880, 887, 947, 997, 1015, 1017, 1033, 1047, 1079, 1103, 1132, 1142, 1165, 1167, 1170, 1260, 1268, 1402, 1419, 1447, 1512, 1513, 1596, 1602, 1636, 1759, 1800, 1825, 1837, 1864, 1933, 2065, 2075, 2156, 2204, 2342, 2361, 2401, 2445, 2483, 2485, 2493, 2517, 2572, 2748, 2753, 278

Traceback (most recent call last):
  File "/home/melika/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_29490/2850622492.py", line 21, in <module>
    classifier2.fit(x_train, y_train)
  File "/home/melika/.local/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  File "/home/melika/.local/lib/python3.8/site-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/melika/.local/lib/python3.8/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/melika/.local/lib/python3.8/site-packages/joblib/parallel.py", line 819, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "/home/melika/.local/lib/python3.8/site-packages/joblib/_p