In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from math import ceil
from random import choice
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.datasets import load_breast_cancer


# Dataset descriptors: (path to the data file, column names, field separator).
# Every column list contains a 'class' entry naming the label column.
wine = (
    './datasets/wine/wine.data',
    ['class', 'alcohol', 'ma', 'ash', 'aoa', 'mg', 'tp', 'fl', 'np', 'pr',
     'color', 'hue', 'od', 'proline'],
    ',',
)

mammographic = (
    './datasets/mammographic/mammographic_masses.data',
    ['birads', 'age', 'shape', 'margin', 'density', 'class'],
    ',',
)

# The only descriptor here that uses a space as the field separator.
heart = (
    './datasets/stalog_dataset/heart.dat',
    ['age', 'sex', 'cp', 'rbp', 'chol', 'sug', 'rer', 'mhra', 'eia', 'op',
     'slop', 'mvess', 'thal', 'class'],
    ' ',
)

breast_cancer = (
    './datasets/breast_cancer_coimbra/dataR2.csv',
    ['age', 'bmi', 'gl', 'insul', 'homa', 'leptin', 'adi', 'resis', 'mcp', 'class'],
    ',',
)

iris = (
    './datasets/iris/iris.data',
    ['sl', 'sw', 'pl', 'pw', 'class'],
    ',',
)

# Features are simply numbered '1'..'34'; the label is the last column.
dermatology = (
    './datasets/dermatology/dermatology.data',
    [str(number) for number in range(1, 35)] + ['class'],
    ',',
)

# Features are simply numbered '1'..'34'; the label is the last column.
ionosphere = (
    './datasets/ionosphere/ionosphere.data',
    [str(number) for number in range(1, 35)] + ['class'],
    ',',
)

# Here the label is the first column, followed by features '1'..'4'.
balance = (
    './datasets/balance/balance.data',
    ['class'] + [str(number) for number in range(1, 5)],
    ',',
)

# Features '1'..'9' followed by the label column.
cmc = (
    './datasets/cmc/cmc.data',
    [str(number) for number in range(1, 10)] + ['class'],
    ',',
)

def read_dataset(path, column_names, replace_missing=False, sep=','):
    """Load a header-less delimited file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the data file.
    column_names : list of str
        Names assigned to the columns, in file order.
    replace_missing : bool, default False
        When true, every '?' entry is treated as missing and each column's
        missing entries are filled with that column's mean.
    sep : str, default ','
        Field separator passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed data; when ``replace_missing`` is true the frame is rebuilt
        from the imputer's output under the same column names.
    """
    frame = pd.read_csv(path, sep=sep, names=column_names, index_col=False)

    # Nothing to impute: hand back the parsed frame untouched.
    if not replace_missing:
        return frame

    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    filled_values = mean_imputer.fit_transform(frame.replace('?', np.nan))
    return pd.DataFrame(filled_values, columns=column_names)

def split_dataset(dataset, col_names, inital_learining_size, test_size):
    """Split a dataset into an initial training set, an AL pool and a test set.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data containing every column in ``col_names``, including 'class'.
    col_names : list of str
        All column names; 'class' is the label, the rest are features.
        The list itself is not modified.
    inital_learining_size : float
        Fraction of the data used for the initial training set.
    test_size : float
        ``test_size`` forwarded to the second split, which separates the
        remaining data into the active-learning pool and the test set.

    Returns
    -------
    tuple
        ``(x_initial_train, x_al_train, x_test,
        y_initial_train, y_al_train, y_test)``.
    """
    labels = dataset['class']  # class labels
    feature_names = col_names.copy()
    feature_names.remove('class')
    features = dataset[feature_names]  # feature columns

    # The initial training set should be chosen well: with a bad choice the
    # model may not be able to get out of a local minimum. This first split is
    # stratified on the labels.
    remaining_fraction = 1 - inital_learining_size
    x_initial_train, x_rest, y_initial_train, y_rest = train_test_split(
        features, labels, test_size=remaining_fraction, stratify=labels)

    # Carve the remaining data into the active-learning pool and the test set.
    x_al_train, x_test, y_al_train, y_test = train_test_split(
        x_rest, y_rest, test_size=test_size)

    return (x_initial_train, x_al_train, x_test, y_initial_train, y_al_train, y_test)

In [2]:
# For active learning we are interested in the support values (class probabilities) produced by the classifier.
# Documentation: https://scikit-learn.org/stable/modules/svm.html

def active_learning_session(clf, budget, x_initial_train,
                            x_al_train,
                            x_test,
                            y_initial_train,
                            y_al_train, y_test,
                            random_sampling=False,
                            random_state=None):
    """Run one pool-based active-learning session and return the test accuracy.

    On every query one sample is moved from the pool to the training set and
    the classifier is refitted on the enlarged training set.

    Parameters
    ----------
    clf : classifier
        Already fitted estimator exposing ``fit``, ``predict`` and, for
        uncertainty sampling, ``predict_proba``.
    budget : int or float
        Number of samples that may be labelled. Fractional values are
        truncated, so at most ``budget`` queries are ever made.
    x_initial_train : pandas.DataFrame
        Features of the samples that are labelled from the start.
    x_al_train : pandas.DataFrame
        Features of the unlabelled pool.
    x_test : pandas.DataFrame
        Features used for the final evaluation.
    y_initial_train : array-like
        Labels of ``x_initial_train``.
    y_al_train : pandas.Series (or DataFrame with a 'class' column)
        Labels of the pool, indexed like ``x_al_train``.
    y_test : array-like
        Labels of ``x_test``.
    random_sampling : bool, default False
        If true a random pool sample is queried; otherwise the sample whose
        highest predicted class probability is the lowest.
    random_state : int or None, default None
        Seed for random sampling. ``None`` uses NumPy's global random state.

    Returns
    -------
    float
        Accuracy of the final classifier on the test set.

    Notes
    -----
    The caller's frames are not modified; only ``clf`` is refitted.
    """
    # Accept a label frame as well as a plain Series.
    if isinstance(y_al_train, pd.DataFrame):
        y_al_train = y_al_train['class']

    # Truncate instead of looping with `<=`, which spent one label too many.
    # The tiny epsilon absorbs float error in products such as 0.29 * 100.
    n_queries = max(int(budget + 1e-9), 0)

    # A single generator for the whole session, so successive draws differ.
    rng = None if random_state is None else np.random.RandomState(random_state)

    for _ in range(n_queries):
        # The pool can run dry before the budget is used up.
        if len(x_al_train) == 0:
            break

        if random_sampling:
            # No predictions are needed to draw a random sample.
            selected_index = x_al_train.sample(1, random_state=rng).index
        else:
            # Least-confident sampling: lowest top-class probability.
            top_probability = pd.Series(
                np.asarray(clf.predict_proba(x_al_train)).max(axis=1),
                index=x_al_train.index)
            selected_index = [top_probability.idxmin()]

        # Move the queried sample from the pool to the training set.
        x_initial_train = pd.concat([x_initial_train, x_al_train.loc[selected_index]])
        y_initial_train = np.concatenate([y_initial_train, y_al_train.loc[selected_index]])
        x_al_train = x_al_train.drop(selected_index)
        y_al_train = y_al_train.drop(selected_index)

        # Refit on the enlarged training set.
        clf.fit(x_initial_train, y_initial_train)

    return accuracy_score(y_test, clf.predict(x_test))

In [3]:

def _linear_svc():
    """Build the unfitted linear SVC configuration shared by all three models."""
    return SVC(kernel='linear', probability=True, decision_function_shape='ovo')


def al_experiment(data, replace_missing=False, budget=0.03):
    """Compare uncertainty sampling, random sampling and a batch baseline.

    Parameters
    ----------
    data : tuple
        Dataset descriptor ``(path, column_names, separator)``;
        ``column_names`` must contain 'class'.
    replace_missing : bool, default False
        Forwarded to ``read_dataset``.
    budget : float, default 0.03
        Labelling budget as a fraction of the active-learning pool size.

    Returns
    -------
    pandas.DataFrame
        One row per split of the repeated stratified 2-fold cross-validation
        (5 repeats), holding test accuracies in the columns 'AL' (uncertainty
        sampling), 'Random' (random sampling) and 'Batch' (model trained on
        the pool without active learning).
    """
    path, column_names, sep = data
    dataset = read_dataset(path, column_names, replace_missing, sep=sep)

    feature_names = [name for name in column_names if name != 'class']
    X = dataset[feature_names]  # features
    y = dataset['class']        # labels

    rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=3682164)

    # Collected per-split results; the frame is built once at the end instead
    # of being enlarged row by row.
    rows = []

    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # 5% of the training fold is labelled up front, the rest is the pool.
        x_initial_train, x_al_train, y_initial_train, y_al_train = train_test_split(
            X_train, y_train, test_size=0.95, stratify=y_train, random_state=5)

        # Both active-learning variants start from the same initial model.
        clf_al = _linear_svc()
        clf_al.fit(x_initial_train, y_initial_train)

        clf_rand = _linear_svc()
        clf_rand.fit(x_initial_train, y_initial_train)

        # NOTE(review): the batch baseline is fitted on the pool only; the
        # initial samples are not included - confirm this is intended.
        clf_all = _linear_svc()
        clf_all.fit(x_al_train, y_al_train)

        # The budget is expressed as a fraction of the pool size.
        query_budget = budget * x_al_train.shape[0]

        al = active_learning_session(clf_al,
                                     query_budget,
                                     x_initial_train.copy(),
                                     x_al_train.copy(),
                                     X_test,
                                     y_initial_train.copy(),
                                     y_al_train.copy(),
                                     y_test,
                                     random_sampling=False)

        rnd = active_learning_session(clf_rand,
                                      query_budget,
                                      x_initial_train.copy(),
                                      x_al_train.copy(),
                                      X_test,
                                      y_initial_train.copy(),
                                      y_al_train.copy(),
                                      y_test,
                                      random_sampling=True)

        batch = accuracy_score(y_test, clf_all.predict(X_test))

        rows.append({'AL': al, 'Random': rnd, 'Batch': batch})

    return pd.DataFrame(rows, columns=['AL', 'Random', 'Batch'])

In [4]:
# Run the experiment on the cmc dataset once per labelling budget (a fraction
# of the active-learning pool) and store each result table in its own CSV.
budgets = [0.05, 0.10, 0.15, 0.20]

for budget in budgets:
    budget_results = al_experiment(cmc, budget=budget)
    budget_results.to_csv('cmc_%s.csv' % budget)