# Projekt 4 Super k-NN

Celem projektu jest stworzenie zupełnie nowego zespołowego klasyfikatora k-NN  i porównania jego jakości, czasów jego uczenia i odpowiedzi ze standardowym klasyfikatorem SVM. 

DoD.
Należy sporządzić raport z projektu.

1. Zbiór danych: TNG, ok 18000 próbek, 20 klas. Zbiór danych MNIST (70000 próbek 10 klas). Odnośnie TNG wykorzystujemy gotowe dane reprezentujące tekst blogów w postaci wektorów (dostarcza prowadzący). Dane dekorelujemy wykorzystując transformatę PCA.

2. Z jednego zbioru danych tworzymy kilka sub-zestawów danych (>=5 <=10) na różnych zestawach cech (maski mogą być losowane w sposób random, ale nie powinny być gęste). Maski mogą mieć różną długość. Prawdopodobieństwo wystąpienia cechy w zestawie może być proporcjonalne do jej istotności (np. mierzonej wielkością wartości własnych po transformacie PCA). Jednak nie może być takiej cechy, która nie dostała się do żadnego zestawu. 

3. Liczymy średnią przynależność każdej próbki do danej klasy na bazie klasyfikatora k-NN dla każdego sub-zestawu danych. Dokonujemy fuzji wyników klasyfikacji (jakiej?) dla każdej próbki po sub-zestawach danych. 

4. Jak zmieni się jakość klasyfikatora w zależności od k?

5. Wyniki jakości klasyfikatorów oceniać na bazie krzyżowej-walidacji, (accuracy - Loss-błąd, Krzywa ROC, Precision-Recall, (pola pod krzywymi) F1). 

## Utils

In [1]:
from sklearn.datasets import fetch_openml
import random
import numpy as np
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, f1_score, log_loss, plot_roc_curve, precision_recall_curve,
plot_precision_recall_curve, average_precision_score, hinge_loss, precision_score, recall_score, classification_report)
from sklearn.svm import SVC
from scipy import stats
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.ensemble import ExtraTreesClassifier

def divide_dataset(X: np.ndarray,
                   y: np.ndarray,
                   sub_num: int = 10,
                   rows_size: float = 0.6,
                   cols_size: float = 0.7):
    # get feature importances
    tree = ExtraTreesClassifier()
    tree.fit(X, y)
    feature_importances = tree.feature_importances_
    feature_importances /= feature_importances.sum()

    # prepare indices
    row_indices = list(range(X.shape[0]))
    col_indices = list(range(X.shape[1]))
    num_rows = round(X.shape[0] * rows_size)
    num_cols = round(X.shape[1] * cols_size)

    used_cols = set()
    subparts = []
    col_masks = []
    
    for i in range(sub_num):
        # randomly, uniformly sample rows
        rows = np.random.choice(row_indices,
                                size=num_rows,
                                replace=True)

        # randomly sample X columns with probability distribution relative to
        # the features' importances
        cols = np.random.choice(col_indices,
                                size=num_cols,
                                replace=False,
                                p=feature_importances)
        if i == sub_num - 1:
            # force usage of columns not used before
            used_cols |= set(cols)
            not_used_cols = set(col_indices) - used_cols
            not_used_cols = np.fromiter(not_used_cols,
                                        int,
                                        len(not_used_cols))
            cols = np.sort(np.concatenate((cols, not_used_cols)))

        
        X_part = X[rows, :]
        X_part = X_part[:, cols]
        
        y_part = y[rows]

        used_cols |= set(cols)
        subparts.append((X_part, y_part))
        col_masks.append(cols)
        
    return subparts, col_masks

In [3]:
class SuperKNN:
    def __init__(self, classes_count, n_neighbors=3, classifiers=[], masks=[],
                 current_predict_proba=[]):
        self.__classes_count = classes_count
        self.__n_neighbors = n_neighbors
        self.__classifiers = classifiers
        self.__masks = masks
        self.__current_predict_proba = current_predict_proba
    
    def fit(self, x_train, y_train, rows_size=0.8):
        divided_dataset_with_masks = divide_dataset(x_train, y_train, rows_size=rows_size)
        self.__classifiers = []
        self.__fit_classifiers(divided_dataset_with_masks[0])  
        self.__masks = divided_dataset_with_masks[1]
        
    def __fit_classifiers(self, dataset):
        for row in dataset:
            x, y = row
            neigh = KNeighborsClassifier(n_neighbors=self.__n_neighbors)
            neigh.fit(x, y)
            self.__classifiers.append(neigh)

    def predict(self, x):
        if not self.__masks:
            raise Exception('You need to call fit first')
        self.__current_predictions = []
        
        for row in x:
            current_result = int(np.argmax(self.__get_votes(row)))
            self.__current_predictions.append(current_result)
        return self.__current_predictions
    
    def __get_votes(self, row):
        votes = [0] * self.__classes_count

        for i, c in enumerate(self.__classifiers):
            
            # match shape to train set shape
            temp_x = row[self.__masks[i]] 

            for prediction in c.predict([temp_x]):
                votes[int(prediction)] += 1

        return votes
    
    def __get_current_predict_proba(self, row):
        votes = self.__get_votes(row)
        return [x / self.__classes_count for x in votes]
    
    def predict_proba(self, x):
        self.__current_predict_proba = []
        
        for row in x:
            cur_pred_proba = self.__get_current_predict_proba(row)
            self.__current_predict_proba.append(cur_pred_proba)
        return self.__current_predict_proba
    
    def decision_function(self, x_test):
        return self.predict_proba(x_test)
    
    @property
    def classifiers(self):
        return self.__classifiers
    
    @classifiers.setter
    def classifiers(self, classifiers):
        self.__classifiers = classifiers
    
    def get_params(self, deep=True):
        return {"classes_count": self.__classes_count, "n_neighbors": self.__n_neighbors,
                "classifiers": self.__classifiers, "masks": self.__masks, 
                "current_predict_proba": self.__current_predict_proba }
        

In [4]:
class ReportCreator():
    def __init__(self):
        self.columns = ['name',
                        'time',
                        'accuracy (cross-val)',
                        'accuracy',
                        'f1-score',
                        'precision',
                        'recall',
                        'loss'
                        ]
        self.base_df = pd.DataFrame(columns = self.columns)
        
    def get_df_row(self, report):
        return pd.DataFrame(report, columns=self.columns, index=[0])
    
    def concat_new_row(self, row_report):
        self.base_df = pd.concat([self.base_df, self.get_df_row(row_report)], ignore_index=True)

    def create_partial_report(self, classifier, X_train, X_test, y_train, y_test, result, end, start, kernel=None, C=None, k=None):
        partial_report = classification_report(y_test, result, output_dict=True)['weighted avg']
        if kernel is None or C is None:
            partial_report['name'] = 'Super k-NN, k={}'.format(k)
        else:
            partial_report['name'] = 'SVM, C = {}, kernel: {}'.format(C, kernel)
        partial_report['time'] = f'{end - start} s'
        partial_report['accuracy'] = accuracy_score(y_true=y_test, y_pred=result)
        partial_report['f1-score'] = f1_score(y_true=y_test, y_pred=result, average='macro')
        partial_report['accuracy (cross-val)'] = np.mean(cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy'))
        partial_report['loss'] = log_loss(y_test, classifier.decision_function(X_test))
    
        self.concat_new_row(partial_report)
    
    def get_report(self):
        return self.base_df.sort_values(by='accuracy', ascending=False)


def evaluate_svm(report_creator, X_train, X_test, y_train, y_test, kernel='linear', C=1):
    start = time.time()
    classifier =  SVC(kernel=kernel, C=C, probability=True)
    classifier.fit(X_train, y_train)
    result = classifier.predict(X_test)
    end = time.time()

    report_creator.create_partial_report(classifier, X_train, X_test, y_train, y_test, result, end, start, kernel, C)

def evaluate_all(X_train, X_test, y_train, y_test):
    report_creator = ReportCreator()

    for C in [1, 5]:
        for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
            evaluate_svm(report_creator, X_train, X_test, y_train, y_test, C=C, kernel=kernel)
    return report_creator.get_report()

# MNIST

In [5]:
mnist = fetch_openml("mnist_784", data_home="data/mnist_784", cache=True)

### Super k-NN

In [6]:
zipped_mnist = list(zip(mnist.data, mnist.target))
mnist_random = random.sample(zipped_mnist, 7000)
x, y = zip(*(mnist_random))
x = np.asarray(x, dtype=np.int)
y = np.asarray(y, dtype=np.int)

In [7]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x) 

pca = PCA(n_components=30)
x_pca = pca.fit_transform(x_scaled)
x_train, x_test, y_train, y_test = train_test_split(x_pca, y, train_size=0.2, test_size=0.02, random_state=1)

In [8]:
report_creator = ReportCreator()

for i in [1, 3, 5, 10, 20]:
    start = time.time()
    sknn = SuperKNN(10, i)
    sknn.fit(x_train, y_train)
    result = sknn.predict(x_test)
    end = time.time()
    
    report_creator.create_partial_report(sknn, x_train, x_test, y_train, y_test, result, end, start, k=i)

report_creator.get_report()

Unnamed: 0,name,time,accuracy (cross-val),accuracy,f1-score,precision,recall,loss
1,"Super k-NN, k=3",0.852698564529419 s,0.858571,0.871429,0.867469,0.879797,0.871429,0.586515
0,"Super k-NN, k=1",0.8080718517303467 s,0.863571,0.857143,0.847774,0.857623,0.857143,1.284182
2,"Super k-NN, k=5",0.7628726959228516 s,0.859286,0.835714,0.835013,0.842509,0.835714,2.213377
3,"Super k-NN, k=10",0.7705655097961426 s,0.855,0.814286,0.809488,0.823784,0.814286,1.817662
4,"Super k-NN, k=20",0.8006076812744141 s,0.832857,0.785714,0.783972,0.805855,0.785714,2.280906


### SVM

In [9]:
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target, train_size=0.008, test_size=0.002, random_state=1)

In [10]:
scaler_mnist = StandardScaler().fit(X_train)
x_train = scaler_mnist.transform(X_train)
x_test = scaler_mnist.transform(X_test)

In [11]:
evaluate_all(X_train, X_test, y_train, y_test)

Unnamed: 0,name,time,accuracy (cross-val),accuracy,f1-score,precision,recall,loss
6,"SVM, C = 5, kernel: rbf",1.7420830726623535 s,0.898214,0.864286,0.851268,0.874909,0.864286,2.173525
2,"SVM, C = 1, kernel: rbf",1.5960636138916016 s,0.885714,0.85,0.839032,0.863135,0.85,2.172164
0,"SVM, C = 1, kernel: linear",0.7889511585235596 s,0.844643,0.828571,0.810836,0.840569,0.828571,2.182134
4,"SVM, C = 5, kernel: linear",0.8032171726226807 s,0.844643,0.828571,0.810836,0.840569,0.828571,2.182134
5,"SVM, C = 5, kernel: poly",1.3066213130950928 s,0.835714,0.828571,0.82231,0.848176,0.828571,2.170233
3,"SVM, C = 1, kernel: sigmoid",1.0922794342041016 s,0.826786,0.785714,0.771453,0.810759,0.785714,2.173838
7,"SVM, C = 5, kernel: sigmoid",0.7015371322631836 s,0.8125,0.778571,0.749644,0.790616,0.778571,2.186161
1,"SVM, C = 1, kernel: poly",1.1978685855865479 s,0.823214,0.764286,0.754023,0.795958,0.764286,2.42081


# TNG

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(shuffle=True, subset='train', random_state=42, categories=categories, remove=('headers', 'footers', 'quotes'))
twenty_test = fetch_20newsgroups(shuffle=True, subset='test', random_state=42, categories=categories, remove=('headers', 'footers', 'quotes'))

In [13]:
x_train = twenty_train.data[:3000]
y_train = twenty_train.target[:3000]
x_test = twenty_test.data[:600]
y_test = twenty_test.target[:600]

# convert text to vectors
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

### Super k-NN

In [16]:
x_train = preprocessing.scale(x_train, with_mean=False)
x_test = preprocessing.scale(x_test, with_mean=False)

svd = TruncatedSVD(n_components=70)
x_train = svd.fit_transform(x_train)
x_test = svd.transform(x_test)

In [17]:
report_creator = ReportCreator()

for i in [1, 3, 5, 10, 20]:
    start = time.time()
    sknn = SuperKNN(4, i)
    sknn.fit(x_train, y_train)
    result = sknn.predict(x_test)
    end = time.time()
    
    report_creator.create_partial_report(sknn, x_train, x_test, y_train, y_test, result, end, start, k=i)

report_creator.get_report()

Unnamed: 0,name,time,accuracy (cross-val),accuracy,f1-score,precision,recall,loss
1,"Super k-NN, k=3",3.530184745788574 s,0.661025,0.633333,0.630475,0.672147,0.633333,3.671825
3,"Super k-NN, k=10",3.4280014038085938 s,0.673422,0.631667,0.627277,0.681769,0.631667,4.455041
2,"Super k-NN, k=5",3.5425944328308105 s,0.672106,0.63,0.626106,0.657427,0.63,3.704864
0,"Super k-NN, k=1",3.5895490646362305 s,0.664573,0.616667,0.612409,0.652227,0.616667,3.84448
4,"Super k-NN, k=20",3.674116373062134 s,0.665015,0.61,0.602317,0.661508,0.61,4.887068


### SVM

In [18]:
evaluate_all(x_train, x_test, y_train, y_test)

Unnamed: 0,name,time,accuracy (cross-val),accuracy,f1-score,precision,recall,loss
0,"SVM, C = 1, kernel: linear",2.390442132949829 s,0.744335,0.715,0.708744,0.723791,0.715,2.192185
4,"SVM, C = 5, kernel: linear",8.264097690582275 s,0.739907,0.703333,0.695657,0.713193,0.703333,2.194563
6,"SVM, C = 5, kernel: rbf",2.2845520973205566 s,0.567102,0.545,0.510251,0.583742,0.545,4.183148
7,"SVM, C = 5, kernel: sigmoid",2.7038822174072266 s,0.51395,0.455,0.409336,0.494492,0.455,4.709762
2,"SVM, C = 1, kernel: rbf",2.5831878185272217 s,0.461661,0.44,0.364601,0.621249,0.44,5.166516
3,"SVM, C = 1, kernel: sigmoid",2.9830527305603027 s,0.431539,0.388333,0.314904,0.37811,0.388333,6.719651
1,"SVM, C = 1, kernel: poly",2.2965710163116455 s,0.269382,0.278333,0.108866,0.077469,0.278333,8.227491
5,"SVM, C = 5, kernel: poly",2.290579080581665 s,0.272484,0.278333,0.108866,0.077469,0.278333,8.235911
