In [0]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import check_classification_targets
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import KFold

### Data loading and hyperparameters

In [0]:
# Distance metric used when comparing documents against class centroids.
metric = 'euclidean'

# Newsgroup categories included in the experiment.
cats = [
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.space',
    'rec.motorcycles',
    'misc.forsale',
]

# Fetch the training split for those categories, with headers, footers and
# quotes removed from each post.
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

# Fit the TF-IDF vectorizer on the training documents and vectorize them.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

In [14]:
# Validate the targets, then encode them as contiguous indices 0..n_classes-1.
check_classification_targets(y)
le = LabelEncoder()
y_indices = le.fit_transform(y)
classes = le.classes_
n_classes = classes.size
n_samples, n_features = X.shape

# One centroid row per class.
centroids = np.empty((n_classes, n_features), dtype=np.float64)
# Number of training samples that fall in each class.
n_cluster = np.zeros(n_classes)

for class_idx in range(n_classes):
    # Boolean mask selecting the samples that belong to this class.
    member_mask = y_indices == class_idx
    n_cluster[class_idx] = member_mask.sum()
    # The centroid is the feature-wise mean of the class members.
    centroids[class_idx] = X[member_mask].mean(axis=0)

(3562, 36156)


In [0]:
def get_vectorizer_array(query):
    """Vectorize a single query with the module-level ``vectorizer``.

    The query is wrapped in a one-element list, passed to
    ``vectorizer.transform`` and the result is converted with ``.toarray()``
    before being returned.
    """
    transformed = vectorizer.transform([query])
    return transformed.toarray()


def pred(X):
    """Predict a class label for each row of ``X`` by nearest centroid.

    Distances from every row of ``X`` to every row of the module-level
    ``centroids`` are computed with the module-level ``metric``; each row is
    assigned the entry of ``classes`` whose centroid has the smallest distance.
    """
    distances = pairwise_distances(X, centroids, metric=metric)
    nearest_idx = distances.argmin(axis=1)
    return classes[nearest_idx]



In [20]:

# Evaluate on the test split, fetched with the same categories and the same
# `remove` options as the training data.
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats, remove=('headers', 'footers', 'quotes'))
x_testdata = newsgroups_test.data
y_test = newsgroups_test.target

# Vectorize all test documents in a single call and predict them in one batch,
# instead of transforming and densifying one document at a time. The matrix is
# kept sparse; NOTE(review): sparse input is supported by scikit-learn's native
# metrics (including the 'euclidean' metric configured above) -- confirm before
# switching `metric` to a SciPy-only distance.
y_pred = pred(vectorizer.transform(x_testdata))

# Compare predicted and true labels directly; no string conversion is needed.
correct = int(np.sum(y_pred == y_test))

# Accuracy as a percentage of the test documents.
result = str(correct / len(x_testdata) * 100) + " %"

print("Accuracy before K-Folding: %s" % result)

Accuracy before K-Folding: 76.45569620253164%


In [28]:
def nearest_centroid_fold_accuracy(train_docs, train_labels, test_docs, test_labels, distance_metric):
    """Fit TF-IDF + nearest-centroid on one fold and score it on the held-out part.

    Everything fitted here (vectorizer, label encoder, centroids) is local to
    the call, so evaluating a fold does not overwrite the notebook-level
    ``vectorizer``, ``centroids`` or ``classes``.

    Parameters
    ----------
    train_docs, test_docs : sequences of raw text documents.
    train_labels, test_labels : array-likes of class labels aligned with the docs.
    distance_metric : metric passed to ``pairwise_distances``.

    Returns
    -------
    float
        Percentage of ``test_docs`` whose nearest centroid matches their label.
    """
    # The vocabulary is learned from the training part of the fold only.
    fold_vectorizer = TfidfVectorizer()
    train_features = fold_vectorizer.fit_transform(train_docs)

    encoder = LabelEncoder()
    label_indices = encoder.fit_transform(train_labels)
    fold_classes = encoder.classes_

    # One centroid per class: the feature-wise mean of that class's documents.
    fold_centroids = np.empty((fold_classes.size, train_features.shape[1]), dtype=np.float64)
    for class_idx in range(fold_classes.size):
        fold_centroids[class_idx] = train_features[label_indices == class_idx].mean(axis=0)

    # Vectorize and classify the whole held-out part in one batch.
    test_features = fold_vectorizer.transform(test_docs)
    nearest_idx = pairwise_distances(test_features, fold_centroids, metric=distance_metric).argmin(axis=1)
    n_correct = int(np.sum(fold_classes[nearest_idx] == test_labels))
    return n_correct / len(test_labels) * 100


# Use dedicated names for the full corpus so the TF-IDF matrix `X`, the labels
# `y` and the `newsgroups` training split from the earlier cells stay intact.
all_newsgroups = fetch_20newsgroups(subset='all', categories=cats, remove=('headers', 'footers', 'quotes'))
all_docs = np.asarray(all_newsgroups.data)
all_labels = np.asarray(all_newsgroups.target)
print(all_docs.shape)

K = 5
kf = KFold(n_splits=K)
fold_accuracies = []
for train_index, test_index in kf.split(all_docs):
    fold_accuracy = nearest_centroid_fold_accuracy(
        all_docs[train_index],
        all_labels[train_index],
        all_docs[test_index],
        all_labels[test_index],
        metric,
    )
    fold_accuracies.append(fold_accuracy)

    # Print the accuracy of this fold in percent.
    result = str(fold_accuracy) + "%"
    print("Accuracy after %d-Folding: %s" % (K, result))

# Summarize the cross-validation with the mean over all folds.
print("Mean accuracy over %d folds: %s%%" % (K, np.mean(fold_accuracies)))



(5932,)
Accuracy after 5-Folding: 71.44060657118787%
Accuracy after 5-Folding: 72.03032855939342%
Accuracy after 5-Folding: 73.10286677908938%
Accuracy after 5-Folding: 72.76559865092749%
Accuracy after 5-Folding: 72.93423271500843%
