In [1]:
# Convert the VW-format training file into a two-column "label,text" CSV.
# For every input line, the first whitespace-separated token is written as
# the label and tokens from the fourth onward are re-joined with single
# spaces as the text; the second and third tokens are dropped.
# Both files are opened in a `with` block so the CSV is flushed and closed
# before any later cell reads it.
with open('train_filtered.vw') as vw_file, \
        open('train_filtered.csv', 'w') as output:
    for line in vw_file:
        items = line.split()
        if not items:
            # Blank line: nothing to convert (indexing items[0] would fail).
            continue
        label = items[0]
        rest = ' '.join(items[3:])
        output.write('%s,%s\n' % (label, rest))


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn import datasets
from sklearn.semi_supervised import label_propagation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Expected number of examples in train_filtered.csv. Kept for reference;
# create_dataset() sizes the label vector from the lines it actually reads.
N_examples = 890393


def create_dataset():
    """Build the bag-of-words features and label vector from the CSV file.

    Reads ``train_filtered.csv``, where each line is ``label,text``, and
    vectorizes the text with a unigram ``CountVectorizer`` (``min_df=100``,
    at most 1000 features).

    Returns
    -------
    X
        The document-term count matrix returned by
        ``CountVectorizer.fit_transform``, one row per input line.
    y : numpy.ndarray of float
        One label per input line, with every ``-1`` label rewritten to ``0``.

    Raises
    ------
    ValueError
        If a line contains no comma or its label is not an integer.
    """
    labels = []
    corpus = []

    with open('train_filtered.csv') as csv_file:
        for line in csv_file:
            # Split on the first comma only: the text part may itself
            # contain commas, which would break a plain two-value unpack.
            label, text = line.split(',', 1)
            corpus.append(text)
            labels.append(int(label))

    # Float dtype matches the array previously allocated with np.zeros.
    y = np.array(labels, dtype=float)

    vectorizer = CountVectorizer(ngram_range=(1, 1),
                                 token_pattern=r'\b\w+\b',
                                 min_df=100,
                                 max_features=1000)
    X = vectorizer.fit_transform(corpus)

    # Remap the -1 class to 0 so that -1 stays free to mark unlabeled
    # instances in the semi-supervised / active-learning code.
    y[y == -1] = 0

    return X, y

def load_dataset():
    """Return the full ``(X, y)`` dataset, caching it in ``dataset.pickle``.

    If ``dataset.pickle`` exists in the working directory it is unpickled
    and returned; otherwise the dataset is built with ``create_dataset()``
    and written to that file for subsequent runs.

    Returns
    -------
    tuple
        The ``(X, y)`` pair stored in (or just written to) the cache file.
    """
    fname = "dataset.pickle"
    if os.path.exists(fname):
        # NOTE: unpickling can execute arbitrary code; only load a cache
        # file that was produced by this notebook.
        with open(fname, 'rb') as cache_file:
            dataset = pickle.load(cache_file)
    else:
        dataset = create_dataset()
        # The context manager guarantees the cache is flushed and closed.
        with open(fname, 'wb') as cache_file:
            pickle.dump(dataset, cache_file)
    X, y = dataset
    return X, y


In [8]:
# Load the full feature matrix and label vector (read from the pickle cache
# when it exists, otherwise built from the CSV and cached).
X_full, y_full = load_dataset()

In [10]:
def create_dataset_reduced(n_reduced_samples=5000, seed=0):
    """Create a reduced dataset by randomly subsampling the full one.

    The row indices of the full dataset are shuffled with a seeded
    ``numpy.random.RandomState`` and the first ``n_reduced_samples`` of them
    are kept, so the subset is reproducible for a given seed.

    Parameters
    ----------
    n_reduced_samples : int, optional
        Number of rows to keep (default 5000). If the full dataset has fewer
        rows, all of them are returned.
    seed : int, optional
        Seed for the random shuffle (default 0).

    Returns
    -------
    X : numpy.ndarray
        Dense feature matrix for the selected rows (``.toarray()`` is called
        on the selected rows of the full matrix).
    y : numpy.ndarray
        Labels for the same rows, in the same order.
    """
    X_full, y_full = load_dataset()

    rng = np.random.RandomState(seed)
    indices = np.arange(len(y_full))
    rng.shuffle(indices)

    selected = indices[:n_reduced_samples]
    X = X_full[selected].toarray()
    y = y_full[selected]

    return X, y

def load_dataset_reduced():
    """Return the reduced ``(X, y)`` dataset, caching it on disk.

    If ``dataset_reduced.pickle`` exists in the working directory it is
    unpickled and returned; otherwise the reduced dataset is built with
    ``create_dataset_reduced()`` and written to that file.

    Returns
    -------
    tuple
        The ``(X, y)`` pair stored in (or just written to) the cache file.
    """
    fname = "dataset_reduced.pickle"
    if os.path.exists(fname):
        # NOTE: unpickling can execute arbitrary code; only load a cache
        # file that was produced by this notebook.
        with open(fname, 'rb') as cache_file:
            dataset = pickle.load(cache_file)
    else:
        dataset = create_dataset_reduced()
        # The context manager guarantees the cache is flushed and closed.
        with open(fname, 'wb') as cache_file:
            pickle.dump(dataset, cache_file)
    X, y = dataset
    return X, y


In [11]:
# Load the reduced dataset used by the rest of the notebook (read from the
# pickle cache when it exists, otherwise subsampled and cached).
X, y = load_dataset_reduced()

In [12]:
# Sanity check: shape of the reduced dense feature matrix.
X.shape

(5000, 1000)

In [None]:
# ==========================================================
# Active learning with uncertainty sampling (logistic model)
# ==========================================================
#
# Starting from a small labeled set (the first `n_labeled_points` rows of
# the reduced dataset), each iteration:
#   1. fits a LogisticRegression on the currently labeled rows only,
#   2. reports its performance on the still-unlabeled pool,
#   3. moves the `n_queries_per_iteration` pool rows with the highest
#      predictive entropy into the labeled set (their true labels in `y`
#      are used from the next iteration on).

n_total_samples = len(y)
n_labeled_points = 100
n_iterations = 20
n_queries_per_iteration = 5

# Indices into X / y of the rows whose labels are hidden from the model.
unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]

for i in range(n_iterations):
    if len(unlabeled_indices) == 0:
        # Every row has been labeled; nothing left to query.
        break

    # Train only on labeled rows. Fitting on rows marked -1 would make the
    # classifier learn "unlabeled" as if it were a real class.
    labeled_mask = np.ones(n_total_samples, dtype=bool)
    labeled_mask[unlabeled_indices] = False

    model = LogisticRegression()
    model.fit(X[labeled_mask], y[labeled_mask])

    X_unlabeled = X[unlabeled_indices]
    predicted_labels = model.predict(X_unlabeled)
    true_labels = y[unlabeled_indices]

    cm = confusion_matrix(true_labels, predicted_labels,
                          labels=model.classes_)

    print('Iteration %i %s' % (i, 70 * '_'))
    print("Logistic regression model: %d labeled & %d unlabeled (%d total)"
          % (n_labeled_points, n_total_samples - n_labeled_points,
             n_total_samples))

    print(classification_report(true_labels, predicted_labels))

    print("Confusion matrix")
    print(cm)

    # Entropy of the predicted class distribution for each pool row.
    # stats.entropy reduces along axis 0, hence the transpose to
    # (n_classes, n_pool_rows).
    pred_entropies = stats.entropy(model.predict_proba(X_unlabeled).T)

    # Positions (within unlabeled_indices) of the most uncertain pool rows.
    query_positions = np.argsort(pred_entropies)[-n_queries_per_iteration:]

    # Removing them from the pool makes their true labels available to the
    # next fit. The positions are integers, as np.delete requires.
    unlabeled_indices = np.delete(unlabeled_indices, query_positions)
    n_labeled_points = n_total_samples - len(unlabeled_indices)
