In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle
import os
from sklearn import datasets
from sklearn.semi_supervised import label_propagation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Presumably the total number of rows in the training file (per the original
# "total" note); it is not referenced by any of the visible cells.
N_examples = 890393 # total

# Number of rows create_dataset() collects before it stops reading the file.
N_samples = 100000

def create_dataset(balanced=False):
    """Build a dense TF-IDF feature matrix and a label vector from ``train_filtered.csv``.

    Every line of the file is parsed as ``<label>,<text>`` where ``<label>`` is
    an integer (negatives are stored as -1 in the file). Reading stops as soon
    as ``N_samples`` rows have been kept.

    Parameters
    ----------
    balanced : bool, default False
        When True, at most ``N_samples / 2`` rows labelled -1 are kept; any
        further negative rows are skipped. Rows with other labels are not
        capped.

    Returns
    -------
    X : numpy.ndarray
        Dense TF-IDF matrix (uni- and bi-grams), one row per kept line.
    y : numpy.ndarray
        Float label vector aligned with the rows of ``X``; the file's -1
        labels are remapped to 0.

    Raises
    ------
    ValueError
        If a line has no comma or its label field is not an integer.
    """
    corpus = []
    labels = []
    n_neg = 0

    # `with` guarantees the file is closed even if parsing fails half-way.
    with open('train_filtered.csv') as csv_file:
        for line in csv_file:
            # Split on the first comma only: the text itself may contain commas.
            label, text = line.split(',', 1)
            label = int(label)

            if label == -1:
                n_neg += 1
                if balanced and n_neg > N_samples / 2:
                    continue

            corpus.append(text)
            labels.append(label)

            if len(corpus) == N_samples:
                break

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.5, stop_words='english')

    X = vectorizer.fit_transform(corpus)

    # Size y from the rows actually read so it always matches X, even when the
    # file holds fewer than N_samples usable rows.
    y = np.array(labels, dtype=float)

    # Remap the negative class from -1 to 0, which leaves -1 free to mark
    # unlabeled instances.
    y[y == -1] = 0

    return X.toarray(), y

def load_dataset():
    """Return ``(X, y)``, using ``dataset.pickle`` in the working directory as a cache.

    If the cache file exists it is unpickled and returned; otherwise the
    dataset is built with ``create_dataset()`` and written to the cache file
    before being returned.

    Returns
    -------
    X, y
        The two elements of the cached (or freshly created) dataset tuple.
    """
    fname = "dataset.pickle"
    if os.path.exists(fname):
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file -- only load a cache that this notebook wrote itself.
        with open(fname, 'rb') as cache_file:
            dataset = pickle.load(cache_file)
    else:
        dataset = create_dataset()
        with open(fname, 'wb') as cache_file:
            # Protocol 4+ is needed for objects larger than 4 GiB, which a
            # dense feature matrix can easily exceed.
            pickle.dump(dataset, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    X, y = dataset
    return X, y


In [2]:
#!/usr/bin/env python3
"""
The script helps guide the users to quickly understand how to use
libact by going through a simple active learning task with clear
descriptions.
"""

import copy
import os

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split

# libact classes
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models import *
from libact.query_strategies import *
from libact.labelers import IdealLabeler


def run(trn_ds, tst_ds, lbr, model, qs, quota, batch_size):
    """Run ``quota`` active-learning queries, retraining after every full batch.

    For each query the strategy ``qs`` picks a sample index, the labeler
    ``lbr`` labels that sample's features, and ``trn_ds`` is updated with the
    answer. Once ``batch_size`` answers have accumulated, ``model`` is
    retrained on ``trn_ds`` and its error (``1 - score``) on the training and
    test datasets is recorded. Queries left over in an incomplete final batch
    update ``trn_ds`` but do not trigger retraining.

    Returns
    -------
    (E_in, E_out)
        Training and test error after each retraining. Both are empty lists
        if no batch was completed, otherwise numpy arrays.
    """
    E_in, E_out = [], []
    answered_since_training = 0

    for _step in range(quota):
        # Standard usage of libact objects
        ask_id = qs.make_query()
        features, _labels = zip(*trn_ds.data)
        trn_ds.update(ask_id, lbr.label(features[ask_id]))

        answered_since_training += 1
        if answered_since_training != batch_size:
            continue

        # A full batch has been labeled: retrain and log both error rates.
        model.train(trn_ds)
        answered_since_training = 0
        E_in = np.append(E_in, 1 - model.score(trn_ds))
        E_out = np.append(E_out, 1 - model.score(tst_ds))

    return E_in, E_out


def split_train_test(X, y, test_size, n_labeled, random_state=None):
    """Split ``(X, y)`` into train/test parts and wrap them as libact datasets.

    Parameters
    ----------
    X, y
        Feature matrix and label vector to split.
    test_size
        Forwarded to ``train_test_split`` as the size of the test part.
    n_labeled : int
        Number of leading training samples that keep their label; the labels
        of all remaining training samples are replaced by ``None``
        (presumably libact's marker for "unlabeled" -- confirm against the
        libact ``Dataset`` documentation).
    random_state : optional
        Forwarded to ``train_test_split`` so the split can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (trn_ds, tst_ds, y_train, fully_labeled_trn_ds)
        The partially labeled training dataset, the test dataset, the true
        training labels, and a training dataset carrying all true labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Keep the first n_labeled labels and hide the rest behind None.
    n_hidden = len(y_train) - n_labeled
    partial_labels = np.concatenate([y_train[:n_labeled], [None] * n_hidden])

    trn_ds = Dataset(X_train, partial_labels)
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)

    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds


In [3]:
# Build the dataset with the negative class capped at half of N_samples.
X, y = create_dataset(balanced=True)

In [4]:
# Dimensions of the dense TF-IDF matrix returned by create_dataset().
X.shape

(100000, 97374)

In [5]:
# Number of rows whose label is 1.
sum(y==1)

50000

In [None]:
# Specify the parameters here:

test_size = 0.33    # the fraction of samples in the dataset that will be
                    # randomly selected and assigned to the test set
n_labeled = 100     # number of samples that are initially labeled
batch_size = 100    # number of new queries to fetch before retraining the classifier
# NOTE: n_labeled and batch_size are not used in this cell; they match the
# parameter names of split_train_test() and run() defined earlier.

# Load dataset
# (load_dataset_reduced is not defined in the visible cells; load_dataset()
# would read/write the dataset.pickle cache instead of rebuilding.)
# X, y = load_dataset_reduced()
# X, y = load_dataset()
X, y = create_dataset(balanced=True)

# Hold out `test_size` of the rows for evaluation. No random_state is passed,
# so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

# Supervised baseline trained on the fully labeled training split.
model = LogisticRegression(class_weight='balanced')

model.fit(X_train, y_train)



In [None]:
# Evaluate the baseline on the held-out split. Predictions are made for
# X_test, so they must be compared with y_test -- comparing against the full
# label vector y mismatches lengths and fails.
print("Confusion matrix")
pred_labels = model.predict(X_test)
cm = confusion_matrix(y_test, pred_labels)
print(cm)

# Fraction of misclassified test samples.
error = np.mean(y_test != pred_labels)