In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle
import os
from sklearn import datasets
from sklearn.semi_supervised import label_propagation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Presumably the total number of rows available in the full data set
# (not referenced by any other visible cell) — TODO confirm.
N_examples = 890393 # total

# Number of examples create_dataset() collects from the CSV before it stops.
N_samples = 100000

def create_dataset(balanced=False, path='train_filtered.csv', n_samples=None):
    """Build a TF-IDF feature matrix and label vector from a ``label,text`` CSV.

    Each line of the file is expected to be ``<int label>,<text>``. Lines are
    read in file order until ``n_samples`` examples have been collected or the
    file is exhausted.

    Parameters
    ----------
    balanced : bool, default False
        When True, at most ``n_samples / 2`` examples with label ``-1`` are
        kept; further ``-1`` lines are skipped. Other labels are not capped,
        so the result is only balanced if the file holds enough of each class.
    path : str, default 'train_filtered.csv'
        Location of the CSV file to read.
    n_samples : int or None, default None
        Maximum number of examples to collect. ``None`` means use the
        module-level ``N_samples``.

    Returns
    -------
    X : numpy.ndarray
        Dense TF-IDF matrix with one row per collected example.
    y : numpy.ndarray
        Float label vector with the same number of rows as ``X``; label
        ``-1`` is recoded to ``0``.

    Raises
    ------
    ValueError
        If a non-blank line has no comma or its label is not an integer.
    """
    if n_samples is None:
        n_samples = N_samples

    labels = []
    corpus = []
    n_neg = 0

    # The context manager guarantees the file is closed, even on a parse error.
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            # Split on the first comma only so commas inside the text survive.
            label, text = line.split(',', 1)
            label = int(label)

            if label == -1:
                n_neg += 1
                if balanced and n_neg > n_samples / 2:
                    continue

            corpus.append(text)
            labels.append(label)

            if len(corpus) >= n_samples:
                break

    # Building y from the collected labels keeps it the same length as the
    # corpus even when the file yields fewer than n_samples examples.
    y = np.array(labels, dtype=float)

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.5, stop_words='english')
    X = vectorizer.fit_transform(corpus)

    # Recode the negative class from -1 to 0 so that -1 stays free to mark
    # unlabeled instances.
    y[y == -1] = 0

    # NOTE(review): the dense copy is large (rows x vocabulary float64 values);
    # callers that can work with the sparse matrix should avoid it.
    return X.toarray(), y

def load_dataset(fname="dataset.pickle"):
    """Return ``(X, y)``, using a pickle file as an on-disk cache.

    If ``fname`` exists it is unpickled and returned. Otherwise the dataset is
    built with ``create_dataset()`` (default arguments) and written to
    ``fname`` for subsequent calls.

    Parameters
    ----------
    fname : str, default "dataset.pickle"
        Path of the cache file.

    Returns
    -------
    X, y
        The two elements of the cached or freshly created dataset tuple.
    """
    if os.path.exists(fname):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # this notebook wrote itself.
        with open(fname, 'rb') as handle:
            dataset = pickle.load(handle)
    else:
        dataset = create_dataset()
        with open(fname, 'wb') as handle:
            pickle.dump(dataset, handle)
    X, y = dataset
    return X, y


In [2]:
# Build the features and labels directly from the CSV with the -1 class capped.
# This bypasses load_dataset(), whose cache is built with balanced=False.
X, y = create_dataset(balanced=True)

In [3]:
# Sanity check: dimensions of the dense feature matrix.
X.shape

(100000, 97374)

In [4]:
from sklearn.feature_selection import RFE

In [8]:
from sklearn.svm import SVR
# Recursive feature elimination driven by a linear-kernel SVR, configured to
# keep 5000 features with step=10.
# NOTE(review): SVR is a regressor while y holds class labels — confirm that a
# regressor (rather than a linear classifier) is intended here.
estimator = SVR(kernel="linear")
selector = RFE(estimator=estimator , n_features_to_select=5000, step=10)

In [None]:
# Fit the selector and keep only the selected feature columns.
# NOTE(review): Xr is not used by any later visible cell; the experiment cell
# rebuilds X from the CSV instead — confirm whether Xr should be used there.
Xr = selector.fit_transform(X, y)

In [None]:
# Specify the parameters here:
# NOTE(review): n_labeled, batch_size, copy, split_train_test, IdealLabeler,
# UncertaintySampling, RandomSampling, QUIRE and run are used below but are not
# defined or imported in any visible cell — confirm where they come from.

test_size = 0.33    # the fraction of samples in the dataset that will be
                    # randomly selected and assigned to the test set
# Rebuild the dataset from the CSV; this replaces the X, y of the earlier cells.
X, y = create_dataset(balanced=True)

trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
    split_train_test(X, y, test_size, n_labeled)
# Each query strategy gets its own independent copy of the training set.
trn_ds2 = copy.deepcopy(trn_ds)
trn_ds3 = copy.deepcopy(trn_ds)

# Presumably an oracle that answers label queries from the fully labeled
# training set — TODO confirm against the IdealLabeler implementation.
lbr = IdealLabeler(fully_labeled_trn_ds)

quota = len(y_train) - n_labeled    # number of samples to query

# Comparing the UncertaintySampling strategy with RandomSampling and QUIRE.
# model is the base learner, e.g. LogisticRegression, SVM ... etc.
# The query strategy holds its own LogisticRegression instance, separate from
# model1, which is the one handed to run().
qs = UncertaintySampling(trn_ds, method='lc', model=LogisticRegression(class_weight='balanced'))
model1 = LogisticRegression(class_weight='balanced')
E_in_1, E_out_1 = run(trn_ds, tst_ds, lbr, model1, qs, quota, batch_size)

qs2 = RandomSampling(trn_ds2)
model2 = LogisticRegression(class_weight='balanced')
E_in_2, E_out_2 = run(trn_ds2, tst_ds, lbr, model2, qs2, quota, batch_size)

qs3 = QUIRE(trn_ds3)
model3 = LogisticRegression(class_weight='balanced')
E_in_3, E_out_3 = run(trn_ds3, tst_ds, lbr, model3, qs3, quota, batch_size)


In [None]:
# Plot the learning curves of UncertaintySampling, RandomSampling and QUIRE.
# The x-axis is the number of batches, and the y-axis is the corresponding
# error rate.
import math
query_num = np.arange(1, quota + 1)
n_batches = math.floor(quota / batch_size)
batch_num = np.arange(1, n_batches + 1)

# One colour per strategy; solid line = training error, dashed = test error.
# (Previously random and QUIRE shared the same colour and could not be told
# apart.)
plt.plot(batch_num, E_in_1, 'b-', label='qs trainE')
plt.plot(batch_num, E_in_2, 'r-', label='random trainE')
plt.plot(batch_num, E_in_3, 'g-', label='QUIRE trainE')

plt.plot(batch_num, E_out_1, 'b--', label='qs testE')
plt.plot(batch_num, E_out_2, 'r--', label='random testE')
plt.plot(batch_num, E_out_3, 'g--', label='QUIRE testE')
plt.xlabel('Number of Batches')
plt.ylabel('Error')
plt.title('Experiment Result')
# Three columns give one train/test row pair for the six curves; the legend is
# placed below the x-axis label so the two do not overlap.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
           fancybox=True, shadow=True, ncol=3)
plt.show()

In [None]:
# Print one confusion matrix per query strategy, computed on the full X, y.
print("Confusion matrices")
for name, m in zip(['uncert', 'random', 'QUIRE'], [model1, model2, model3]):
    # Predict with the model of the current iteration; the earlier version
    # always used model2, so all three matrices were identical.
    pred_labels = m.predict(X)
    print(name)
    cm = confusion_matrix(y, pred_labels)
    print(cm)