In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import time
import pandas as pd
import pyspark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error
import sklearn.metrics
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
import logging

In [2]:
# When True, `save_parent_location` (set in the Input/Output directories cell) gets "sample/"
# appended, so everything derived from it points at the sample subdirectory.
IS_SAMPLE = False

### Training functions

In [3]:
def get_term_dictionary(terms):
    """
    Maps string terms to indexes in an array

    Every element of ``terms`` is given its own slot in ``[0, len(terms))``.
    The starting slot is ``hash(term) % len(terms)``; when that slot is taken
    the next free one is found by linear probing with wrap-around.

    :param terms: sized iterable of hashable terms (``len(terms)`` is used to
        size the slot table).
    :return: dict mapping each term to the slot index it was assigned. A term
        that occurs more than once consumes one slot per occurrence and ends
        up mapped to the slot of its last occurrence.
    """
    slot_count = len(terms)
    term_array = [None] * slot_count
    term_dictionary = {}

    def put(key):
        # One slot is consumed per input element and there are exactly
        # len(terms) slots, so a free slot always exists and the probe ends.
        slot = hash(key) % slot_count
        while term_array[slot] is not None:
            slot = (slot + 1) % slot_count
        term_array[slot] = key
        return slot

    for i, term in enumerate(terms, 1):
        term_dictionary[term] = put(term)
        if i % 10000 == 0:
            # Single-argument call form prints identically on Python 2 and 3.
            print("finished " + str(i))
    return term_dictionary

def jsonKV2str(x):
    """
    Change string keys to int

    Despite its name this converts *to* integers: for a dict, every key is
    passed through ``int`` and every ``unicode`` value is passed through
    ``int`` as well; other values are kept as they are. Anything that is not a
    dict is returned unchanged. Used as the ``object_hook`` for ``json.loads``.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        int_key = int(key)
        converted[int_key] = int(value) if isinstance(value, unicode) else value
    return converted

def get_json(json_postings):
    """Parse a JSON document given as a string and return the decoded object."""
    parsed = json.loads(json_postings)
    return parsed

def get_json_convert_num(json_postings):
    """
    Parse a JSON string, running every decoded JSON object through
    ``jsonKV2str`` (which turns its keys, and its unicode values, into ints).
    """
    decoder_options = {"object_hook": jsonKV2str}
    return json.loads(json_postings, **decoder_options)

def get_doc_index(term, postings_list, term_dictionary):
    """
    Turn one term's postings into per-document entries.

    :param term: the term the postings belong to.
    :param postings_list: mapping of doc_id -> value stored for ``term``.
    :param term_dictionary: mapping of term -> index.
    :return: list of ``(doc_id, {term_index: value})`` pairs, one per document
        in ``postings_list``.
    """
    doc_index = []
    for doc_id in postings_list:
        entry = {term_dictionary[term]: postings_list[doc_id]}
        doc_index.append((doc_id, entry))
    return doc_index

def get_classes(ipc_classification):
    """
    Collect the distinct section / class / subclass names of one document.

    :param ipc_classification: iterable of dicts with the keys ``'section'``,
        ``'class'`` and ``'subclass'``.
    :return: dict with the keys ``"sections"``, ``"classes"`` and
        ``"subclasses"``; each value is a list of unique names in order of
        first appearance. Class names look like ``"A-01"`` and subclass names
        like ``"A-01-B"``.
    """
    sections, classes, subclasses = [], [], []

    def add_once(names, name):
        # some documents repeat a classification, so keep only the first one
        if name not in names:
            names.append(name)

    for entry in ipc_classification:
        section_name = entry['section']
        class_name = section_name + "-" + entry['class']
        subclass_name = class_name + "-" + entry['subclass']
        add_once(sections, section_name)
        add_once(classes, class_name)
        add_once(subclasses, subclass_name)

    return {"sections": sections, "classes": classes, "subclasses": subclasses}


def compare_classifications(x,y):
    """
    Old-style comparison function ordering classifications by length first
    and by value second.

    :return: -1, 0 or 1 when ``x`` sorts before, equal to, or after ``y``.

    The ``cmp`` builtin no longer exists in Python 3, so the comparison is
    spelled out with ``>`` / ``<``, which gives the same result on Python 2.
    """
    # Tuples compare element-wise: lengths decide, values only break ties.
    x_key = (len(x), x)
    y_key = (len(y), y)
    return (x_key > y_key) - (x_key < y_key)


def get_error(svm, test_vectors):
    """
    Fraction of the points in ``test_vectors`` whose predicted label differs
    from the stored label.

    :param svm: object with a ``predict(features)`` method.
    :param test_vectors: collection offering ``map``, ``filter`` and ``count``
        (an RDD in this notebook) whose elements have ``label`` and
        ``features`` attributes.
    :return: misclassified count divided by total count, as a float.
    :raises ZeroDivisionError: when ``test_vectors`` is empty.
    """
    labels_and_preds = test_vectors.map(lambda p: (p.label, svm.predict(p.features)))
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are Python 2-only syntax (removed by PEP 3113).
    misclassified = labels_and_preds.filter(lambda label_pred: label_pred[0] != label_pred[1]).count()
    return misclassified / float(test_vectors.count())

# Element-wise thresholding of an array of scores: 1 where the score is > 0, otherwise 0.
get_binary = np.vectorize(lambda score: 1 if score > 0 else 0)

def get_row_top_N(y_score_row, y_true_row):
    """
    Number of top-ranked labels that must be taken, going down the scores in
    descending order, until every true label of the row has been seen.

    :param y_score_row: 1-d array with one score per label.
    :param y_true_row: 1-d array of the same length; entries equal to 1 mark
        the true labels.
    :return: 1-based rank of the lowest-ranked true label, or 0 when the row
        has no true label.
    """
    ranked_labels = np.argsort(y_score_row)[::-1]
    relevant = np.where(y_true_row == 1)[0]
    remaining = len(relevant)
    for rank, label_index in enumerate(ranked_labels, 1):
        if label_index in relevant:
            remaining -= 1
            if remaining == 0:
                return rank
    return 0


def get_metrics(y_true, y_binary_score, y_score=None):
    """
    Compute multi-label evaluation metrics.

    :param y_true: 2-d array (documents x labels) of 0/1 ground truth.
    :param y_binary_score: 2-d array of the same shape with 0/1 predictions.
    :param y_score: optional 2-d array of the same shape with the raw scores,
        used for the ``topN_*`` entries. When omitted, the notebook-global
        ``y_score`` is used, which is what this function always did before
        the parameter existed.
    :return: dict with ``coverage_error``, ``average_num_of_labels``,
        micro/macro ``average_precision_*``, ``precision_*``, ``recall_*``
        and ``f1_*``, per-label ``precision_scores_array`` /
        ``recall_scores_array`` / ``f1_scores_array`` (lists), and
        ``topN_list`` / ``topN_avg``.
    :raises NameError: when ``y_score`` is neither passed nor defined globally.
    """
    if y_score is None:
        # Backward compatibility: existing call sites pass two arguments and
        # rely on a module-level `y_score`. Prefer passing it explicitly.
        if 'y_score' not in globals():
            raise NameError("name 'y_score' is not defined; pass y_score to get_metrics()")
        y_score = globals()['y_score']

    label_count = y_true.shape[1]
    metrics = {}

    # sklearn's signature is coverage_error(y_true, y_score); the arguments used to be swapped.
    # NOTE(review): this and average_precision_score are ranking metrics but are fed the
    # thresholded 0/1 predictions; consider passing the raw scores instead.
    metrics['coverage_error'] = coverage_error(y_true, y_binary_score)
    # float() keeps this a true division even for integer label matrices on Python 2.
    metrics['average_num_of_labels'] = float(np.sum(y_true)) / y_true.shape[0]

    for average in ('micro', 'macro'):
        metrics['average_precision_' + average] = sklearn.metrics.average_precision_score(
            y_true, y_binary_score, average=average)
        metrics['precision_' + average] = sklearn.metrics.precision_score(
            y_true, y_binary_score, average=average)
        metrics['recall_' + average] = sklearn.metrics.recall_score(
            y_true, y_binary_score, average=average)
        metrics['f1_' + average] = sklearn.metrics.f1_score(
            y_true, y_binary_score, average=average)

    # Per-label scores, one entry per column.
    per_label_metrics = (
        ('precision_scores_array', sklearn.metrics.precision_score),
        ('recall_scores_array', sklearn.metrics.recall_score),
        ('f1_scores_array', sklearn.metrics.f1_score),
    )
    for key, score_function in per_label_metrics:
        metrics[key] = [float(score_function(y_true[:, i], y_binary_score[:, i]))
                        for i in range(label_count)]

    # For every document: how far down the ranking one must go to cover all of its true labels.
    tops = [get_row_top_N(y_score[i, :], y_true[i, :]) for i in range(y_score.shape[0])]
    metrics['topN_list'] = np.array(tops).tolist()
    metrics['topN_avg'] = np.mean(tops)

    return metrics


class Evaluator(object):
    """
    Binary-classification summary computed from aligned label / score sequences.

    A value counts as positive when it is strictly greater than ``threshold``.
    After construction the instance exposes the contingency counts ``tp``,
    ``fp``, ``fn``, ``tn``, the number of labels ``count`` and the derived
    ``precision``, ``recall``, ``f1`` and ``error_rate``.
    """

    def __init__(self, labels, scores, threshold=0):
        """
        :param labels: sequence of ground-truth values.
        :param scores: sequence of predicted values, aligned with ``labels``.
        :param threshold: values strictly above it are treated as positive.
        """
        # The argument used to be ignored (the attribute was hard-coded to 0).
        self.threshold = threshold
        self.count = len(labels)

        self.calculate_contingency(labels, scores)

        self.precision = self.get_precision()
        # Used to be computed with get_precision(), which made recall == precision.
        self.recall = self.get_recall()
        self.f1 = self.get_f1()
        self.error_rate = self.get_error_rate()

    def calculate_contingency(self, label, contingency):
        """
        Recount ``tp`` / ``fp`` / ``fn`` / ``tn`` from scratch.

        :param label: sequence of ground-truth values.
        :param contingency: sequence of predicted values, aligned with ``label``.

        NOTE(review): the parameter names are kept from the original signature;
        the original body read undefined ``labels`` / ``scores`` names, so the
        two parameters are interpreted as exactly those sequences - confirm.
        """
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0

        for (l, s) in zip(label, contingency):
            actual = self.is_true(l)
            predicted = self.is_true(s)
            if actual and predicted:
                self.tp += 1
            elif actual:
                self.fn += 1
            elif predicted:
                self.fp += 1
            else:
                self.tn += 1

    def is_true(self, label):
        """Whether ``label`` is strictly above the threshold."""
        return label > self.threshold

    def get_error_rate(self):
        """Share of misclassified items, (fp + fn) / count; 0.0 for no items."""
        if self.count == 0:
            return 0.0
        # The original returned (tp + tn) / count, i.e. the accuracy.
        return float(self.fp + self.fn) / self.count

    def get_precision(self):
        """tp / (tp + fp); 0.0 when there are no true positives."""
        if self.tp == 0: return 0.0
        return float(self.tp) / (self.tp + self.fp)

    def get_recall(self):
        """tp / (tp + fn); 0.0 when there are no true positives."""
        if self.tp == 0: return 0.0
        return float(self.tp) / (self.tp + self.fn)

    def get_f1(self):
        """Harmonic mean of precision and recall; 0.0 when both are zero."""
        precision = self.get_precision()
        recall = self.get_recall()
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

### SVM and Doc2vec parameters

In [4]:
# Hyper-parameters handed to SVMWithSGD.train(...) by the training functions in this notebook.
SVM_ITERATIONS = 1000  # passed as `iterations`; also part of the model/prediction output paths
SVM_CONVERGENCE = 0.001  # passed as `convergenceTol`
SVM_REG = 0.1  # passed as `regParam`; also part of the model/prediction output paths

In [5]:
# Doc2Vec settings. In the visible cells they are only used to rebuild the file name of the
# pre-trained model that is loaded (see `model_name`); no Doc2Vec model is trained here.
DOC2VEC_SIZE = 400
DOC2VEC_WINDOW = 8
DOC2VEC_MAX_VOCAB_SIZE = None
DOC2VEC_SAMPLE = 1e-5
DOC2VEC_TYPE = 1  # 1 is rendered as 'dm' in the model name, any other value as 'pv-dbow'
DOC2VEC_HIERARCHICAL_SAMPLE = 0
DOC2VEC_NEGATIVE_SAMPLE_SIZE = 10  # NOTE(review): this value fills the `iter_{}` slot of `model_name`
DOC2VEC_CONCAT = 0
DOC2VEC_MEAN = 0
DOC2VEC_EPOCHS = 1  # not referenced by any visible cell
REPORT_DELAY = 30 # report the progress every x seconds

Set up logging for gensim (word2vec / Doc2Vec)

In [6]:
# logging.basicConfig only configures the root logger when it has no handlers yet,
# so handlers that are already attached are removed first.
root = logging.getLogger()
for handler in root.handlers[:]:  # iterate over a copy because the list is modified in the loop
    root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # adds a default StreamHandler
#root.addHandler(logging.StreamHandler())

### Input/Output directories

In [7]:
#sc = SparkContext("", "Generate Inverted Index Job")

# HDFS directory from which the pickled classification objects and document lists are read.
classification_objects_save_location = "hdfs://deka.cip.ifi.lmu.de/svm/new/"
# HDFS root under which the model, prediction, label and metrics paths of this notebook are built.
original_parent_save_location = "hdfs://deka.cip.ifi.lmu.de/pg-vectors/"
save_parent_location = original_parent_save_location
sample_save_parent_location = save_parent_location + "sample/"
# When working on the sample, every path derived from save_parent_location moves to "sample/".
if IS_SAMPLE: 
    save_parent_location = save_parent_location + "sample/"

# Local path template of the Doc2Vec files; `{}` is replaced with a model (or data file) name.
doc2vec_model_location = '/big/s/shalaby/paragraph_vector_models/{}'
# file_name = "sample.json"
# test_file_name = "sample.json"
# #url = "/media/Work/workspace/thesis/benchmark/output/" + file_name
# sample_location = save_parent_location + file_name
# sample_test_location = save_parent_location + test_file_name

# accepted_terms_with_scores_list_output = original_parent_save_location + "accepted_terms_with_scores_list_{}.pkl"
# postings_list_chi_selected_output = original_parent_save_location + "postings_list_{}.json"
# term_df_map_output = original_parent_save_location + "term_df_map_output_{}.json"
# doc_index_chi_selected_output = original_parent_save_location + "doc_index_for_postings_{}.json"
# term_dictionary_output = original_parent_save_location + "term_dictionary_{}.pkl"


# postings_list_training_chi_selected_output = save_parent_location + "training_postings_list_{}.json"
# postings_list_validation_chi_selected_output = save_parent_location + "validation_postings_list_{}.json"
# postings_list_test_chi_selected_output = save_parent_location + "test_postings_list_{}.json"

# Classification objects, unrelated to sample size
# (all of these live under classification_objects_save_location, not save_parent_location)
classification_index_output = classification_objects_save_location + "classification_index.pkl"
doc_classification_map_output = classification_objects_save_location + "doc_classification_map.pkl"
sections_output = classification_objects_save_location + "sections.pkl"
classes_output = classification_objects_save_location + "classes.pkl"
subclasses_output = classification_objects_save_location + "subclasses.pkl"
classifications_output = classification_objects_save_location + "classifications.pkl"
doc_lengths_map_output = classification_objects_save_location + "doc_lengths_map.pkl"
# training, validation and test set lists
training_docs_list_output = classification_objects_save_location + "training_docs_list.pkl"
validation_docs_list_output = classification_objects_save_location + "validation_docs_list.pkl"
test_docs_list_output = classification_objects_save_location + "test_docs_list.pkl"
# NOTE(review): identical to training_docs_list_output - confirm whether a sample-specific file was intended.
sample_training_docs_list_output = classification_objects_save_location + "training_docs_list.pkl"


# training_predictions_sections_output = save_parent_location + "training_predictions_sections_list.pkl"
# training_labels_sections_list_output = save_parent_location + "training_labels_sections_list.pkl"
# valdiation_predictions_sections_output = save_parent_location + "validation_predictions_sections_list.pkl"
# validation_labels_sections_list_output = save_parent_location + "validation_labels_sections_list.pkl"


# test_postings_list_output = save_parent_location + "test_postings_list_50000.json"
# training_errors_output = save_parent_location + "training_errors.json"
# Directory for everything produced with the current SVM_ITERATIONS / SVM_REG combination.
# Not referenced by the visible cells; the get_*_name helpers rebuild the same prefix themselves.
model_output = save_parent_location + "models/" + "iter_" + str(SVM_ITERATIONS) + "_reg_" + str(SVM_REG) + "/"

In [8]:
def _run_directory(reg, iterations):
    """Directory holding the outputs of one (iterations, reg) SVM configuration."""
    return save_parent_location + "models/" + "iter_" + str(iterations) + "_reg_" + str(reg) + "/"


def _data_directory(data_type):
    """Directory holding the saved data of one data type (e.g. "training")."""
    return save_parent_location + "models/" + data_type + "_data/"


def get_model_name(method, classification, reg=SVM_REG, iterations=SVM_ITERATIONS):
    """Path of the saved SVM model for one representation method and classification."""
    return _run_directory(reg, iterations) + method + "_" + classification + "_model.svm"


def get_data_output_name(method, data_type="training"):
    """Path of the saved data of one representation method."""
    return _data_directory(data_type) + method + "_data.json"


def get_data_classification_output_name(method, classification, data_type="training"):
    """Path of the saved data of one representation method and classification."""
    return _data_directory(data_type) + method + "_" + classification + "_data.json"


def get_prediction_output_name(method, data_type="training", subset="sections", reg=SVM_REG, iterations=SVM_ITERATIONS):
    """Path of the saved predictions of one method on one data type / label subset."""
    return _run_directory(reg, iterations) + method + "_" + data_type + "_" + subset + "_predictions.svm"


def get_labels_output_name(data_type="training", subset="sections", reg=SVM_REG, iterations=SVM_ITERATIONS):
    """Path of the saved ground-truth labels of one data type / label subset."""
    return _run_directory(reg, iterations) + data_type + "_" + subset + "_labels.svm"


def get_metrics_output_name(method, data_type="training", subset="sections", reg=SVM_REG, iterations=SVM_ITERATIONS):
    """Path of the saved metrics of one method on one data type / label subset."""
    return _run_directory(reg, iterations) + method + "_" + data_type + "_" + subset + "_metrics.pkl"


def get_save_location(location, sample=False):
    """Return ``location``, re-rooted under the sample directory when ``sample`` is true."""
    if not sample:
        return location
    return location.replace(save_parent_location, sample_save_parent_location)

#### Load Classification Objects

In [9]:
%%time
# Pull the pickled classification objects from HDFS into driver memory.
# `sc` is not created in the visible notebook (the SparkContext line above is commented out);
# presumably it is provided by the PySpark kernel - confirm.
# doc_id -> classifications of that document (indexed by doc_id in the training code below)
doc_classification_map = dict(sc.pickleFile(doc_classification_map_output).collect())
doc_count = len(doc_classification_map)
classifications_index = dict(sc.pickleFile(classification_index_output).collect())
sections = sc.pickleFile(sections_output).collect()
classes = sc.pickleFile(classes_output).collect()
subclasses = sc.pickleFile(subclasses_output).collect()
classifications = sc.pickleFile(classifications_output).collect()

CPU times: user 14.2 s, sys: 2.1 s, total: 16.3 s
Wall time: 32.3 s


#### Load the training, validation and test document lists

In [10]:
%%time
# Document lists of the three splits, collected into plain Python lists on the driver.
# The elements are used as doc ids (keys of doc_classification_map / doc2vec_model.docvecs) below.
training_documents = sc.pickleFile(training_docs_list_output).collect()
validation_documents = sc.pickleFile(validation_docs_list_output).collect()
test_documents = sc.pickleFile(test_docs_list_output).collect()

CPU times: user 3.6 s, sys: 788 ms, total: 4.38 s
Wall time: 7.93 s


In [11]:
def get_training_vector(classification, term_list, classifications, number_of_terms):
    """
    Build the LabeledPoint of one document for a one-vs-rest classifier.

    The label is 1 when ``classification`` is among the document's
    ``classifications`` and 0 otherwise; the features are a SparseVector of
    size ``number_of_terms`` built from ``term_list``.

    Note: a later cell defines a three-argument function with the same name,
    which replaces this one once that cell has run.
    """
    label = int(classification in classifications)
    features = SparseVector(number_of_terms, term_list)
    return LabeledPoint(label, features)

def train_level_new(docs_index, classification, doc_classification_map, number_of_terms):
    """
    Train a binary SVM for ``classification`` on sparse document vectors.

    :param docs_index: RDD of ``(doc_id, postings)`` pairs.
    :param classification: the classification treated as the positive class.
    :param doc_classification_map: doc_id -> classifications of the document.
    :param number_of_terms: size of the sparse feature vectors.
    :return: ``(training_vectors, svm)`` - the RDD of LabeledPoints and the
        model trained on it.
    """
    def to_labeled_point(doc_entry):
        doc_id, postings = doc_entry
        return get_training_vector(classification, postings,
                                   doc_classification_map[doc_id], number_of_terms)

    training_vectors = docs_index.map(to_labeled_point)
    svm = SVMWithSGD.train(training_vectors,
                           iterations=SVM_ITERATIONS,
                           convergenceTol=SVM_CONVERGENCE,
                           regParam=SVM_REG)
    return training_vectors, svm

### Load Doc2vec model

In [12]:
%%time
# Rebuild the file name of the pre-trained Doc2Vec model from the DOC2VEC_* settings.
# NOTE(review): the final `iter_{}` placeholder is filled with DOC2VEC_NEGATIVE_SAMPLE_SIZE, not
# DOC2VEC_EPOCHS, which is why the loaded file ends in `iter_10`. Confirm how the model file was
# named when it was trained before changing this, otherwise the load below stops finding it.
model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_hs_{}_iter_{}'.format(DOC2VEC_SIZE, DOC2VEC_WINDOW, 
                                                                'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
                                                                DOC2VEC_CONCAT, DOC2VEC_MEAN,
                                                                DOC2VEC_HIERARCHICAL_SAMPLE,DOC2VEC_NEGATIVE_SAMPLE_SIZE)
# mmap='r': the log output of this cell shows the model arrays being loaded "with mmap=r".
doc2vec_model = Doc2Vec.load(doc2vec_model_location.format(model_name), mmap='r')

2016-08-24 02:00:22,999 : INFO : loading Doc2Vec object from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10
2016-08-24 02:01:25,412 : INFO : loading docvecs recursively from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.docvecs.* with mmap=r
2016-08-24 02:01:25,413 : INFO : loading doctag_syn0 from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.docvecs.doctag_syn0.npy with mmap=r
2016-08-24 02:01:25,416 : INFO : loading syn1neg from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.syn1neg.npy with mmap=r
2016-08-24 02:01:25,418 : INFO : loading syn0 from /big/s/shalaby/paragraph_vector_models/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10.syn0.npy with mmap=r
2016-08-24 02:01:25,419 : INFO : setting ignored attribute syn0norm to None
2016-08-24 02:01:25,420 : INFO : setting ignor

CPU times: user 1min 12s, sys: 6.67 s, total: 1min 19s
Wall time: 1min 22s


In [13]:
doc2vec_model

<gensim.models.doc2vec.Doc2Vec at 0x7f25cd2eb110>

## Actual Training

In [14]:
def model_exists(path):
    """
    Tell whether an SVM model can be loaded from ``path``.

    The check is done by actually loading the model; any load failure is
    reported as "does not exist".

    :return: True when ``SVMModel.load`` succeeds, False otherwise.
    """
    try:
        SVMModel.load(sc, path)
    except Exception:
        # `Exception` instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not mistaken for a missing model.
        return False
    return True
    
def get_training_vector(classification, dense_vector, classifications, number_of_terms=None):
    """
    Build the LabeledPoint of one document for a one-vs-rest classifier.

    :param classification: the classification treated as the positive class.
    :param dense_vector: the document's features. With ``number_of_terms``
        left as None it is used as-is; otherwise it is taken as the entries
        of a SparseVector of that size.
    :param classifications: the document's classifications; the label is 1
        when ``classification`` is among them and 0 otherwise.
    :param number_of_terms: optional sparse vector size. This definition
        replaces the earlier four-argument function of the same name, which
        other cells still call with this fourth argument; accepting it here
        keeps both call styles working.
    :return: LabeledPoint with the 0/1 label and the feature vector.
    """
    clss = 1 if classification in classifications else 0
    if number_of_terms is not None:
        return LabeledPoint(clss, SparseVector(number_of_terms, dense_vector))
    return LabeledPoint(clss, dense_vector)

def train_level_doc2vec(classification, doc_classification_map):
    """
    Train a binary SVM for ``classification`` on the Doc2Vec vectors of the
    training documents.

    All LabeledPoints are built on the driver (one per entry of the global
    ``training_documents``) and then parallelized.

    :param classification: the classification treated as the positive class.
    :param doc_classification_map: doc_id -> classifications of the document.
    :return: ``(training_vectors, svm)`` - the RDD of LabeledPoints and the
        model trained on it.
    """
    # Each vector is copied into a plain list because Spark cannot turn the
    # memory-mapped array into a Spark Vector.
    labeled_points = [
        get_training_vector(classification,
                            list(doc2vec_model.docvecs[doc_id][:]),
                            doc_classification_map[doc_id])
        for doc_id in training_documents
    ]
    print("Finished getting training vectors")
    training_vectors = sc.parallelize(labeled_points)
    print("Finished parallelization")
    svm = SVMWithSGD.train(training_vectors,
                           iterations=SVM_ITERATIONS,
                           convergenceTol=SVM_CONVERGENCE,
                           regParam=SVM_REG)
    return training_vectors, svm

In [15]:
dd = []
dd[:] = doc2vec_model.docvecs['08887671'][:]

In [None]:
%%time
# Train and save one binary SVM per section, skipping models that can already be loaded.
# (To train per class instead, iterate over `classes`.)
skipped_sections = ("A", "B", "C", "D", "E")
i = 0
for section in sections:
    classification = section
    print(classification)
    if classification in skipped_sections:
        continue
    i += 1
    try:
        print("Trying: " + model_name)
        model_path = get_model_name(model_name, classification)
        if model_exists(model_path):
            print("Model Exists")
        else:
            training_vectors, svm = train_level_doc2vec(classification, doc_classification_map)
            svm.save(sc, model_path)
    except:
        # report which model failed, then let the error propagate
        print("Problem creating: %s: %s" % (classification, model_name))
        raise

A
B
C
D
E
F
Trying: doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10
Finished getting training vectors
Finished parallelization
G
Trying: doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10
Finished getting training vectors
Finished parallelization
H
Trying: doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10
Finished getting training vectors
Finished parallelization
CPU times: user 15min 29s, sys: 27.8 s, total: 15min 57s
Wall time: 1d 5h 31min 51s


In [19]:
model_exists(get_model_name(model_name, "A"))

False

In [18]:
get_model_name(model_name, "A")

'hdfs://deka.cip.ifi.lmu.de/pg-vectors/models/iter_1000_reg_0.1/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10_A_model.svm'

In [43]:
# Experiment on a single class with term-based representations.
# NOTE(review): the *_doc_index_* names used below are not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel.
training_evaluations = {}
validation_evaluations = {}

classification = "A-01"

training_evaluations[classification] = {}
validation_evaluations[classification] = {}
# Each entry: (representation name, training doc index, validation doc index)
representations_to_test = [
#     ("tf", tf_doc_index_training, tf_doc_index_validation),
#     ("tf-sublinear", sublinear_tf_doc_index_training, sublinear_tf_doc_index_validation), 
#     ("bm25", bm25_doc_index_training, bm25_doc_index_validation),
#     ("tf-idf", tf_idf_doc_index_training, tf_idf_doc_index_validation)
    ("sublinear-tf-idf", sublinear_tf_idf_doc_index_training, sublinear_tf_idf_doc_index_validation), 
]
name, doc_index, val_doc_index = representations_to_test[0]

In [44]:
#doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_data_output_name(name))

In [45]:
# Train and save the SVM for `classification` on the selected representation.
# NOTE(review): `train_level` and `number_of_terms` are not defined in the visible notebook
# (only `train_level_new` is), so this cell depends on hidden kernel state.
print "Trying: " + name
# Attach each document's classifications to its terms: (doc_id, (terms, classifications))
docs_with_classes = doc_index.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
training_vectors, svm = train_level(docs_with_classes, classification, number_of_terms)
svm.save(sc, get_model_name(name, classification))

Trying: tf


In [None]:
# Same training step written inline, with validateData=False.
# NOTE(review): get_training_vector is called with four arguments here, but the definition in the
# "Actual Training" cell takes three; which one is active depends on cell execution order.
training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(classification, term_list,
                                                                           classifications, number_of_terms))
svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG, validateData=False)

In [23]:
# Evaluate the model on its own training data: labels and predictions are collected
# separately from the same RDD and compared by Evaluator.
labels = training_vectors.map(lambda p: p.label).collect()
predictions = training_vectors.map(lambda p: svm.predict(p.features)).collect()
training_evaluations[classification][name] = Evaluator(labels, predictions)

In [None]:
# validation
# Same evaluation as for the training data, on the validation doc index.
# NOTE(review): get_labeled_points_from_doc_index and number_of_terms are not defined in the
# visible notebook.
print "Validating"
validation_vectors = get_labeled_points_from_doc_index(val_doc_index, doc_classification_map, number_of_terms)
labels_val = validation_vectors.map(lambda p: p.label).collect()
predictions_val = validation_vectors.map(lambda p: svm.predict(p.features)).collect()
validation_evaluations[classification][name] = Evaluator(labels_val, predictions_val)


### Validation

In [20]:
import cPickle as pickle
# Load the validation vectors saved for this Doc2Vec model; rows are looked up by the
# position of the document in `validation_documents`.
# The file is opened in binary mode and closed as soon as it has been read.
# NOTE: unpickling executes arbitrary code - only load files from a trusted location.
with open(doc2vec_model_location.format("validation_data/" + model_name + "_validation_vectors"), 'rb') as validation_vectors_file:
    validation_vectors_matrix = pickle.load(validation_vectors_file)

In [39]:
def get_validation_doc2vec_vectors(classification, doc_classification_map):
    """
    Build the RDD of LabeledPoints of the validation documents for one
    classification.

    The features of the document at position ``i`` of the global
    ``validation_documents`` are row ``i`` of the global
    ``validation_vectors_matrix``.

    :param classification: the classification treated as the positive class.
    :param doc_classification_map: doc_id -> classifications of the document.
    :return: RDD with one LabeledPoint per validation document.
    """
    labeled_points = [
        get_training_vector(classification,
                            validation_vectors_matrix[position],
                            doc_classification_map[doc_id])
        for position, doc_id in enumerate(validation_documents)
    ]
    print("Finished getting training vectors")
    validation_rdd = sc.parallelize(labeled_points)
    print("Finished parallelization")
    return validation_rdd

In [34]:
classification = "A"
binarySvm = SVMModel.load(sc, get_model_name(model_name, classification))

In [37]:
binarySvm.predict(validation_vectors.take(1)[0].features)

0

In [50]:
%%time
# Score every validation document with each section's saved SVM and evaluate the result.
method = model_name
subset = "sections"

doc_count = len(validation_documents)
# One row per validation document, one column per section.
y_score = np.zeros((doc_count, len(sections)))
y_true = np.zeros((doc_count, len(sections)))
i=0

for section in sections:
    print section
    classification = section
# for clss in classes:
#     print clss
#     classification = clss
    
    validation_vectors = get_validation_doc2vec_vectors(classification, doc_classification_map)

    binarySvm = SVMModel.load(sc, get_model_name(model_name, classification))
    print "Loaded the model"
    # presumably makes predict() return raw scores rather than 0/1 labels - the collected
    # predictions are stored in y_score and thresholded separately below
    binarySvm.clearThreshold()
    %time labels_predictions = validation_vectors.map(lambda p: (p.label, binarySvm.predict(p.features))).collect()
    #labels = test_labeled_points.map(lambda p: p.labels)
    # column i holds the labels / scores of the i-th section
    y_true[:,i] = [label_pred[0] for label_pred in labels_predictions]
    y_score[:,i] = [label_pred[1] for label_pred in labels_predictions]
    i+=1
y_binary_score = get_binary(y_score)
# results[method]["y_true"] = y_true
# results[method]["y_score"] = y_score
# results[method]["y_binary_score"] = y_binary_score
metrics = get_metrics(y_true, y_binary_score)

sc.parallelize(y_true).repartition(1).saveAsPickleFile(get_labels_output_name(data_type="validation", subset=subset))
sc.parallelize(y_score).repartition(1).saveAsPickleFile(get_prediction_output_name(method=method, data_type="validation", subset=subset))
# NOTE(review): parallelize receives a 2-tuple, so the saved RDD has two string elements
# ("metrics" and the JSON document); the "Load Metrics" cell reads element [1]. The file is
# written as text although the path returned by get_metrics_output_name ends in ".pkl".
sc.parallelize(("metrics", json.dumps(metrics))).saveAsTextFile(get_metrics_output_name(method=method, data_type="validation", subset=subset))

CPU times: user 20.9 s, sys: 80 ms, total: 21 s
Wall time: 22.4 s


In [None]:
import cPickle as pickle
# Keep a local copy of the label and score matrices.
# The files must be opened for binary writing (the default mode is read-only, so dump()
# could not write), and the `with` blocks make sure they are flushed and closed.
with open('/home/s/shalaby/y_true.pkl', 'wb') as y_true_file:
    pickle.dump(y_true, y_true_file)
with open('/home/s/shalaby/y_score.pkl', 'wb') as y_score_file:
    pickle.dump(y_score, y_score_file)

In [1]:
method

NameError: name 'method' is not defined

In [78]:
method = 'bm25'
y_true = results[method]['y_true']
y_score = results[method]['y_score']
y_binary_score = results[method]['y_binary_score']

In [None]:
SVMModel.load(sc, "/svm/new/lskdjflsdf")

In [42]:
sc.parallelize(y_true).repartition(1).saveAsPickleFile(get_labels_output_name(data_type="validation", subset="sections"))

In [43]:
sc.parallelize(y_score).repartition(1).saveAsPickleFile(get_prediction_output_name(method=model_name, data_type="validation", subset="sections"))

#### Load Labels and predictions

In [44]:
method = model_name

In [45]:
get_prediction_output_name(method=method, data_type="validation", subset="sections")

'hdfs://deka.cip.ifi.lmu.de/pg-vectors/models/iter_1000_reg_0.1/doc2vec_size_400_w_8_type_dm_concat_0_mean_0_hs_0_iter_10_validation_sections_predictions.svm'

In [12]:
# Reload the saved validation labels and scores (one collected element per matrix row)
# and re-derive the 0/1 predictions from the scores.
y_true = np.array(sc.pickleFile(get_labels_output_name(data_type="validation", subset="sections")).collect())
y_score = np.array(sc.pickleFile(get_prediction_output_name(method=method, data_type="validation", subset="sections")).collect())
y_binary_score = get_binary(y_score)

In [25]:
y_true

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [46]:
y_score

array([[-3.68200138, -3.69682235, -3.91881962, ..., -4.10725345,
        -1.56971791, -2.29016149],
       [-0.61050217, -0.63617954, -0.75953608, ..., -0.75635142,
        -0.3489743 , -0.45303563],
       [-3.0328405 , -3.06993152, -3.22358702, ..., -3.40670266,
        -1.14895461, -2.03294231],
       ..., 
       [-0.77785263, -0.76533569, -0.94888619, ..., -0.92466885,
        -0.52546605, -0.55825734],
       [-1.37003043, -1.39745377, -1.53473263, ..., -1.51578402,
        -0.53428097, -0.8681844 ],
       [-0.51321361, -0.45298909, -0.66777607, ..., -0.60296964,
        -0.37040412, -0.42987291]])

In [85]:
y_score[:,1].shape

(321473,)

In [81]:
y_score.shape

(321473, 8)

In [51]:
y_binary_score

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
np.sum(y_binary_score[:,3])

1237

In [1]:
1+1

2

In [2]:
method = 'sublinear-tf-idf'
y_true = results[method]['y_true']
y_score = results[method]['y_score']
y_binary_score = results[method]['y_binary_score']

NameError: name 'results' is not defined

In [52]:
metrics = get_metrics(y_true, y_binary_score)
metrics

{'average_num_of_labels': 0.0,
 'average_precision_macro': nan,
 'average_precision_micro': nan,
 'coverage_error': 0.0,
 'f1_macro': 0.0,
 'f1_micro': 0.0,
 'f1_scores_array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'precision_macro': 0.0,
 'precision_micro': 0.0,
 'precision_scores_array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'recall_macro': 0.0,
 'recall_micro': 0.0,
 'recall_scores_array': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 'topN_avg': 0.0,
 'topN_list': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [14]:
sc.parallelize(("metrics", json.dumps(metrics))).saveAsTextFile(get_metrics_output_name(method=method, data_type="validation", subset="sections"))

In [102]:
get_metrics_output_name(method=method, data_type="validation", subset="sections")

'hdfs://deka.cip.ifi.lmu.de/svm/new/models/iter_1000_reg_0.01/bm25_validation_sections_metrics.pkl'

In [100]:
SVM_REG

0.01

#### Load Metrics

In [112]:
method = "tf"

In [113]:
# The metrics were saved as a two-element text RDD ("metrics", <json>), so the JSON document
# is read from element [1].
# NOTE(review): this relies on the collected elements coming back in that order.
loaded_metrics = json.loads(sc.textFile(get_metrics_output_name(method=method, data_type="validation", subset="sections")).collect()[1])

In [114]:
loaded_metrics

{u'average_num_of_labels': 1.1485630208446744,
 u'average_precision_macro': 0.6181398347172771,
 u'average_precision_micro': 0.6740106535881538,
 u'coverage_error': 3.347438198542335,
 u'f1_macro': 0.4453958787293348,
 u'f1_micro': 0.6465376531616135,
 u'f1_scores_array': [0.6598160048844598,
  0.5282436574152903,
  0.6680647596678131,
  0.0,
  0.0,
  0.3551263915369949,
  0.760959623844551,
  0.5909565924855693],
 u'precision_macro': 0.5022584534761925,
 u'precision_micro': 0.6678927795999445,
 u'precision_scores_array': [0.643975356024644,
  0.5218899823366137,
  0.750075346594334,
  0.0,
  0.0,
  0.5509016315237095,
  0.6441524294269122,
  0.9070728819033265],
 u'recall_macro': 0.43039851669590423,
 u'recall_micro': 0.6265058283139057,
 u'recall_scores_array': [0.6764556102529282,
  0.5347539429457899,
  0.6022201385318049,
  0.0,
  0.0,
  0.2620138857376231,
  0.9295125050037419,
  0.4382320510953461],
 u'topN_avg': 2.0154507532514394,
 u'topN_list': [5,
  1,
  1,
  1,
  2,
  1,
  

In [50]:
metrics_loaded = sc.pickleFile(get_metrics_output_name(method=method, data_type="validation", subset="sections")).collectAsMap()

ValueError: dictionary update sequence element #0 has length 23; 2 is required

In [49]:
metrics_loaded

['average_precision_micro',
 'f1_macro',
 'recall_macro',
 'precision_micro',
 'recall_micro',
 'average_precision_macro',
 'f1_micro',
 'precision_macro',
 'f1_scores_array',
 'coverage_error',
 'average_num_of_labels',
 'precision_scores_array',
 'recall_scores_array']

In [91]:
# Per-section precision, stored in the metrics dict the same way get_metrics() stores it.
precision_scores = np.zeros((len(sections)))
for i in range(0,len(sections)):
    precision_scores[i] = sklearn.metrics.precision_score(y_true[:,i], y_binary_score[:,i])
# The last line of this cell was two statements fused together (a syntax error);
# it is split into the assignment and the display of the array shown in the output.
metrics['precision_scores_array'] = precision_scores.tolist()
precision_scores

array([ 0.64397536,  0.52188998,  0.75007535,  0.        ,  0.        ,
        0.55090163,  0.64415243,  0.90707288])

In [101]:
section_index = 5
print np.sum(y_binary_score[:, section_index])
print np.sum(y_true[:, section_index])

-1434134418.46
22037.0


In [92]:
e = Evaluator(y_true[:,1], y_binary_score[:,1])

2.0154507532514394

In [36]:
docu_index = 0

get_row_top_N(y_score[docu_index,:], y_true[docu_index,:])


[ -160.73221359   -63.53035615 -1389.43386875 -2086.39085259 -1708.9396494
  -257.07255332    -4.29960961   -15.17807309]
[ 0.  0.  0.  0.  0.  1.  0.  0.]
[6 7 1 0 5 2 4 3]


5

In [94]:
e.f1

0.5282436574152903

In [67]:
precision_scores

array([ 0.64397536,  0.52188998,  0.75007535,  0.        ,  0.        ,
        0.55090163,  0.64415243,  0.90707288])

In [64]:
precision_scores = np.zeros((len(sections),))

In [65]:
precision_scores

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [57]:
for i in xrange(0,len(sections)):
    print "lskdjfls"

lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls


In [39]:
import sklearn.metrics

0.67401065358815382

In [42]:
sklearn.metrics.f1_score(y_true, y_binary_score, average='micro')

0.64653765316161349

In [28]:
def get_coverage_error(test_labeled_points, classifications, method):
    """
    Coverage error of the saved one-vs-rest SVMs of ``method`` on a test set.

    :param test_labeled_points: RDD of labeled points (``label`` / ``features``).
    :param classifications: the classifications whose models are loaded; each
        one fills one column of the score and label matrices.
    :param method: representation name used to locate the saved models.
    :return: ``sklearn.metrics.coverage_error`` of the collected matrices.

    NOTE(review): the label of a point is copied unchanged into every column,
    so the result is only meaningful if that is what the caller intends; the
    validation cell instead rebuilds the labeled points per classification.
    """
    #test_labeled_points.cache()
    doc_count = test_labeled_points.count()
    # np.zeros takes the shape as ONE tuple argument (a second positional argument is the dtype).
    y_score = np.zeros((doc_count, len(classifications)))
    y_true = np.zeros((doc_count, len(classifications)))

    for i, classification in enumerate(classifications):
        binarySvm = SVMModel.load(sc, get_model_name(method, classification))
        binarySvm.clearThreshold()
        # Collect labels and predictions in one pass so that they stay aligned, and bring them
        # to the driver: an RDD cannot be assigned into a numpy array.
        # `p.label` (not `p.labels`) matches every other use of labeled points in this notebook.
        labels_predictions = test_labeled_points.map(
            lambda p: (p.label, binarySvm.predict(p.features))).collect()
        # Column i (not row i) belongs to the i-th classification.
        y_true[:, i] = [label_pred[0] for label_pred in labels_predictions]
        y_score[:, i] = [label_pred[1] for label_pred in labels_predictions]
    # sklearn's argument order is (y_true, y_score).
    return coverage_error(y_true, y_score)

## Testing

In [None]:
%%time
# Build the doc indexes used for testing, one per representation.
# NOTE(review): create_doc_index, the *_postings inputs and term_dictionary are not defined in
# the visible notebook.
# NOTE(review): every filter keeps documents that are in `validation_documents`, although the
# results are named *_test - confirm whether `test_documents` was intended.
# NOTE(review): tf_id_doc_index_test is built from tf_postings, the same input as
# tf_doc_index_test - confirm whether a tf-idf postings input was intended.
tf_doc_index_test = create_doc_index(tf_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)
sublinear_tf_doc_index_test = create_doc_index(sublinear_tf_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)
tf_id_doc_index_test = create_doc_index(tf_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)
bm25_doc_index_test = create_doc_index(bm25_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)

In [None]:
# Coverage error of the saved "bm25" section models on the test doc index.
# NOTE(review): get_labeled_points_from_doc_index and number_of_terms are not defined in the
# visible notebook.
method = "bm25"
test_vectors = get_labeled_points_from_doc_index(bm25_doc_index_test, doc_classification_map, number_of_terms)
get_coverage_error(test_vectors, sections, method)