In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import time
import pandas as pd
import pyspark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error
import sklearn.metrics

In [2]:
IS_SAMPLE = False
TRAINING_SAMPLE_PERCENTAGE = 0.001
MIN_TRAINING_SAMPLES = 20

In [3]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_SIZE = 3
MIN_DOCUMENTS = 5
TOP_N_FEATURES = 10000

TEST_SET_PERCENTAGE = 0.2
VALIDATION_IN_TRAINING_PERCENTAGE = 0.2
MIN_DOCUMENTS_FOR_TEST = 1
MIN_DOCUMENTS_FOR_VALIDATION = 1

MIN_DOCUMENTS_FOR_TRAINING_SAMPLE = 10
MIN_DOCUMENTS_FOR_TEST_SAMPLE = 1
MIN_DOCUMENTS_FOR_VALIDATION_SAMPLE = 1

SVM_ITERATIONS = 1000
SVM_CONVERGENCE = 0.001
SVM_REG = 0.1

BM25_K = 1.5  # controls power of tf component
BM25_b = 0.75  # controls the BM25 length normalization

RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

stemmer = nltk.stem.porter.PorterStemmer().stem

### Text Manipulation functions

In [4]:
def stemtokenizer(text, doc_id):
    """ MAIN FUNCTION to get clean stems out of a text. A list of clean stems are returned """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    tokens = tokenizer.tokenize(text)
    stems = []  # result
    previous_unigram = None
    for token in tokens:
        stem = token.lower()
        stem = stem.strip(string.punctuation)
        if stem:
            if is_number(stem):
                stem = NUMBER_INDICATOR
            elif is_currency(stem):
                stem = CURRENCY_INDICATOR
            elif is_chemical(stem):
                stem = CHEMICAL_INDICATOR
            elif is_stopword(stem):
                stem = None
            else:
                stem = stemmer(token)
                stem = stem.strip(string.punctuation)
            if stem and len(stem) >= MIN_SIZE:
                # extract uni-grams
                stems.append((stem,{doc_id: 1}))
                # extract bi-grams
                if previous_unigram: stems.append((previous_unigram + " " + stem,{doc_id: 1}))
                previous_unigram = stem
    del tokens
    return stems

def is_stopword(word):
  return word in STOP_WORDS

def is_number(str):
    """ Returns true if given string is a number (float or int)"""
    try:
        float(str.replace(",", ""))
        return True
    except ValueError:
        return False

def is_currency(str):
    return str[0] == "$"

def is_chemical(str):
    return str.count("-") > 3

### Training functions

In [5]:
def merge_postings(postings_list1, postings_list2):
    # key could be either a doc id or a term
    for key in postings_list2:
        if postings_list1.get(key):
            postings_list1[key] += postings_list2[key]
        else:
            postings_list1[key] = postings_list2[key]
    return postings_list1

def get_term_dictionary(terms):
    """
    Maps string terms to indexes in an array
    """
    term_dictionary = {}
    term_array = [None] * len(terms)
    def put(key):
        hashvalue = hashfunction(key, len(term_array))
        if term_array[hashvalue] == None:
            term_array[hashvalue] = key
            return hashvalue
        else:
            nextslot = rehash(hashvalue, len(term_array))
            while term_array[nextslot] != None:
                nextslot = rehash(nextslot, len(term_array))
            if term_array[nextslot] == None:
                term_array[nextslot] = key
                return nextslot
    def hashfunction(key, size):
        return hash(key) % size
    def rehash(oldhash, size):
        return (oldhash + 1) % size
    i = 0
    for term in terms:
        corresponding_index = put(term)
        term_dictionary[term] = corresponding_index
        i+=1
        if i%10000 == 0: print "finished " + str(i)
    return term_dictionary

def jsonKV2str(x):
    """
    Change string keys to int
    """
    if isinstance(x, dict):
            #return {doc_id:{int(term_id):x[doc_id][term_id] for term_id in x[doc_id]} for doc_id in x }
        
            return {int(k):(int(v) if isinstance(v, unicode) else v) for k,v in x.items()}
    return x

def get_json(json_postings):
    return json.loads(json_postings)

def get_json_convert_num(json_postings):
    return json.loads(json_postings, object_hook=jsonKV2str)

def get_doc_index(term, postings_list, term_dictionary):
    #return [(doc_id, {term: postings_list[doc_id]}) for doc_id in postings_list]
    return [(doc_id, {term_dictionary[term]: postings_list[doc_id]}) for doc_id in postings_list]

def get_classes(ipc_classification):
    sections = []
    classes = []
    subclasses = []
    for classification in ipc_classification:
        # we do the check because some documents have repetitions
        section_name = classification['section']
        class_name = classification['section'] + "-" + classification['class']
        subclass_name = classification['section'] + "-" + classification['class'] + "-" + classification['subclass']
        if section_name not in sections:
            sections.append(section_name)
        if class_name not in classes:
            classes.append(class_name)
        if subclass_name not in subclasses:
            subclasses.append(subclass_name)
    return {"sections": sections, "classes": classes, "subclasses": subclasses}


def get_training_vector_old(classification, term_list, classifications, classification_key_name, number_of_terms):
    clss = 1 if classification in classifications[classification_key_name] else 0
    return LabeledPoint(clss, SparseVector(number_of_terms, term_list))

def get_training_vector(classification, term_list, classifications, number_of_terms):
    clss = 1 if classification in classifications else 0
    return LabeledPoint(clss, SparseVector(number_of_terms, term_list))


def calculate_sublinear_tf(tf):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return math.log10(1 + tf)


def calculate_tf_idf(tf, df, N):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return tf * math.log10((N+1) / (df + 1))


def calculate_sublinear_tf_idf(tf, df, N):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return calculate_sublinear_tf(tf) * math.log10((N+1) / (df + 1))


def calculate_bm25(tf, df, N, d_len, d_avg):
    idf = max(0, math.log10((N-df + 0.5)/(df+0.5))) # in rare cases where the df is over 50% of N, this could become -ve, so we guard against that
    tf_comp = float(((BM25_K + 1) * tf)) / ( BM25_K * ((1-BM25_b) + BM25_b*(float(d_len)/d_avg)) + tf)
    return tf_comp * idf


def calculate_rf(df_relevant, df_non_relevant):
    return math.log( (2 + (float(df_relevant)/max(1, df_non_relevant))), 2)


def calculate_tf_rf(tf, df_relevant, df_non_relevant):
    return tf * calculate_rf(df_relevant, df_non_relevant)


def compare_classifications(x,y):
    len_comp = cmp(len(x), len(y))
    if len_comp == 0:
        return cmp(x,y)
    return len_comp


def create_doc_index(term_index, term_dictionary):
    return term_index \
        .flatMap(lambda (term, postings_list): get_doc_index(term, postings_list, term_dictionary)) \
        .reduceByKey(lambda x, y: merge_postings(x, y))


def get_rf_stats(postings, classification):
    a_plus_c = set(postings.keys())
    a_plus_b = set(classifications_index[classification])
    # first intersection is to get (a), second difference is to get (c) (checkout tf-rf paper for reference)
    a = a_plus_c.intersection(a_plus_b)
    c = a_plus_c.difference(a_plus_b)
    size_a = len(a)
    size_c = len(c)
    return size_a, size_c


def get_rf_postings(classification):
    def get_rf_postings_internal(postings):
        size_a, size_c = get_rf_stats(postings, classification)
        return {docId: calculate_rf(size_a, size_c)
                for docId, tf in postings.items()}
    return get_rf_postings_internal


def get_tf_rf_postings(classification):
    def get_tf_rf_postings_internal(postings):
        size_a, size_c = get_rf_stats(postings, classification)
        return {docId: calculate_tf_rf(tf, size_a, size_c)
                for docId, tf in postings.items()}
    return get_tf_rf_postings_internal


def train_level_old(docs_with_classes, classification, classification_label):
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector_old(classification, term_list, classifications,
                                                                           classification_label, number_of_terms))
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE)
    return training_vectors, svm


def train_level(docs_with_classes, classification, number_of_terms):
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(classification, term_list,
                                                                           classifications, number_of_terms))
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG)
    return training_vectors, svm


def train_level_new(docs_index, classification, doc_classification_map, number_of_terms):
    training_vectors = docs_index.map(
        lambda (doc_id, postings): get_training_vector(classification, postings,
                                                        doc_classification_map[doc_id], number_of_terms))
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG)
    return training_vectors, svm


def get_error(svm, test_vectors):
    labelsAndPreds = test_vectors.map(lambda p: (p.label, svm.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test_vectors.count())
    return trainErr


def train_all(docs_with_classes):
    training_errors = {}
    for section in sections:
        training_vectors, svm = train_level(docs_with_classes, section, "sections")
        train_err = get_error(svm, training_vectors)
        training_errors[section] = train_err
    #
    with open(training_errors_output, 'w') as file:
        file.write(json.dumps(training_errors))
    #
    for clss in classes:
        training_vectors, svm = train_level(docs_with_classes, clss, "classes")
        train_err = get_error(svm, training_vectors)
        training_errors[clss] = train_err
    
    with open(training_errors_output, 'w') as file:
        file.write(json.dumps(training_errors))
    
    for subclass in subclasses:
        training_vectors, svm = train_level(docs_with_classes, subclass, "subclasses")
        train_err = get_error(svm, training_vectors)
        training_errors[subclass] = train_err
    return training_errors


def get_labeled_points_from_doc_index(doc_index, doc_classification_map, number_of_terms):
    docs_with_classes = doc_index.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(classification, term_list,
                                                                           classifications, number_of_terms))
    return training_vectors

get_binary = lambda x: 1 if x > 0 else 0
get_binary = np.vectorize(get_binary)

def get_row_top_N(y_score_row, y_true_row):
    desc_score_indices = np.argsort(y_score_row)[::-1]
    # print y_score_row
    # print y_true_row
    true_indices = np.where(y_true_row ==1)[0]
    # print desc_score_indices
    found = 0
    top_N = 0
    for i, score in enumerate(desc_score_indices):
        if score in true_indices:
            found += 1
            if found == len(true_indices):
                top_N = i + 1
    # print top_N
    return top_N


def get_metrics(y_true, y_binary_score):
    metrics = {}
    metrics['coverage_error'] = coverage_error(y_binary_score, y_true)
    metrics['average_num_of_labels'] = np.sum(np.sum(y_true, axis=1))/y_true.shape[0]
    metrics['average_precision_micro'] = sklearn.metrics.average_precision_score(y_true, y_binary_score, average='micro')
    metrics['average_precision_macro'] = sklearn.metrics.average_precision_score(y_true, y_binary_score, average='macro')
    metrics['precision_micro'] = sklearn.metrics.precision_score(y_true, y_binary_score, average='micro')
    metrics['precision_macro'] = sklearn.metrics.precision_score(y_true, y_binary_score, average='macro')
    metrics['recall_micro'] = sklearn.metrics.recall_score(y_true, y_binary_score, average='micro')
    metrics['recall_macro'] = sklearn.metrics.recall_score(y_true, y_binary_score, average='macro')
    metrics['f1_micro'] = sklearn.metrics.f1_score(y_true, y_binary_score, average='micro')
    metrics['f1_macro'] = sklearn.metrics.f1_score(y_true, y_binary_score, average='macro')

    precision_scores = np.zeros(y_true.shape[1])
    for i in range(0, y_true.shape[1]):
        precision_scores[i] = sklearn.metrics.precision_score(y_true[:,i], y_binary_score[:,i])
    metrics['precision_scores_array'] = precision_scores.tolist()

    recall_scores = np.zeros(y_true.shape[1])
    for i in range(0, y_true.shape[1]):
        recall_scores[i] = sklearn.metrics.recall_score(y_true[:,i], y_binary_score[:,i])
    metrics['recall_scores_array'] = recall_scores.tolist()

    f1_scores = np.zeros(y_true.shape[1])
    for i in range(0, y_true.shape[1]):
        f1_scores[i] = sklearn.metrics.f1_score(y_true[:,i], y_binary_score[:,i])
    metrics['f1_scores_array'] = f1_scores.tolist()

    tops = []
    for i in xrange(y_score.shape[0]):
        tops.append(get_row_top_N(y_score[i,:], y_true[i,:]))
    metrics['topN_list'] = np.array(tops).tolist()
    metrics['topN_avg'] = np.mean(tops)
    
    return metrics


class Evaluator:
    
    def __init__(self, labels, scores, threshold=0):
        self.threshold = 0
        self.count = len(labels)
        
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0
        
        for (l,s) in zip(labels,scores):
            if self.is_true(l) and self.is_true(s):
                self.tp += 1
            if self.is_true(l) and not self.is_true(s):
                self.fn += 1
            if not self.is_true(l) and self.is_true(s):
                self.fp += 1
            if not self.is_true(l) and not self.is_true(s):
                self.tn += 1
        self.precision = self.get_precision()
        self.recall = self.get_precision()
        self.f1 = self.get_f1()
        self.error_rate = self.get_error_rate()
        
    def calculate_contingency(self, label, contingency):
        
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0
        
        for (l,s) in zip(labels,scores):
            if self.is_true(l) and self.is_true(s):
                self.tp += 1
            if self.is_true(l) and not self.is_true(s):
                self.fn += 1
            if not self.is_true(l) and self.is_true(s):
                self.fp += 1
            if not self.is_true(l) and not self.is_true(s):
                self.tn += 1
    
    def is_true(self, label):
        return label > self.threshold
    
    def get_error_rate(self):
        return float(self.tp + self.tn) / self.count
    
    def get_precision(self):
        # self.calculate_contingency()
        if self.tp == 0: return 0
        return float(self.tp) / (self.tp + self.fp)
        
    def get_recall(self):
        # self.calculate_contingency()
        if self.tp == 0: return 0
        return float(self.tp) / (self.tp + self.fn)
    
    def get_f1(self):
        return 2 * (self.get_precision() * self.get_recall()) / (self.get_precision() + self.get_recall())

### Input/Output directories

In [6]:
#sc = SparkContext("", "Generate Inverted Index Job")
es_server = "deka.cip.ifi.lmu.de"
es_port = "9200"

original_parent_save_location = "hdfs://deka.cip.ifi.lmu.de/svm/new/"
save_parent_location = original_parent_save_location
sample_save_parent_location = save_parent_location + "sample/"
if IS_SAMPLE: 
    save_parent_location = save_parent_location + "sample/"

file_name = "sample.json"
test_file_name = "sample.json"
#url = "/media/Work/workspace/thesis/benchmark/output/" + file_name
sample_location = save_parent_location + file_name
sample_test_location = save_parent_location + test_file_name
docs_output = save_parent_location + "docs_output"
postings_list_output = save_parent_location + "postings_list_full.json"

accepted_terms_list_output = original_parent_save_location + "accepted_terms_list_{}.pkl"
accepted_terms_with_scores_list_output = original_parent_save_location + "accepted_terms_with_scores_list_{}.pkl"
postings_list_chi_selected_output = original_parent_save_location + "postings_list_{}.json"
term_df_map_output = original_parent_save_location + "term_df_map_output_{}.json"
doc_index_chi_selected_output = original_parent_save_location + "doc_index_for_postings_{}.json"
term_dictionary_output = original_parent_save_location + "term_dictionary_{}.pkl"


postings_list_training_chi_selected_output = save_parent_location + "training_postings_list_{}.json"
postings_list_validation_chi_selected_output = save_parent_location + "validation_postings_list_{}.json"
postings_list_test_chi_selected_output = save_parent_location + "test_postings_list_{}.json"

# Classification objects, unrelated to sample size
classification_index_output = original_parent_save_location + "classification_index.pkl"
doc_classification_map_output = original_parent_save_location + "doc_classification_map.pkl"
sections_output = original_parent_save_location + "sections.pkl"
classes_output = original_parent_save_location + "classes.pkl"
subclasses_output = original_parent_save_location + "subclasses.pkl"
classifications_output = original_parent_save_location + "classifications.pkl"
doc_lengths_map_output = original_parent_save_location + "doc_lengths_map.pkl"
# training, validation and test set lists
training_docs_list_output = original_parent_save_location + "training_docs_list.pkl"
validation_docs_list_output = original_parent_save_location + "validation_docs_list.pkl"
test_docs_list_output = original_parent_save_location + "test_docs_list.pkl"
sample_training_docs_list_output = sample_save_parent_location + "training_docs_list.pkl"


training_predictions_sections_output = save_parent_location + "training_predictions_sections_list.pkl"
training_labels_sections_list_output = save_parent_location + "training_labels_sections_list.pkl"
valdiation_predictions_sections_output = save_parent_location + "validation_predictions_sections_list.pkl"
validation_labels_sections_list_output = save_parent_location + "validation_labels_sections_list.pkl"


test_postings_list_output = save_parent_location + "test_postings_list_50000.json"
training_errors_output = save_parent_location + "training_errors.json"
model_output = save_parent_location + "models/" + "iter_" + str(SVM_ITERATIONS) + "_reg_" + str(SVM_REG) + "/"

In [7]:
def get_model_name(method, classification, reg=SVM_REG, iterations=SVM_ITERATIONS):
    return save_parent_location + "models/" + "iter_" + str(iterations) + "_reg_" + str(reg) + "/" + method + "_" + classification + "_model.svm"
def get_data_output_name(method, no_of_features=TOP_N_FEATURES, data_type="training"):
    return save_parent_location + "models/" + data_type + "_data/" + method  + "_data.json"
def get_data_classification_output_name(method, classification, data_type="training"):
    return save_parent_location + "models/" + data_type + "_data/" + method + "_" + classification + "_data.json"
def get_prediction_output_name(method, data_type="training", subset="sections", reg=SVM_REG, iterations=SVM_ITERATIONS):
    return save_parent_location + "models/" + "iter_" + str(iterations) + "_reg_" + str(reg) + "/" + method + "_" + data_type + "_" + subset + "_predictions.svm"
def get_labels_output_name(data_type="training", subset="sections", reg=SVM_REG, iterations=SVM_ITERATIONS):
    return save_parent_location + "models/" + "iter_" + str(iterations) + "_reg_" + str(reg) + "/" + data_type + "_" + subset + "_labels.svm"
def get_metrics_output_name(method, data_type="training", subset="sections", reg=SVM_REG, iterations=SVM_ITERATIONS):
    return save_parent_location + "models/" + "iter_" + str(iterations) + "_reg_" + str(reg) + "/" + method + "_" + data_type + "_" + subset + "_metrics.pkl"
def get_save_location(location, sample=False):
    if sample:
        return location.replace(save_parent_location, sample_save_parent_location)
    return location

### Document RDDs

In [8]:
%%time

read_conf = {
    'es.nodes': es_server,
    'es.port': es_port,
    'es.resource': 'patents3/patent',
    'es.query': '{ "query" : { "match_all" : {} }}',
    'es.scroll.keepalive': '120m',
    'es.scroll.size': '1000',
    'es.http.timeout': '20m'
}
data = sc.newAPIHadoopRDD(
    inputFormatClass = 'org.elasticsearch.hadoop.mr.EsInputFormat',
    keyClass = 'org.apache.hadoop.io.NullWritable', 
    valueClass = 'org.elasticsearch.hadoop.mr.LinkedMapWritable',
    conf = read_conf
)

#data = sc.textFile(sample_location)
#doc_count = data.count()
#doc_objs = data.persist(pyspark.StorageLevel.MEMORY_AND_DISK_SER)
doc_objs = data

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD.
: org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Cannot detect ES version - typically this happens if the network/Elasticsearch cluster is not accessible or when targeting a WAN/Cloud instance without the proper setting 'es.nodes.wan.only'
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:190)
	at org.elasticsearch.hadoop.rest.RestService.findPartitions(RestService.java:231)
	at org.elasticsearch.hadoop.mr.EsInputFormat.getSplits(EsInputFormat.java:457)
	at org.elasticsearch.hadoop.mr.EsInputFormat.getSplits(EsInputFormat.java:438)
	at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:120)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
	at scala.Option.getOrElse(Option.scala:120)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
	at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1307)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.take(RDD.scala:1302)
	at org.apache.spark.api.python.SerDeUtil$.pairRDDToPython(SerDeUtil.scala:201)
	at org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:530)
	at org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.elasticsearch.hadoop.rest.EsHadoopNoNodesLeftException: Connection error (check network and/or proxy settings)- all nodes failed; tried [[141.84.220.203:9200]] 
	at org.elasticsearch.hadoop.rest.NetworkClient.execute(NetworkClient.java:142)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:434)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:414)
	at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:418)
	at org.elasticsearch.hadoop.rest.RestClient.get(RestClient.java:122)
	at org.elasticsearch.hadoop.rest.RestClient.esVersion(RestClient.java:564)
	at org.elasticsearch.hadoop.rest.InitializationUtils.discoverEsVersion(InitializationUtils.java:178)
	... 32 more


#### Loading document texts from HDFS

In [18]:
doc_text_objs = sc.textFile(docs_output).map(lambda x: eval(x))

In [15]:
%%time
### doc_objs = data.map(lambda x: json.loads(x))

doc_class_map = doc_objs.map(lambda (doc_id, doc): (doc_id, get_classes(doc['classification-ipc']))).cache()
doc_classification_map = doc_class_map.map(lambda (doc_id, classification_obj): (doc_id, sorted(reduce(lambda x, lst: x + lst, classification_obj.values(), [])))).collectAsMap()
doc_count = len(doc_classification_map)
# contains [(classification,  list of docs)]
# second list comprehension is to get list of lists [["A", "B"],["A-01","B-03"]] to one list ["A", "B", "A-01","B-03"], we could have also used a reduce as in doc_classifications_map
classifications_index = doc_class_map.flatMap(lambda (doc_id, classifications_obj): [(classification, doc_id) for classification in [classif for cat in classifications_obj.values() for classif in cat]])\
    .groupByKey().map(lambda (classf, classf_docs): (classf, list(set(classf_docs)))).collectAsMap()

sections = sorted(doc_class_map.flatMap(lambda (doc_id, classifications): classifications['sections']).distinct().collect())
classes = sorted(doc_class_map.flatMap(lambda (doc_id, classifications): classifications['classes']).distinct().collect())
subclasses = sorted(doc_class_map.flatMap(lambda (doc_id, classifications): classifications['subclasses']).distinct().collect())
classifications = sorted(classifications_index.keys(), cmp=compare_classifications)
# classifications = sorted(set(reduce(lambda x, lst: x + lst, map(lambda doc_id: classifications_index[doc_id], classifications_index), [])))

CPU times: user 28.3 s, sys: 6.48 s, total: 34.8 s
Wall time: 15min 29s


#### Save classification objects

In [16]:
%%time
sc.parallelize(doc_classification_map.items()).repartition(1).saveAsPickleFile(doc_classification_map_output)
sc.parallelize(classifications_index.items()).repartition(1).saveAsPickleFile(classification_index_output)
sc.parallelize(sections).repartition(1).saveAsPickleFile(sections_output)
sc.parallelize(classes).repartition(1).saveAsPickleFile(classes_output)
sc.parallelize(subclasses).repartition(1).saveAsPickleFile(subclasses_output)
sc.parallelize(classifications).repartition(1).saveAsPickleFile(classifications_output)

CPU times: user 32.2 s, sys: 2.54 s, total: 34.8 s
Wall time: 1min 16s


#### Load Classification Objects

In [9]:
doc_classification_map = dict(sc.pickleFile(doc_classification_map_output).collect())
doc_count = len(doc_classification_map)
classifications_index = dict(sc.pickleFile(classification_index_output).collect())
sections = sc.pickleFile(sections_output).collect()
classes = sc.pickleFile(classes_output).collect()
subclasses = sc.pickleFile(subclasses_output).collect()
classifications = sc.pickleFile(classifications_output).collect()

In [11]:
import cPickle as pickle
pickle.dump(classes, open('/big/s/shalaby/exported_data/classes.pkl', 'w'))
pickle.dump(subclasses, open('/big/s/shalaby/exported_data/subclasses.pkl', 'w'))
pickle.dump(classifications, open('/big/s/shalaby/exported_data/classifications.pkl', 'w'))
pickle.dump(classifications_index, open('/big/s/shalaby/exported_data/classification_index.pkl', 'w'))

In [31]:
# accelerates the chi squared calculation a lot
classifications_index_set = {k:set(docs) for k,docs in classifications_index.iteritems()}

In [49]:
doc_count

2009750

In [11]:
classifications_index.items()[0]

(u'G-20-B', [u'07433566', u'07896523', u'06985663', u'07116477', u'07218441'])

In [14]:
doc_classification_map.items()[10]

(u'07007598', [u'B', u'B-30', u'B-30-B'])

In [50]:
sections

[u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H']

## Creating Training, Validation and Test Splits

In [None]:
# Get min number of documents for any classification
min = 1000
from collections import defaultdict
min_classf = defaultdict(list)
for (classf, documents) in classifications_index.items():
    if len(documents) == 2: 
        min = len(documents)
        min_classf[classf].append(min)
min_classf, min
        

In [35]:
len(min_classf)

760

In [20]:
len(classifications_index)

2235

In [None]:
training_documents = set()
validation_documents = set()
test_documents = set()
for (classf, documents) in classifications_index.items():
    # only worry about subclasses, classes and sections will be already included
    if(classf in sections or classf in classes): pass
    
    # remove any documents that have already been picked before
    docs_set = set(documents)
    docs_set-=training_documents
    docs_set-=validation_documents
    docs_set-=test_documents
    
    base_test_docs_num = int(len(docs_set)* TEST_SET_PERCENTAGE)
    num_test_docs = base_test_docs_num if base_test_docs_num > 0 else MIN_DOCUMENTS_FOR_TEST if MIN_DOCUMENTS_FOR_TEST < len(docs_set) else 0
    print len(docs_set), num_test_docs
    classif_test_docs = random.sample(docs_set, num_test_docs)
    
    remaining_docs = docs_set.difference(set(classif_test_docs))
    base_validation_docs_num = int(len(remaining_docs)* VALIDATION_IN_TRAINING_PERCENTAGE)
    num_validation_docs = base_validation_docs_num if base_validation_docs_num > 0 else MIN_DOCUMENTS_FOR_VALIDATION if MIN_DOCUMENTS_FOR_VALIDATION < len(remaining_docs) else 0
    classif_validation_docs = random.sample(remaining_docs, num_validation_docs)
    
    classif_training_docs = set(remaining_docs).difference(set(classif_validation_docs))
    
    training_documents.update(classif_training_docs)
    validation_documents.update(classif_validation_docs)
    test_documents.update(classif_test_docs)

#### Save the training, validation and test document lists

In [25]:
sc.parallelize(training_documents).saveAsPickleFile(training_docs_list_output)
sc.parallelize(validation_documents).saveAsPickleFile(validation_docs_list_output)
sc.parallelize(test_documents).saveAsPickleFile(test_docs_list_output)

#### Load the training, validation and test document lists

In [8]:
training_documents = sc.pickleFile(training_docs_list_output).collect()
validation_documents = sc.pickleFile(validation_docs_list_output).collect()
test_documents = sc.pickleFile(test_docs_list_output).collect()

In [10]:
len(set(test_documents))

401877

In [9]:
len(training_documents)

1286325

In [12]:
len(validation_documents)

321473

In [10]:
import cPickle as pickle
pickle.dump(training_documents, open('/big/s/shalaby/exported_data_merged/training_docs_list.pkl', 'w'))

## Creating Sample

In [58]:
sample_training_documents = set()
i = 0
for (classf, documents) in classifications_index.items():
    if len(documents) > MIN_TRAINING_SAMPLES:
        base_sample_docs_len = int(len(documents)* TRAINING_SAMPLE_PERCENTAGE)
        num_sample_docs = base_sample_docs_len if base_sample_docs_len > 0 else MIN_TRAINING_SAMPLES
        #print "%s: Total %d, sample: %d" % (classf, len(documents), num_sample_docs)
        classif_training_docs = random.sample(documents, num_sample_docs)
        
        sample_training_documents.update(set(classif_training_docs))
    else:
        sample_training_documents.update(documents)
    i+=1
    
    #if i > 100: break
len(sample_training_documents)
#sc.parallelize(sample_training_documents).saveAsPickleFile(sample_training_docs_list_output)

17229

In [59]:
sc.parallelize(sample_training_documents).saveAsPickleFile(sample_save_parent_location + str(TRAINING_SAMPLE_PERCENTAGE) + "_sample.pkl")

In [11]:
sample_training_documents = sc.pickleFile(sample_training_docs_list_output).collect()

In [60]:
training_documents = sample_training_documents

### Section Distribution

In [None]:
for classif in sorted(classifications_index.keys()):
    if len(classif) == 1:
        print "%s : %d, %.3f" % (classif, len(set(classifications_index[classif])), float(len(classifications_index[classif]))/doc_count)

### Section Overlap

In [None]:
%%time
overlap_df = pd.DataFrame({section: [0]*len(sections) for section in sections} , index=sections, columns=sections)
for doc_id in doc_classification_map:
    for classif in doc_classification_map[doc_id]:
        if len(classif) == 1:
            for classif2 in doc_classification_map[doc_id]:
                if len(classif2) == 1:
                    overlap_df[classif][classif2] += 1
overlap_df

In [None]:
mpl.colors.Normalize(1,3)

In [None]:
overlap_df.values

In [None]:
fig = plt.figure(figsize=(16,8), dpi=120)
#ax = fig.add_subplot(111, frameon=True, xticks=[], yticks=[])
vals = overlap_df.values
normal = mpl.colors.Normalize()
normal = mpl.colors.Normalize(vals.min()-1, vals.max()+vals.max()/2)
formatter = lambda x: "{:,d}".format(int(x))

the_table=plt.table(cellText=np.vectorize(formatter)(vals), rowLabels=overlap_df.index, colLabels=overlap_df.columns, 
                    colWidths = [0.1]*(vals.shape[1]+3), loc='center',
                    cellColours=plt.cm.YlGn(normal(vals)))
the_table.set_fontsize(30)
the_table.scale(2, 4)
plt.axis("off")
plt.show()

### Create Postings List

In [10]:
#%%time
# Create Postings List (old one)
#postings_lists = doc_text_objs.flatMap(lambda (doc_id, doc): stemtokenizer(doc['description'], doc_id)).reduceByKey(lambda x,y: merge_postings(x,y))
### postings_lists = doc_objs.flatMap(lambda x: stemtokenizer(x['description'], x['id'])).reduceByKey(lambda x,y: merge_postings(x,y))
#min_doc_postings_lists = postings_lists.filter(lambda (x,y): len(y) > MIN_DOCUMENTS)
#number_of_terms = min_doc_postings_lists.count()

In [None]:
%%time
# Create Postings List
postings_lists = doc_text_objs.flatMap(lambda (doc_id, doc): stemtokenizer(doc, doc_id)).reduceByKey(lambda x,y: merge_postings(x,y))
### postings_lists = doc_objs.flatMap(lambda x: stemtokenizer(x['description'], x['id'])).reduceByKey(lambda x,y: merge_postings(x,y))
min_doc_postings_lists = postings_lists.filter(lambda (x,y): len(y) > MIN_DOCUMENTS)
#number_of_terms = min_doc_postings_lists.count()

# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_output)

def get_chi_index(term_index, classifications_index, subclasses, number_of_docs):
    return term_index.map(lambda (term, postings_list): (term, calculate_chi_squared(postings_list.keys(), classifications_index, subclasses, number_of_docs)))

def calculate_chi_squared(document_list, classifications_index, subclasses, number_of_docs):
    chi_score = 0
    for subclass in subclasses:
        Nt1 = len(document_list) # actual collection frequency of having the word
        Nt0 = number_of_docs - len(document_list) # actual collection frequency of not having the word
        Pt1 = float(len(document_list))/ number_of_docs
        Pt0 = float(number_of_docs - len(document_list))/ number_of_docs
        Pc1 = float(len(classifications_index[subclass]))/ number_of_docs
        Et1c1 = Pt1 * Pc1 * number_of_docs # expected frequency of docs in subclass with term (assuming independence)
        Et0c1 = Pt0 * Pc1 * number_of_docs # expected frequency of docs in subclass without term (assuming independence)
        chi_score += math.pow( Nt1 - Et1c1, 2) / Et1c1 
        chi_score += math.pow( Nt0 - Et0c1, 2) / Et0c1
    return chi_score

term_accepted_chi_list = get_chi_index(min_doc_postings_lists, classifications_index, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)
term_accepted_chi_list = map(lambda (x,y): x, term_accepted_chi_list)

# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)

min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()

number_of_terms = min_doc_postings_lists.count()
number_of_terms

min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output)

### Get Document Lengths

In [19]:
doc_lengths_dict = doc_text_objs.map(lambda (doc_id, document_text): (doc_id, len(document_text))).collectAsMap()

In [23]:
avg_doc_length = sum(doc_lengths_dict.values())/len(doc_lengths_dict)

In [21]:
doc_lengths_dict.items()[0]

(u'08369259', 85861)

In [24]:
avg_doc_length

46477

### Save Postings List

In [None]:
%%time
# Save Postings List
# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_output)

### Load Postings List

In [12]:
# Load Postings Lists
min_doc_postings_lists = sc.textFile(postings_list_output).map(lambda json_postings: json.loads(json_postings))

In [13]:
def get_chi_index(term_index, classifications_index_set, subclasses, number_of_docs):
    return term_index.map(lambda (term, postings_list): (term, calculate_chi_squared(postings_list.keys(), classifications_index_set, subclasses, number_of_docs)))

def calculate_chi_squared(document_list, classifications_index_set, subclasses, number_of_docs):
    """
    Chi squared is the ratio of the difference between actual frequency and expected frequency of a term relative to the expected frequency
    summed up across all classes and whether the term appears or not
    Here we calculate the average chi squared score which is one of two options in multi-lable classification (the other being max)
    """
#     chi_score = 0
#     Nt1 = len(document_list) # actual collection frequency of having the word
#     Nt0 = number_of_docs - len(document_list) # actual collection frequency of not having the word
#     Pt1 = float(len(document_list))/ number_of_docs # probability of the term happening
#     Pt0 = float(number_of_docs - len(document_list))/ number_of_docs # probablility of the term not happening
#     print "Docs Stats: Term present in %d (%.7f), Not Present in %d (%.7f) " % (Nt1, Pt1, Nt0, Pt0)
#     for subclass in subclasses:
#         Pc1 = float(len(classifications_index[subclass]))/ number_of_docs # probability of the class happening
#         Et1c1 = Pt1 * Pc1 * number_of_docs # expected frequency of docs in subclass with term (assuming independence)
#         Et0c1 = Pt0 * Pc1 * number_of_docs # expected frequency of docs in subclass without term (assuming independence)
#         chi_score += float(math.pow( Nt1 - Et1c1, 2)) / Et1c1
#         chi_score += float(math.pow( Nt0 - Et0c1, 2)) / Et0c1
#         print "subclass %s: %.7f, %d, %d, %.7f" % (subclass, Pc1, Et1c1, Et0c1, chi_score)
#     return chi_score
    chi_score = 0
    N = len(document_list)
    doc_set = set(document_list)
    Nt1 = N # actual collection frequency of having the word
    Nt0 = number_of_docs - N # actual collection frequency of not having the word
    Pt1 = float(N)/ number_of_docs # probability of the term happening
    Pt0 = float(number_of_docs - N)/ number_of_docs # probablility of the term not happening
    #print "Docs Stats: Term present in %d (%.7f), Not Present in %d (%.7f) " % (Nt1, Pt1, Nt0, Pt0)
    for subclass in subclasses:
        Pc1 = float(len(classifications_index_set[subclass]))/ number_of_docs # probability of the class happening
        Pc0 = 1 - Pc1
        Pt1c1 = float(len(doc_set & classifications_index_set[subclass])) / number_of_docs
        Pt1c0 = Pt1 - Pt1c1
        Pt0c1 = Pc1 - Pt1c1
        Pt0c0 = 1 - Pt1c0 - Pt0c1 - Pt1c1
        
        cat_chi_score = (number_of_docs * math.pow(Pt1c1 * Pt0c0 - Pt1c0 * Pt0c1, 2))/(Pt1 * Pt0 * Pc1 * Pc0)
        # calculate average chi score
        chi_score += Pc1 * cat_chi_score
        #print "subclass %s: %.7f, %.7f, %.7f, %.7f, %.7f, %.7f" % (subclass, Pc1, Pt1c1, Pt1c0, Pt0c1, Pt0c0, chi_score)
    return chi_score

In [12]:
min_doc_postings_lists.count()

44846888

In [None]:
# min_doc_postings_lists = sc.parallelize(min_doc_postings_lists.take(10000))

# term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)


### Order by Chi Squared and get Top features

In [None]:
term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index_set, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)
term_accepted_chi_list = map(lambda (x,y): x, term_accepted_chi_list_with_scores)
# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)
min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()
number_of_terms = min_doc_postings_lists.count()
term_df_map = min_doc_postings_lists.map(lambda (term, postings): (term, len(postings))).collectAsMap()

# Save Postings List and the supporting objects
# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list).saveAsPickleFile(accepted_terms_list_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list_with_scores).saveAsPickleFile(accepted_terms_with_scores_list_output.format(str(TOP_N_FEATURES)))

In [43]:
#min_doc_postings_lists.map(lambda postings: json.dumps(postings)).repartition(100).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
#term_df_map = min_doc_postings_lists.map(lambda (term, postings): (term, len(postings))).collectAsMap()
sc.parallelize(term_dictionary.items()).repartition(1).saveAsPickleFile(term_dictionary_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_df_map.items()).saveAsPickleFile(term_df_map_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list).repartition(1).saveAsPickleFile(accepted_terms_list_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list_with_scores).repartition(1).saveAsPickleFile(accepted_terms_with_scores_list_output.format(str(TOP_N_FEATURES)))

In [28]:
term_accepted_chi_list_with_scores[:10]

[(u'server', 65022.23210679769),
 (u'execut', 60767.31792863743),
 (u'network', 58732.915422222875),
 (u'request', 58148.748110792854),
 (u'comput', 55771.137483335304),
 (u'softwar', 52868.551734217064),
 (u'Internet', 51907.35286510473),
 (u'program', 50474.68119787968),
 (u'comput system', 50072.134620861594),
 (u'pharmaceut accept', 49650.26318563324)]

In [30]:
sc.parallelize(term_accepted_chi_list_with_scores).saveAsPickleFile(accepted_terms_with_scores_list_output)

In [26]:
term_accepted_chi_list[:100]

[u'server',
 u'execut',
 u'network',
 u'request',
 u'comput',
 u'softwar',
 u'Internet',
 u'program',
 u'comput system',
 u'pharmaceut accept',
 u'memori',
 u'pharmaceut',
 u'hardwar',
 u'client',
 u'instruct',
 u'inform',
 u'manag',
 u'pharmaceut composit',
 u'oper system',
 u'oral',
 u'processor',
 u'updat',
 u'surfac',
 u'information',
 u'data',
 u'memory',
 u'administration',
 u'store',
 u'therapeut',
 u'user',
 u'substrat number_inidicator',
 u'administ',
 u'protein',
 u'access',
 u'dosag',
 u'computer',
 u'assay',
 u'disk',
 u'etch',
 u'code',
 u'diseas',
 u'resourc',
 u'logic',
 u'commun',
 u'memori number_inidicator',
 u'server number_inidicator',
 u'substrat',
 u'network number_inidicator',
 u'acid',
 u'software',
 u'vivo',
 u'amino',
 u'implement',
 u'databas',
 u'vitro',
 u'retriev',
 u'RAM',
 u'parenter',
 u'Pharmaceut',
 u'send',
 u'accept salt',
 u'block diagram',
 u'messag',
 u'effect amount',
 u'storag',
 u'interfac',
 u'purifi',
 u'substrate',
 u'incub',
 u'number_inid

#### Recreate term dictionary with just the accepted terms

In [36]:
# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)

finished 10000


In [37]:
min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()

In [22]:
number_of_terms = min_doc_postings_lists.count()
number_of_terms

100000

#### Save Reduced Postings List

In [50]:
# Save Postings List
## min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
#sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output)
#sc.parallelize(term_accepted_chi_list).saveAsPickleFile(accepted_terms_list_output)

#### Load Reduced Postings List

In [11]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)).map(lambda json_postings: json.loads(json_postings)).cache()
term_dictionary = dict(sc.pickleFile(term_dictionary_output).collect())
number_of_terms = min_doc_postings_lists.count()

SyntaxError: invalid syntax (<ipython-input-11-89dbcbd1ab08>, line 2)

#### Collect document lengths

In [None]:
# need to collect the document lengths since they are used in the BM25 calculation
all_doc_index = create_doc_index(min_doc_postings_lists, term_dictionary)

doc_lengths_rdd = all_doc_index.mapValues(lambda postings_dictionary: reduce(lambda x, term: x + postings_dictionary[term], postings_dictionary, 0))
avg_doc_length = doc_lengths_rdd.map(lambda (term, count): count).reduce(lambda count1, count2: count1 + count2) / doc_count
doc_lengths_dict = doc_lengths_rdd.collectAsMap()

In [53]:
all_doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(doc_index_chi_selected_output.format(str(TOP_N_FEATURES)))

Save Document Lengths

In [22]:
sc.parallelize(doc_lengths_dict.items()).saveAsPickleFile(doc_lengths_map_output)

In [None]:
all_doc_index.take(1)

In [None]:
# all_doc_index.saveAsPickleFile(doc_index_chi_selected_output)

Load Document Lengths

In [None]:
doc_lengths_dict = dict(sc.pickleFile(doc_lengths_map_output).collect())
avg_doc_length = sum(doc_lengths_dict.values())/len(doc_lengths_dict)

In [24]:
doc_lengths_dict.items()[0]

(u'08226314', 3466)

In [25]:
len(doc_lengths_dict)

2009750

### Load everything for training

In [13]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES))).map(lambda json_postings: json.loads(json_postings)).cache()
term_dictionary = dict(sc.pickleFile(term_dictionary_output.format(str(TOP_N_FEATURES))).collect())
term_df_map = dict(sc.pickleFile(term_df_map_output.format(str(TOP_N_FEATURES))).collect())
number_of_terms = len(term_df_map) # min_doc_postings_lists.count()
doc_lengths_dict = dict(sc.pickleFile(doc_lengths_map_output).collect())
avg_doc_length = sum(doc_lengths_dict.values())/len(doc_lengths_dict)
#all_doc_index = sc.textFile(doc_index_chi_selected_output.format(str(TOP_N_FEATURES))).map(lambda json_postings: json.loads(json_postings)).cache()

In [15]:
all_doc_index = all_doc_index.map(lambda (doc_id, postings): (doc_id, {int(key): postings[key] for key in postings})).cache()

### Get min_doc_postings_lists for the sample only

In [62]:
training_docs_set = set(training_documents)
validation_docs_set = set(validation_documents)
test_docs_set = set(test_documents)

In [137]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES))).map(lambda json_postings: json.loads(json_postings)).cache()
min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in training_docs_set or doc_id in validation_docs_set or doc_id in test_docs_set}))

In [39]:
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_save_location(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE))

#### Only training

In [19]:
training_postings_output = get_save_location(postings_list_training_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE)

Creating

In [63]:
training_min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in training_docs_set}))
# training_min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(training_postings_output)

In [20]:
training_min_doc_postings_lists = sc.textFile(training_postings_output).map(get_json)

#### Only validation

In [40]:
validation_postings_output = get_save_location(postings_list_validation_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE)

Creating

In [121]:
validation_min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in validation_docs_set}))
validation_min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(validation_postings_output)

In [41]:
validation_min_doc_postings_lists = sc.textFile(validation_postings_output).map(get_json)

#### Only test

In [16]:
test_postings_output = get_save_location(postings_list_test_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE)

In [124]:
test_min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in test_docs_set}))
test_min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(test_postings_output)

In [17]:
test_min_doc_postings_lists = sc.textFile(test_postings_output).map(get_json)

### Start creating term weighting postings

In [15]:
def create_written_doc_index(term_index, name, data_type="training"):
    doc_index = create_doc_index(term_index, term_dictionary)
    output_name = get_data_output_name(name, data_type=data_type)
    doc_index.map(lambda postings: json.dumps(postings)).repartition(100).saveAsTextFile(output_name)
    doc_index = sc.textFile(output_name).map(get_json_convert_num)# .cache()
    return doc_index

def read_written_doc_index(name, data_type="training"):
    output_name = get_data_output_name(name, data_type=data_type)
    doc_index = sc.textFile(output_name).map(get_json_convert_num)
    return doc_index

#### Create Training Set

In [None]:
%%time

tf_postings = training_min_doc_postings_lists
# tf_doc_index_training = create_written_doc_index(tf_postings, "tf")

# sublinear_tf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_sublinear_tf(tf) for docId, tf in postings.items()})
# sublinear_tf_doc_index_training = create_written_doc_index(sublinear_tf_postings, "tf-sublinear")

# tf_idf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_tf_idf(tf, len(postings), len(training_documents)) for docId, tf in postings.items()})
# tf_idf_doc_index_training = create_written_doc_index(tf_idf_postings, "tf-idf")

# sublinear_tf_idf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_sublinear_tf_idf(tf, len(postings), len(training_documents)) for docId, tf in postings.items()})
# sublinear_tf_idf_doc_index_training = create_written_doc_index(sublinear_tf_idf_postings, "sublinear-tf-idf")

# bm25_postings = tf_postings.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), len(training_documents), doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
# bm25_doc_index_training = create_written_doc_index(bm25_postings, "bm25")

bm25_postings = tf_postings.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), len(training_documents), doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
bm25_doc_index_training = create_written_doc_index(bm25_postings, "bm25_0.001_sample")

Read Training Set

In [16]:
tf_doc_index_training = read_written_doc_index("tf")
sublinear_tf_doc_index_training = read_written_doc_index("tf-sublinear")
tf_idf_doc_index_training = read_written_doc_index("tf-idf")
sublinear_tf_idf_doc_index_training = read_written_doc_index("sublinear-tf-idf")
bm25_doc_index_training = read_written_doc_index("bm25")

#### Create Validation Set

In [62]:
%%time

tf_postings_validation = validation_min_doc_postings_lists
# tf_doc_index_validation = create_written_doc_index(tf_postings_validation, "tf", data_type="validation")

# sublinear_tf_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId:  calculate_sublinear_tf(tf) for docId, tf in postings.items()})
# sublinear_tf_doc_index_validation = create_written_doc_index(sublinear_tf_postings_validation, "tf-sublinear", data_type="validation")

# tf_idf_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId:  calculate_tf_idf(tf, len(postings), len(validation_documents)) for docId, tf in postings.items()})
# tf_idf_doc_index_validation = create_written_doc_index(tf_idf_postings_validation, "tf-idf", data_type="validation")

sublinear_tf_idf_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId:  calculate_sublinear_tf_idf(tf, len(postings), len(validation_documents)) for docId, tf in postings.items()})
sublinear_tf_idf_doc_index_validation = create_written_doc_index(sublinear_tf_idf_postings_validation, "sublinear-tf-idf", data_type="validation")

# bm25_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), len(validation_documents), doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
# bm25_doc_index_validation = create_written_doc_index(bm25_postings_validation, "bm25", data_type="validation")

CPU times: user 3.07 s, sys: 156 ms, total: 3.23 s
Wall time: 12min 39s


Read Validation Set

In [17]:
tf_doc_index_validation = read_written_doc_index("tf", data_type="validation")
sublinear_tf_doc_index_validation = read_written_doc_index("tf-sublinear", data_type="validation")
tf_idf_doc_index_validation = read_written_doc_index("tf-idf", data_type="validation")
sublinear_tf_idf_doc_index_validation = read_written_doc_index("sublinear-tf-idf", data_type="validation")
bm25_doc_index_validation = read_written_doc_index("bm25", data_type="validation")

In [71]:
def jsonKV2str(x):
    if isinstance(x, dict):
            return {int(k):(int(v) if isinstance(v, unicode) else v) for k,v in x.items()}
    return x

output_namee = "hdfs://deka.cip.ifi.lmu.de/svm/new/lskd4.json"
dd = {"232323":{3:2},"oooidii": {3:4}}
#sc.parallelize(dd.items()).take(1)
#sc.parallelize(dd.items()).map(lambda postings: json.dumps(postings)).saveAsTextFile(output_namee)
sc.parallelize(dd.items()).map(lambda postings: json.dumps(postings)).take(1)
sc.parallelize(dd.items()).map(lambda postings: json.dumps(postings)).map(lambda postings: json.loads(postings, object_hook=jsonKV2str)).collect()

#map(json.dumps,dd.items() )

[[u'oooidii', {3: 4}], [u'232323', {3: 2}]]

In [None]:
tf_idf_doc_index_validation.take(1)

### (OLD) Start creating term weighting postings

In [None]:
%%time
tf_postings = min_doc_postings_lists
tf_doc_index_training = all_doc_index.filter(lambda (doc_id, postings): doc_id in training_documents).cache()

sublinear_tf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_sublinear_tf(tf) for docId, tf in postings.items()})
sublinear_tf_doc_index = create_doc_index(sublinear_tf_postings, term_dictionary)
sublinear_tf_doc_index_training = sublinear_tf_doc_index.filter(lambda (doc_id, postings): doc_id in training_documents).cache()

tf_idf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_tf_idf(tf, len(postings), doc_count) for docId, tf in postings.items()})
tf_id_doc_index = create_doc_index(tf_postings, term_dictionary)
tf_id_doc_index_training = tf_id_doc_index.filter(lambda (doc_id, postings): doc_id in training_documents).cache()
tf_id_doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_data_output_name("tf-idf"))

bm25_postings = tf_postings.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), doc_count, doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
bm25_doc_index = create_doc_index(bm25_postings, term_dictionary)
bm25_doc_index_training = bm25_doc_index.filter(lambda (doc_id, postings): doc_id in training_documents).cache()
bm25_doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_data_output_name("bm25"))

#### Create Validation Set

In [17]:
%%time
tf_doc_index_val = all_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()
sublinear_tf_doc_index_val = sublinear_tf_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()
tf_id_doc_index_val = tf_id_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()
bm25_doc_index_val = bm25_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()

CPU times: user 11.2 s, sys: 332 ms, total: 11.5 s
Wall time: 12.5 s


## Actual Training

In [17]:
training_evaluations = {}
validation_evaluations = {}

In [18]:
def model_exists(path):
    try:
        model = SVMModel.load(sc, path)
        return True;
    except:
        return False

In [None]:
%%time
i=0
# for section in sections:
#     classification = section
for clss in classes:
    classification = clss
    print classification
    #if classification == "A" or classification == "B" or classification == "C" or classification == "D": continue
    i+=1
    training_evaluations[classification] = {}
    validation_evaluations[classification] = {}
    representations_to_test = [
#                                ("tf", tf_doc_index_training, tf_doc_index_validation), 
#                                ("tf-sublinear", sublinear_tf_doc_index_training, sublinear_tf_doc_index_validation),
#                                ("tf-idf", tf_idf_doc_index_training, tf_idf_doc_index_validation),  
#                                ("sublinear-tf-idf", sublinear_tf_idf_doc_index_training, sublinear_tf_idf_doc_index_validation), 
                               ("bm25", bm25_doc_index_training, bm25_doc_index_validation)
                                ]
    #representations_to_test = [("tf", tf_doc_index), ("tf-sublinear", sublinear_tf_doc_index), ("tf-idf", tf_id_doc_index), ("bm25", bm25_doc_index)]
    
    for name, doc_index, val_doc_index in representations_to_test:
        try:
            print "Trying: " + name
            model_path = get_model_name(name, classification)
            if not model_exists(model_path):
                training_vectors, svm = train_level_new(doc_index, classification, doc_classification_map, number_of_terms)
                svm.save(sc, model_path)
            else:
                print "Model Exists"
        except:
            print "Problem creating: %s: %s" % (classification, name)
            raise
#         print "Trying: " + name
#         docs_with_classes = doc_index.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
#         training_vectors, svm = train_level(docs_with_classes, classification, number_of_terms)
#         svm.save(sc, get_model_name(name, classification))
#         labels = training_vectors.map(lambda p: p.label).collect()
#         predictions = training_vectors.map(lambda p: svm.predict(p.features)).collect()
#         training_evaluations[classification][name] = Evaluator(labels, predictions)
#         # validation
#         print "Validating"
#         validation_vectors = get_labeled_points_from_doc_index(val_doc_index, doc_classification_map, number_of_terms)
#         labels_val = validation_vectors.map(lambda p: p.label).collect()
#         predictions_val = validation_vectors.map(lambda p: svm.predict(p.features)).collect()
#         validation_evaluations[classification][name] = Evaluator(labels_val, predictions_val)
    
#     rf_postings = tf_postings.mapValues(get_rf_postings(classification))
#     rf_doc_index = create_doc_index(rf_postings, term_dictionary)
#     # save the doc index so we don't have to create it again
#     rf_doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_data_classification_output_name("rf", classification))
#     rf_doc_index_training = rf_doc_index.filter(lambda (doc_id, postings): doc_id in training_documents)
#     rf_doc_index_val = rf_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents)
#     docs_with_classes = rf_doc_index_training.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
#     training_vectors, svm = train_level(docs_with_classes, classification, number_of_terms)
#     svm.save(sc, get_model_name("rf", classification))
#     labels = training_vectors.map(lambda p: p.label).collect()
#     predictions = training_vectors.map(lambda p: svm.predict(p.features)).collect()
#     training_evaluations[classification]["rf"] = Evaluator(labels, predictions)
#     # validation
#     validation_vectors = get_labeled_points_from_doc_index(rf_doc_index_val, doc_classification_map, number_of_terms)
#     labels_val = validation_vectors.map(lambda p: p.label).collect()
#     predictions_val = validation_vectors.map(lambda p: svm.predict(p.features)).collect()
#     validation_evaluations[classification][name] = Evaluator(labels_val, predictions_val)
    
    
#     tf_rf_postings = tf_postings.mapValues(get_tf_rf_postings(classification))
#     tf_rf_doc_index = create_doc_index(tf_rf_postings, term_dictionary)
#     # save the doc index so we don't have to create it again
#     tf_rf_doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_data_classification_output_name("tf-rf", classification))
#     tf_rf_doc_index_training = tf_rf_doc_index.filter(lambda (doc_id, postings): doc_id in training_documents)
#     tf_rf_doc_index_val = tf_rf_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents)
#     docs_with_classes = tf_rf_doc_index_training.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
#     training_vectors, svm = train_level(docs_with_classes, classification, number_of_terms)
#     svm.save(sc, get_model_name("tf-rf", classification))
#     labels = training_vectors.map(lambda p: p.label).collect()
#     predictions = training_vectors.map(lambda p: svm.predict(p.features)).collect()
#     training_evaluations[classification]["tf-rf"] = Evaluator(labels, predictions)
#     # validation
#     validation_vectors = get_labeled_points_from_doc_index(tf_rf_doc_index_val, doc_classification_map, number_of_terms)
#     labels_val = validation_vectors.map(lambda p: p.label).collect()
#     predictions_val = validation_vectors.map(lambda p: svm.predict(p.features)).collect()
#     validation_evaluations[classification][name] = Evaluator(labels_val, predictions_val)

A-00
Trying: bm25
Model Exists
A-01
Trying: bm25
Model Exists
A-02
Trying: bm25
Model Exists
A-03
Trying: bm25
Model Exists
A-04
Trying: bm25
Model Exists
A-05
Trying: bm25
Model Exists
A-06
Trying: bm25
Model Exists
A-07
Trying: bm25
Model Exists
A-10
Trying: bm25
Model Exists
A-11
Trying: bm25
Model Exists
A-12
Trying: bm25
Model Exists
A-13
Trying: bm25
Model Exists
A-15
Trying: bm25
Model Exists
A-16
Trying: bm25
Model Exists
A-18
Trying: bm25
Model Exists
A-21
Trying: bm25
Model Exists
A-22
Trying: bm25
Model Exists
A-23
Trying: bm25
Model Exists
A-24
Trying: bm25
Model Exists
A-25
Trying: bm25
Model Exists
A-26
Trying: bm25
Model Exists
A-27
Trying: bm25
Model Exists
A-28
Trying: bm25
Model Exists
A-29
Trying: bm25
Model Exists
A-31
Trying: bm25
Model Exists
A-32
Trying: bm25
Model Exists
A-33
Trying: bm25
Model Exists
A-34
Trying: bm25
Model Exists
A-35
Trying: bm25


In [28]:
model_exists(get_model_name("tf", "A"))

True

In [43]:
training_evaluations = {}
validation_evaluations = {}

classification = "A-01"

training_evaluations[classification] = {}
validation_evaluations[classification] = {}
representations_to_test = [
#     ("tf", tf_doc_index_training, tf_doc_index_validation),
#     ("tf-sublinear", sublinear_tf_doc_index_training, sublinear_tf_doc_index_validation), 
#     ("bm25", bm25_doc_index_training, bm25_doc_index_validation),
#     ("tf-idf", tf_idf_doc_index_training, tf_idf_doc_index_validation)
    ("sublinear-tf-idf", sublinear_tf_idf_doc_index_training, sublinear_tf_idf_doc_index_validation), 
]
name, doc_index, val_doc_index = representations_to_test[0]

In [44]:
#doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_data_output_name(name))

In [45]:
print "Trying: " + name
docs_with_classes = doc_index.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
training_vectors, svm = train_level(docs_with_classes, classification, number_of_terms)
svm.save(sc, get_model_name(name, classification))

Trying: tf


In [None]:
training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(classification, term_list,
                                                                           classifications, number_of_terms))
svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG, validateData=False)

In [23]:
labels = training_vectors.map(lambda p: p.label).collect()
predictions = training_vectors.map(lambda p: svm.predict(p.features)).collect()
training_evaluations[classification][name] = Evaluator(labels, predictions)

In [None]:
# validation
print "Validating"
validation_vectors = get_labeled_points_from_doc_index(val_doc_index, doc_classification_map, number_of_terms)
labels_val = validation_vectors.map(lambda p: p.label).collect()
predictions_val = validation_vectors.map(lambda p: svm.predict(p.features)).collect()
validation_evaluations[classification][name] = Evaluator(labels_val, predictions_val)


### Validation

In [None]:
%%time
representations_to_test = [
#    ("tf", tf_doc_index_validation), 
#    ("tf-sublinear", sublinear_tf_doc_index_validation), 
#    ("tf-idf", tf_idf_doc_index_validation), 
#    ("sublinear-tf-idf", sublinear_tf_idf_doc_index_validation), 
   ("bm25", bm25_doc_index_validation)
]
results = {}
# subset = "sections"
subset = "classes"
for method, val_doc_set in representations_to_test:
# method = representations_fto_test[0][0]
# val_doc_set = representations_to_test[0][1]
    results[method] = {}
    #val_doc_set.cache()
    doc_count = len(validation_documents)
    print doc_count
#     y_score = np.zeros((doc_count, len(sections)))
#     y_true = np.zeros((doc_count, len(sections)))
    y_score = np.zeros((doc_count, len(classes)))
    y_true = np.zeros((doc_count, len(classes)))
    i=0

#     for section in sections:
#         print section
#         classification = section
    for clss in classes:
        print clss
        classification = clss
        
        val_doc_set_vectors = val_doc_set.map(lambda (doc_id, postings): 
                                              get_training_vector(classification, postings, doc_classification_map[doc_id], 
                                                                  number_of_terms) )

        binarySvm = SVMModel.load(sc, get_model_name(method, classification))
        print "Loaded the model"
        binarySvm.clearThreshold()
        %time labels_predictions = val_doc_set_vectors.map(lambda p: (p.label, binarySvm.predict(p.features))).collect()
        #labels = test_labeled_points.map(lambda p: p.labels)
        y_true[:,i] = [label_pred[0] for label_pred in labels_predictions]
        y_score[:,i] = [label_pred[1] for label_pred in labels_predictions]
        i+=1
    y_binary_score = get_binary(y_score)
    results[method]["y_true"] = y_true
    results[method]["y_score"] = y_score
    results[method]["y_binary_score"] = y_binary_score
    metrics = get_metrics(y_true, y_binary_score)
    
    sc.parallelize(y_true).repartition(1).saveAsPickleFile(get_labels_output_name(data_type="validation", subset=subset))
    sc.parallelize(y_score).repartition(1).saveAsPickleFile(get_prediction_output_name(method=method, data_type="validation", subset=subset))
    sc.parallelize(("metrics", json.dumps(metrics))).saveAsTextFile(get_metrics_output_name(method=method, data_type="validation", subset=subset))

321473
A-00
Loaded the model


In [None]:
import cPickle as pickle
pickle.dump(y_true, open('/home/s/shalaby/y_true.pkl'))
pickle.dump(y_score, open('/home/s/shalaby/y_score.pkl'))

In [1]:
method

NameError: name 'method' is not defined

In [78]:
method = 'bm25'
y_true = results[method]['y_true']
y_score = results[method]['y_score']
y_binary_score = results[method]['y_binary_score']

In [None]:
SVMModel.load(sc, "/svm/new/lskdjflsdf")

In [26]:
sc.parallelize(y_true).repartition(1).saveAsPickleFile(get_labels_output_name(data_type="validation", subset="sections"))

In [None]:
sc.parallelize(y_score).repartition(1).saveAsPickleFile(get_prediction_output_name(method=method, data_type="validation", subset="sections"))

#### Load Labels and predictions

In [10]:
method = "sublinear-tf-idf"

In [11]:
get_prediction_output_name(method=method, data_type="validation", subset="sections")

'hdfs://deka.cip.ifi.lmu.de/svm/new/models/iter_1000_reg_0.01/sublinear-tf-idf_validation_sections_predictions.svm'

In [12]:
y_true = np.array(sc.pickleFile(get_labels_output_name(data_type="validation", subset="sections")).collect())
y_score = np.array(sc.pickleFile(get_prediction_output_name(method=method, data_type="validation", subset="sections")).collect())
y_binary_score = get_binary(y_score)

In [25]:
y_true

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [117]:
y_score

array([[ 0.62894004, -4.5963389 , -0.86146197, ..., -9.05835662,
        -4.38929899, -8.42909693],
       [-1.81517444, -2.07990448, -2.53543284, ..., -2.37632697,
        -0.19371968,  0.43062112],
       [-2.57868501, -2.65605152, -3.48412654, ..., -2.90845723,
         0.21642679,  0.02513824],
       ..., 
       [-2.3536382 , -1.56966217, -2.2091566 , ..., -2.19340324,
        -0.28346742,  0.68576182],
       [-0.92857442, -0.63666347, -1.38252274, ..., -0.73433494,
        -0.90276557, -0.79875533],
       [-0.73324228, -0.55649702, -1.26487436, ..., -0.71490296,
        -0.88660098, -0.8348988 ]])

In [85]:
y_score[:,1].shape

(321473,)

In [81]:
y_score.shape

(321473, 8)

In [113]:
y_binary_score

array([[0, 0, 0, ..., 0, 0, 1],
       [1, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0]])

In [27]:
np.sum(y_binary_score[:,3])

1237

In [1]:
1+1

2

In [2]:
method = 'sublinear-tf-idf'
y_true = results[method]['y_true']
y_score = results[method]['y_score']
y_binary_score = results[method]['y_binary_score']

NameError: name 'results' is not defined

In [13]:
metrics = get_metrics(y_true, y_binary_score)
metrics

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'average_num_of_labels': 1.1485630208446744,
 'average_precision_macro': 0.29945386345673969,
 'average_precision_micro': 0.28740309049121615,
 'coverage_error': 4.1294914347394647,
 'f1_macro': 0.1032281117589451,
 'f1_micro': 0.21762015163857384,
 'f1_scores_array': [0.10546985326752116,
  0.025147841804357647,
  0.09304370481339153,
  0.0,
  0.0,
  0.0049892774300844675,
  0.33211480927449516,
  0.26505940748171086],
 'precision_macro': 0.13921380824027155,
 'precision_micro': 0.27817967202057248,
 'precision_scores_array': [0.1440220385674931,
  0.1426342767996434,
  0.10077245952523721,
  0.0,
  0.0,
  0.07019704433497537,
  0.35879643657960125,
  0.297288210115222],
 'recall_macro': 0.091781605664028196,
 'recall_micro': 0.17871419595268015,
 'recall_scores_array': [0.08319894754710576,
  0.013789537188658106,
  0.08641601887420224,
  0.0,
  0.0,
  0.0025865589690066706,
  0.30912682527803403,
  0.23913495745521876],
 'topN_avg': 3.2088231360020902,
 'topN_list': [5,
  4,
  3,
 

In [14]:
sc.parallelize(("metrics", json.dumps(metrics))).saveAsTextFile(get_metrics_output_name(method=method, data_type="validation", subset="sections"))

In [102]:
get_metrics_output_name(method=method, data_type="validation", subset="sections")

'hdfs://deka.cip.ifi.lmu.de/svm/new/models/iter_1000_reg_0.01/bm25_validation_sections_metrics.pkl'

In [100]:
SVM_REG

0.01

#### Load Metrics

In [112]:
method = "tf"

In [113]:
loaded_metrics = json.loads(sc.textFile(get_metrics_output_name(method=method, data_type="validation", subset="sections")).collect()[1])

In [114]:
loaded_metrics

{u'average_num_of_labels': 1.1485630208446744,
 u'average_precision_macro': 0.6181398347172771,
 u'average_precision_micro': 0.6740106535881538,
 u'coverage_error': 3.347438198542335,
 u'f1_macro': 0.4453958787293348,
 u'f1_micro': 0.6465376531616135,
 u'f1_scores_array': [0.6598160048844598,
  0.5282436574152903,
  0.6680647596678131,
  0.0,
  0.0,
  0.3551263915369949,
  0.760959623844551,
  0.5909565924855693],
 u'precision_macro': 0.5022584534761925,
 u'precision_micro': 0.6678927795999445,
 u'precision_scores_array': [0.643975356024644,
  0.5218899823366137,
  0.750075346594334,
  0.0,
  0.0,
  0.5509016315237095,
  0.6441524294269122,
  0.9070728819033265],
 u'recall_macro': 0.43039851669590423,
 u'recall_micro': 0.6265058283139057,
 u'recall_scores_array': [0.6764556102529282,
  0.5347539429457899,
  0.6022201385318049,
  0.0,
  0.0,
  0.2620138857376231,
  0.9295125050037419,
  0.4382320510953461],
 u'topN_avg': 2.0154507532514394,
 u'topN_list': [5,
  1,
  1,
  1,
  2,
  1,
  

In [50]:
metrics_loaded = sc.pickleFile(get_metrics_output_name(method=method, data_type="validation", subset="sections")).collectAsMap()

ValueError: dictionary update sequence element #0 has length 23; 2 is required

In [49]:
metrics_loaded

['average_precision_micro',
 'f1_macro',
 'recall_macro',
 'precision_micro',
 'recall_micro',
 'average_precision_macro',
 'f1_micro',
 'precision_macro',
 'f1_scores_array',
 'coverage_error',
 'average_num_of_labels',
 'precision_scores_array',
 'recall_scores_array']

In [91]:
precision_scores = np.zeros((len(sections)))
for i in range(0,len(sections)):
    precision_scores[i] = sklearn.metrics.precision_score(y_true[:,i], y_binary_score[:,i])
metrics['precision_scores_array']precision_scores

array([ 0.64397536,  0.52188998,  0.75007535,  0.        ,  0.        ,
        0.55090163,  0.64415243,  0.90707288])

In [101]:
section_index = 5
print np.sum(y_binary_score[:, section_index])
print np.sum(y_true[:, section_index])

-1434134418.46
22037.0


In [92]:
e = Evaluator(y_true[:,1], y_binary_score[:,1])

2.0154507532514394

In [36]:
docu_index = 0

get_row_top_N(y_score[docu_index,:], y_true[docu_index,:])


[ -160.73221359   -63.53035615 -1389.43386875 -2086.39085259 -1708.9396494
  -257.07255332    -4.29960961   -15.17807309]
[ 0.  0.  0.  0.  0.  1.  0.  0.]
[6 7 1 0 5 2 4 3]


5

In [94]:
e.f1

0.5282436574152903

In [67]:
precision_scores

array([ 0.64397536,  0.52188998,  0.75007535,  0.        ,  0.        ,
        0.55090163,  0.64415243,  0.90707288])

In [64]:
precision_scores = np.zeros((len(sections),))

In [65]:
precision_scores

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [57]:
for i in xrange(0,len(sections)):
    print "lskdjfls"

lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls
lskdjfls


In [39]:
import sklearn.metrics

0.67401065358815382

In [42]:
sklearn.metrics.f1_score(y_true, y_binary_score, average='micro')

0.64653765316161349

In [28]:
def get_coverage_error(test_labeled_points, classifications, method):
    #test_labeled_points.cache()
    y_score = np.zeros(test_labeled_points.count(), len(classifications))
    y_true = np.zeros(test_labeled_points.count(), len(classifications))
    
    i = 0
    for classification in classifications:
        binarySvm = SVMModel.load(sc, get_model_name(method, classification))
        binarySvm.clearThreshold()
        predictions = test_labeled_points.map(lambda p: binarySvm.predict(p.features))
        labels = test_labeled_points.map(lambda p: p.labels)
        y_score[:][i] = predictions
        y_true[:][i] = labels
        i += 1
    return coverage_error(y_score, y_true)

## Testing

In [None]:
%%time
tf_doc_index_test = create_doc_index(tf_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)
sublinear_tf_doc_index_test = create_doc_index(sublinear_tf_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)
tf_id_doc_index_test = create_doc_index(tf_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)
bm25_doc_index_test = create_doc_index(bm25_postings, term_dictionary).filter(lambda (doc_id, postings): doc_id in validation_documents)

In [None]:
method = "bm25"
test_vectors = get_labeled_points_from_doc_index(bm25_doc_index_test, doc_classification_map, number_of_terms)
get_coverage_error(test_vectors, sections, method)