In [1]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import time
import pandas as pd
import pyspark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error
import sklearn.metrics
import cPickle as pickle

from thesis.utils.metrics import *

In [48]:
IS_SAMPLE = False

In [2]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_SIZE = 3
MIN_DOCUMENTS = 5
TOP_N_FEATURES = 10000

SVM_ITERATIONS = 100
SVM_CONVERGENCE = 0.001
SVM_REG = 0.001

BM25_K = 1.5  # controls power of tf component
BM25_b = 0.75  # controls the BM25 length normalization

RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

### Text Manipulation functions

In [3]:
def simple_tokenizer(text, doc_id):
    """ 
    Get clean stems out of a text where number, chemical, currency indicators have already been identified.
    A list of clean stems are returned 
    """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    tokens = tokenizer.tokenize(text)
    stems = []  # result
    previous_unigram = None
    for token in tokens:
        stem = token.lower()
        stem = stem.strip(string.punctuation)
        if stem:
            if is_stopword(stem):
                stem = None
            if stem and len(stem) >= MIN_SIZE:
                # extract uni-grams
                stems.append((stem,{doc_id: 1}))
                # extract bi-grams
                if previous_unigram: stems.append((previous_unigram + " " + stem,{doc_id: 1}))
                previous_unigram = stem
    del tokens
    return stems

def is_stopword(word):
    return word in STOP_WORDS

### Training functions

In [5]:
def merge_postings(postings_list1, postings_list2):
    # key could be either a doc id or a term
    for key in postings_list2:
        if postings_list1.get(key):
            postings_list1[key] += postings_list2[key]
        else:
            postings_list1[key] = postings_list2[key]
    return postings_list1

def get_term_dictionary(terms):
    """
    Maps string terms to indexes in an array
    """
    term_dictionary = {}
    term_array = [None] * len(terms)
    def put(key):
        hashvalue = hashfunction(key, len(term_array))
        if term_array[hashvalue] == None:
            term_array[hashvalue] = key
            return hashvalue
        else:
            nextslot = rehash(hashvalue, len(term_array))
            while term_array[nextslot] != None:
                nextslot = rehash(nextslot, len(term_array))
            if term_array[nextslot] == None:
                term_array[nextslot] = key
                return nextslot
    def hashfunction(key, size):
        return hash(key) % size
    def rehash(oldhash, size):
        return (oldhash + 1) % size
    i = 0
    for term in terms:
        corresponding_index = put(term)
        term_dictionary[term] = corresponding_index
        i+=1
        if i%10000 == 0: print "finished " + str(i)
    return term_dictionary

def jsonKV2str(x):
    """
    Change string keys to int
    """
    if isinstance(x, dict):
            #return {doc_id:{int(term_id):x[doc_id][term_id] for term_id in x[doc_id]} for doc_id in x }
        
            return {int(k):(int(v) if isinstance(v, unicode) else v) for k,v in x.items()}
    return x

def get_json(json_postings):
    return json.loads(json_postings)

def get_json_convert_num(json_postings):
    return json.loads(json_postings, object_hook=jsonKV2str)

def get_doc_index(term, postings_list, term_dictionary):
    #return [(doc_id, {term: postings_list[doc_id]}) for doc_id in postings_list]
    return [(doc_id, {term_dictionary[term]: postings_list[doc_id]}) for doc_id in postings_list]


def calculate_sublinear_tf(tf):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return math.log10(1 + tf)


def calculate_tf_idf(tf, df, N):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return tf * math.log10((N+1) / (df + 1))


def calculate_sublinear_tf_idf(tf, df, N):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return calculate_sublinear_tf(tf) * math.log10((N+1) / (df + 1))


def calculate_bm25(tf, df, N, d_len, d_avg):
    idf = max(0, math.log10((N-df + 0.5)/(df+0.5))) # in rare cases where the df is over 50% of N, this could become -ve, so we guard against that
    tf_comp = float(((BM25_K + 1) * tf)) / ( BM25_K * ((1-BM25_b) + BM25_b*(float(d_len)/d_avg)) + tf)
    return tf_comp * idf


def calculate_rf(df_relevant, df_non_relevant):
    return math.log( (2 + (float(df_relevant)/max(1, df_non_relevant))), 2)


def calculate_tf_rf(tf, df_relevant, df_non_relevant):
    return tf * calculate_rf(df_relevant, df_non_relevant)


def compare_classifications(x,y):
    len_comp = cmp(len(x), len(y))
    if len_comp == 0:
        return cmp(x,y)
    return len_comp


def create_doc_index(term_index, term_dictionary):
    return term_index \
        .flatMap(lambda (term, postings_list): get_doc_index(term, postings_list, term_dictionary)) \
        .reduceByKey(lambda x, y: merge_postings(x, y))


def get_rf_stats(postings, classification):
    a_plus_c = set(postings.keys())
    a_plus_b = set(classifications_index[classification])
    # first intersection is to get (a), second difference is to get (c) (checkout tf-rf paper for reference)
    a = a_plus_c.intersection(a_plus_b)
    c = a_plus_c.difference(a_plus_b)
    size_a = len(a)
    size_c = len(c)
    return size_a, size_c


def get_rf_postings(classification):
    def get_rf_postings_internal(postings):
        size_a, size_c = get_rf_stats(postings, classification)
        return {docId: calculate_rf(size_a, size_c)
                for docId, tf in postings.items()}
    return get_rf_postings_internal


def get_tf_rf_postings(classification):
    def get_tf_rf_postings_internal(postings):
        size_a, size_c = get_rf_stats(postings, classification)
        return {docId: calculate_tf_rf(tf, size_a, size_c)
                for docId, tf in postings.items()}
    return get_tf_rf_postings_internal


get_binary = lambda x: 1 if x > 0 else 0
get_binary = np.vectorize(get_binary)

def get_row_top_N(y_score_row, y_true_row):
    desc_score_indices = np.argsort(y_score_row)[::-1]
    # print y_score_row
    # print y_true_row
    true_indices = np.where(y_true_row ==1)[0]
    # print desc_score_indices
    found = 0
    top_N = 0
    for i, score in enumerate(desc_score_indices):
        if score in true_indices:
            found += 1
            if found == len(true_indices):
                top_N = i + 1
    # print top_N
    return top_N


### Input/Output directories

In [6]:
SAMPLE_RATIO = 0.15

In [37]:
#sc = SparkContext("", "Generate Inverted Index Job")
es_server = "deka.cip.ifi.lmu.de"
es_port = "9200"

original_parent_save_location = "hdfs://deka.cip.ifi.lmu.de/extended_pv/"
save_parent_location = original_parent_save_location

root_location = "/big/s/shalaby/"
exports_location = root_location + "exported_data/"


docs_output = save_parent_location + "docs_output"
postings_list_output = save_parent_location + "postings_list_full.json"

accepted_terms_list_output = original_parent_save_location + "accepted_terms_list_{}.pkl"
accepted_terms_with_scores_list_output = original_parent_save_location + "accepted_terms_with_scores_list_{}.pkl"
postings_list_chi_selected_output = original_parent_save_location + "postings_list_{}.json"
term_df_map_output = original_parent_save_location + "term_df_map_output_{}.json"
doc_index_chi_selected_output = original_parent_save_location + "doc_index_for_postings_{}.json"
term_dictionary_output = original_parent_save_location + "term_dictionary_{}.pkl"


postings_list_training_chi_selected_output = save_parent_location + "training_postings_list_{}.json"
postings_list_validation_chi_selected_output = save_parent_location + "validation_postings_list_{}.json"
postings_list_test_chi_selected_output = save_parent_location + "test_postings_list_{}.json"


# data location
preprocessed_docs_location = root_location + "preprocessed_data/" + "extended_pv_docs_only_for_spark"


# Classification objects, unrelated to sample size
classifications_index_output = exports_location + "extended_pv_classifications_index.pkl"
doc_classifications_map_file = exports_location + "extended_pv_doc_classification_map.pkl"
sections_file = exports_location + "sections.pkl"
classes_file = exports_location + "classes.pkl"
subclasses_file = exports_location + "subclasses.pkl"
valid_classes_file = exports_location + "valid_classes.pkl"
valid_subclasses_file = exports_location + "valid_subclasses.pkl"
classifications_output = exports_location + "classifications.pkl"
doc_lengths_map_output = exports_location + "extended_pv_doc_lengths_map.pkl"
training_docs_list_file = exports_location + "extended_pv_training_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
validation_docs_list_file = exports_location + "extended_pv_validation_docs_list_" + str(SAMPLE_RATIO) + ".pkl"
test_docs_list_file = exports_location + "extended_pv_test_docs_list_" + str(SAMPLE_RATIO) + ".pkl"


training_predictions_sections_output = save_parent_location + "training_predictions_sections_list.pkl"
training_labels_sections_list_output = save_parent_location + "training_labels_sections_list.pkl"
valdiation_predictions_sections_output = save_parent_location + "validation_predictions_sections_list.pkl"
validation_labels_sections_list_output = save_parent_location + "validation_labels_sections_list.pkl"


test_postings_list_output = save_parent_location + "test_postings_list_50000.json"
training_errors_output = save_parent_location + "training_errors.json"
model_output = save_parent_location + "models/" + "iter_" + str(SVM_ITERATIONS) + "_reg_" + str(SVM_REG) + "/"

In [64]:
def get_data_output_name(method, no_of_features=TOP_N_FEATURES, data_type="training"):
    return save_parent_location + "models/" + data_type + "_data/" + method  + "_data.json"
def get_save_location(location, sample=False):
    if sample:
        return location.replace(save_parent_location, sample_save_parent_location)
    return location

In [10]:
%%time
doc_classification_map = pickle.load(open(doc_classifications_map_file))
sections = pickle.load(open(sections_file))
classes = pickle.load(open(classes_file))
subclasses = pickle.load(open(subclasses_file))
valid_classes = pickle.load(open(valid_classes_file))
valid_subclasses = pickle.load(open(valid_subclasses_file))
training_docs_list = pickle.load(open(training_docs_list_file))
validation_docs_list = pickle.load(open(validation_docs_list_file))
test_docs_list = pickle.load(open(test_docs_list_file))
classifications_index = pickle.load(open(classifications_index_output))

CPU times: user 7.75 s, sys: 348 ms, total: 8.1 s
Wall time: 8.11 s


In [11]:
extended_pv_docs = set(training_docs_list) | set(validation_docs_list) | set(test_docs_list)
doc_count = len(extended_pv_docs)

395509


In [14]:
# accelerates the chi squared calculation a lot
classifications_index_set = {k:set(docs) for k,docs in classifications_index.iteritems()}

In [13]:
doc_count

395509

In [32]:
classifications_index.items()[0]

(u'G-20-B', [u'07433566', u'07896523', u'07218441', u'07116477', u'06985663'])

In [33]:
doc_classification_map.items()[10]

(u'07390846', [u'C', u'C-08', u'C-08-K', u'C-08-L'])

In [50]:
sections

[u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H']

In [35]:
len(classes)

387

## (Once only) Create the doc classification and classifications index for extended pv docs

In [27]:
extended_pv_doc_classification_map = {}
for doc_id in extended_pv_docs:
    extended_pv_doc_classification_map[doc_id] = doc_classification_map[doc_id]
pickle.dump(extended_pv_doc_classification_map, open(exports_location + "extended_pv_doc_classification_map.pkl", "w"))

extended_pv_classifications_index = {}
for classf in classifications_index:
    classf_docs_set = set(classifications_index[classf])
    valid_extended_pv_docs = classf_docs_set & extended_pv_docs
    extended_pv_classifications_index[classf] = list(valid_extended_pv_docs)
pickle.dump(extended_pv_classifications_index, open(exports_location + "extended_pv_classifications_index.pkl", "w"))

## Loading document texts from HDFS

In [38]:
doc_text_objs = sc.textFile(preprocessed_docs_location).map(lambda line: line.split(" ", 1))

### Create Postings List

In [39]:
%%time
# Create Postings List
postings_lists = doc_text_objs.flatMap(lambda (doc_id, doc): simple_tokenizer(doc, doc_id)).reduceByKey(lambda x,y: merge_postings(x,y))
# the second condition is made specifically for num_indic, as it usually is in all docs and that messes up chi
min_doc_postings_lists = postings_lists.filter(lambda (x,y): len(y) > MIN_DOCUMENTS and len(y) < doc_count)
#number_of_terms = min_doc_postings_lists.count()

# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_output)

CPU times: user 880 ms, sys: 236 ms, total: 1.12 s
Wall time: 2h 15min 42s


### Get Document Lengths

In [18]:
doc_lengths_dict = doc_text_objs.map(lambda (doc_id, document_text): (doc_id, len(document_text))).collectAsMap()

In [19]:
avg_doc_length = sum(doc_lengths_dict.values())/len(doc_lengths_dict)

In [21]:
doc_lengths_dict.items()[0]

(u'08369259', 85861)

In [24]:
avg_doc_length

46477

### Save Document Lengths

In [27]:
pickle.dump(doc_lengths_dict, open(doc_lengths_map_output, "w"))

### Save Postings List

In [None]:
%%time
# Save Postings List
# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_output)

### Load Postings List

In [12]:
# Load Postings Lists
min_doc_postings_lists = sc.textFile(postings_list_output).map(lambda json_postings: json.loads(json_postings))

In [40]:
def get_chi_index(term_index, classifications_index_set, subclasses, number_of_docs):
    return term_index.map(lambda (term, postings_list): (term, calculate_chi_squared(postings_list.keys(), classifications_index_set, subclasses, number_of_docs)))

def calculate_chi_squared(document_list, classifications_index_set, subclasses, number_of_docs):
    """
    Chi squared is the ratio of the difference between actual frequency and expected frequency of a term relative to the expected frequency
    summed up across all classes and whether the term appears or not
    Here we calculate the average chi squared score which is one of two options in multi-label classification (the other being max)
    """
    chi_score = 0
    N = len(document_list)
    doc_set = set(document_list)
    Nt1 = N # actual collection frequency of having the word
    Nt0 = number_of_docs - N # actual collection frequency of not having the word
    Pt1 = float(N)/ number_of_docs # probability of the term happening
    Pt0 = float(number_of_docs - N)/ number_of_docs # probablility of the term not happening
    #print "Docs Stats: Term present in %d (%.7f), Not Present in %d (%.7f) " % (Nt1, Pt1, Nt0, Pt0)
    for subclass in subclasses:
        # this condition is only required for when using a sample because some subclasses may not have docs
        if len(classifications_index_set[subclass]) > 0:
            Pc1 = float(len(classifications_index_set[subclass]))/ number_of_docs # probability of the class happening
            Pc0 = 1 - Pc1
            Pt1c1 = float(len(doc_set & classifications_index_set[subclass])) / number_of_docs
            Pt1c0 = Pt1 - Pt1c1
            Pt0c1 = Pc1 - Pt1c1
            Pt0c0 = 1 - Pt1c0 - Pt0c1 - Pt1c1

            cat_chi_score = (number_of_docs * math.pow(Pt1c1 * Pt0c0 - Pt1c0 * Pt0c1, 2))/(Pt1 * Pt0 * Pc1 * Pc0)
            # calculate average chi score
            chi_score += Pc1 * cat_chi_score
            #print "subclass %s: %.7f, %.7f, %.7f, %.7f, %.7f, %.7f" % (subclass, Pc1, Pt1c1, Pt1c0, Pt0c1, Pt0c0, chi_score)
    return chi_score

In [23]:
for subclass in subclasses:
    if len(classifications_index_set[subclass]) == 0:
        print subclass

In [22]:
min_doc_postings_lists.count()

16714599

In [29]:
postings_length_dict = min_doc_postings_lists.map(lambda (term, postings_list): (term, len(postings_list))).collectAsMap()

In [31]:
np.max(postings_length_dict.values())

395509

In [32]:
postings_length_dict.keys()[395509]

u'disease inclusion'

In [33]:
postings_length_dict['disease inclusion']

10

In [34]:
max(postings_length_dict.items(), key= lambda x: x[1])

(u'num_indic', 395509)

In [35]:
sorted_postings_items = sorted(postings_length_dict.items(), key= lambda x: x[1], reverse=True)

In [None]:
sorted_postings_items[:10]

In [None]:
# min_doc_postings_lists = sc.parallelize(min_doc_postings_lists.take(10000))

# term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)


### Order by Chi Squared and get Top features

In [41]:
%%time
term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index_set, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)
term_accepted_chi_list = map(lambda (x,y): x, term_accepted_chi_list_with_scores)
# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)
min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()
number_of_terms = min_doc_postings_lists.count()
term_df_map = min_doc_postings_lists.map(lambda (term, postings): (term, len(postings))).collectAsMap()

# Save Postings List and the supporting objects
# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list).saveAsPickleFile(accepted_terms_list_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list_with_scores).saveAsPickleFile(accepted_terms_with_scores_list_output.format(str(TOP_N_FEATURES)))

finished 10000
CPU times: user 45.7 s, sys: 4.08 s, total: 49.7 s
Wall time: 2h 56min 58s


In [43]:
#min_doc_postings_lists.map(lambda postings: json.dumps(postings)).repartition(100).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
#term_df_map = min_doc_postings_lists.map(lambda (term, postings): (term, len(postings))).collectAsMap()
sc.parallelize(term_dictionary.items()).repartition(1).saveAsPickleFile(term_dictionary_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_df_map.items()).saveAsPickleFile(term_df_map_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list).repartition(1).saveAsPickleFile(accepted_terms_list_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_accepted_chi_list_with_scores).repartition(1).saveAsPickleFile(accepted_terms_with_scores_list_output.format(str(TOP_N_FEATURES)))

In [42]:
term_accepted_chi_list_with_scores[:10]

[(u'pharmaceutically', 15411.494644150624),
 (u'pharmaceutically acceptable', 15121.398337111134),
 (u'pharmaceutical', 13925.085275880954),
 (u'server', 13792.377376291814),
 (u'administered', 13558.053009137388),
 (u'network', 13110.796900314415),
 (u'protein', 12995.0009762267),
 (u'pharmaceutical composition', 12956.506131944068),
 (u'administering', 12904.26814543066),
 (u'oral', 12873.917238804137)]

In [30]:
sc.parallelize(term_accepted_chi_list_with_scores).saveAsPickleFile(accepted_terms_with_scores_list_output)

In [43]:
term_accepted_chi_list[:100]

[u'pharmaceutically',
 u'pharmaceutically acceptable',
 u'pharmaceutical',
 u'server',
 u'administered',
 u'network',
 u'protein',
 u'pharmaceutical composition',
 u'administering',
 u'oral',
 u'assay',
 u'therapeutic',
 u'effective amount',
 u'purified',
 u'computer',
 u'vitro',
 u'pharmaceutical compositions',
 u'amino',
 u'disease',
 u'executed',
 u'vivo',
 u'dosage',
 u'memory',
 u'acids',
 u'internet',
 u'incubated',
 u'hardware',
 u'acid',
 u'administration',
 u'request',
 u'proteins',
 u'diseases',
 u'gene',
 u'culture',
 u'therapeutically',
 u'processor',
 u'parenteral',
 u'serum',
 u'program',
 u'excipients',
 u'capsules',
 u'software',
 u'sodium',
 u'acceptable carrier',
 u'recombinant',
 u'phosphate',
 u'intravenous',
 u'enzyme',
 u'drug',
 u'amino acid',
 u'dose',
 u'chromatography',
 u'dna',
 u'peptide',
 u'37\xb0',
 u'acceptable salts',
 u'inhibition',
 u'washed',
 u'biol',
 u'store',
 u'receptor',
 u'amino acids',
 u'assays',
 u'animal',
 u'oral administration',
 u'accep

#### Recreate term dictionary with just the accepted terms

In [36]:
# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)

finished 10000


In [37]:
min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()

In [22]:
number_of_terms = min_doc_postings_lists.count()
number_of_terms

100000

#### Save Reduced Postings List

In [50]:
# Save Postings List
## min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
#sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output)
#sc.parallelize(term_accepted_chi_list).saveAsPickleFile(accepted_terms_list_output)

#### Load Reduced Postings List

In [11]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)).map(lambda json_postings: json.loads(json_postings)).cache()
term_dictionary = dict(sc.pickleFile(term_dictionary_output).collect())
number_of_terms = min_doc_postings_lists.count()

SyntaxError: invalid syntax (<ipython-input-11-89dbcbd1ab08>, line 2)

#### Collect document lengths

In [45]:
all_doc_index = create_doc_index(min_doc_postings_lists, term_dictionary)

In [None]:
# need to collect the document lengths since they are used in the BM25 calculation
all_doc_index = create_doc_index(min_doc_postings_lists, term_dictionary)

doc_lengths_rdd = all_doc_index.mapValues(lambda postings_dictionary: reduce(lambda x, term: x + postings_dictionary[term], postings_dictionary, 0))
avg_doc_length = doc_lengths_rdd.map(lambda (term, count): count).reduce(lambda count1, count2: count1 + count2) / doc_count
doc_lengths_dict = doc_lengths_rdd.collectAsMap()

In [46]:
all_doc_index.map(lambda postings: json.dumps(postings)).saveAsTextFile(doc_index_chi_selected_output.format(str(TOP_N_FEATURES)))

Save Document Lengths

In [22]:
sc.parallelize(doc_lengths_dict.items()).saveAsPickleFile(doc_lengths_map_output)

In [None]:
all_doc_index.take(1)

In [None]:
# all_doc_index.saveAsPickleFile(doc_index_chi_selected_output)

Load Document Lengths

In [10]:
doc_lengths_dict = dict(sc.pickleFile(doc_lengths_map_output).collect())
avg_doc_length = sum(doc_lengths_dict.values())/len(doc_lengths_dict)

In [24]:
doc_lengths_dict.items()[0]

(u'08226314', 3466)

In [25]:
len(doc_lengths_dict)

2009750

### Load everything for training

In [13]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES))).map(lambda json_postings: json.loads(json_postings)).cache()
term_dictionary = dict(sc.pickleFile(term_dictionary_output.format(str(TOP_N_FEATURES))).collect())
term_df_map = dict(sc.pickleFile(term_df_map_output.format(str(TOP_N_FEATURES))).collect())
number_of_terms = len(term_df_map) # min_doc_postings_lists.count()
doc_lengths_dict = dict(sc.pickleFile(doc_lengths_map_output).collect())
avg_doc_length = sum(doc_lengths_dict.values())/len(doc_lengths_dict)
#all_doc_index = sc.textFile(doc_index_chi_selected_output.format(str(TOP_N_FEATURES))).map(lambda json_postings: json.loads(json_postings)).cache()

In [15]:
all_doc_index = all_doc_index.map(lambda (doc_id, postings): (doc_id, {int(key): postings[key] for key in postings})).cache()

### Get min_doc_postings_lists for the sample only

In [54]:
training_docs_set = set(training_docs_list)
validation_docs_set = set(validation_docs_list)
test_docs_set = set(test_docs_list)

In [67]:
training_documents = training_docs_list
validation_documents = validation_docs_list
test_documents = test_docs_list

In [137]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES))).map(lambda json_postings: json.loads(json_postings)).cache()
min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in training_docs_set or doc_id in validation_docs_set or doc_id in test_docs_set}))

In [39]:
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(get_save_location(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE))

#### Only training

In [49]:
training_postings_output = get_save_location(postings_list_training_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE)

Creating

In [56]:
training_min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in training_docs_set}))
training_min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(training_postings_output)

In [20]:
training_min_doc_postings_lists = sc.textFile(training_postings_output).map(get_json)

#### Only validation

In [58]:
validation_postings_output = get_save_location(postings_list_validation_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE)

Creating

In [59]:
validation_min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in validation_docs_set}))
validation_min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(validation_postings_output)

In [41]:
validation_min_doc_postings_lists = sc.textFile(validation_postings_output).map(get_json)

#### Only test

In [60]:
test_postings_output = get_save_location(postings_list_test_chi_selected_output.format(str(TOP_N_FEATURES)), sample=IS_SAMPLE)

Creating

In [61]:
test_min_doc_postings_lists = min_doc_postings_lists.map(lambda (term, postings): (term, {doc_id:postings[doc_id] for doc_id in postings if doc_id in test_docs_set}))
test_min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(test_postings_output)

In [17]:
test_min_doc_postings_lists = sc.textFile(test_postings_output).map(get_json)

### Start creating term weighting postings

In [62]:
def create_written_doc_index(term_index, name, data_type="training"):
    doc_index = create_doc_index(term_index, term_dictionary)
    output_name = get_data_output_name(name, data_type=data_type)
    doc_index.map(lambda postings: json.dumps(postings)).repartition(100).saveAsTextFile(output_name)
    doc_index = sc.textFile(output_name).map(get_json_convert_num)# .cache()
    return doc_index

def read_written_doc_index(name, data_type="training"):
    output_name = get_data_output_name(name, data_type=data_type)
    doc_index = sc.textFile(output_name).map(get_json_convert_num)
    return doc_index

#### Create Training Set

In [71]:
%%time

tf_postings = training_min_doc_postings_lists
# tf_doc_index_training = create_written_doc_index(tf_postings, "tf")

# sublinear_tf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_sublinear_tf(tf) for docId, tf in postings.items()})
# sublinear_tf_doc_index_training = create_written_doc_index(sublinear_tf_postings, "tf-sublinear")

# tf_idf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_tf_idf(tf, len(postings), len(training_documents)) for docId, tf in postings.items()})
# tf_idf_doc_index_training = create_written_doc_index(tf_idf_postings, "tf-idf")

sublinear_tf_idf_postings = tf_postings.mapValues(lambda postings: {docId:  calculate_sublinear_tf_idf(tf, len(postings), len(training_documents)) for docId, tf in postings.items()})
sublinear_tf_idf_doc_index_training = create_written_doc_index(sublinear_tf_idf_postings, "sublinear-tf-idf")

bm25_postings = tf_postings.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), len(training_documents), doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
bm25_doc_index_training = create_written_doc_index(bm25_postings, "bm25")

CPU times: user 12.4 s, sys: 832 ms, total: 13.2 s
Wall time: 33min 4s


Read Training Set

In [12]:
tf_doc_index_training = read_written_doc_index("tf")
sublinear_tf_doc_index_training = read_written_doc_index("tf-sublinear")
tf_idf_doc_index_training = read_written_doc_index("tf-idf")
sublinear_tf_idf_doc_index_training = read_written_doc_index("sublinear-tf-idf")
bm25_doc_index_training = read_written_doc_index("bm25")

#### Create Validation Set

In [74]:
%%time

tf_postings_validation = validation_min_doc_postings_lists
# tf_doc_index_validation = create_written_doc_index(tf_postings_validation, "tf", data_type="validation")

# sublinear_tf_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId:  calculate_sublinear_tf(tf) for docId, tf in postings.items()})
# sublinear_tf_doc_index_validation = create_written_doc_index(sublinear_tf_postings_validation, "tf-sublinear", data_type="validation")

# tf_idf_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId:  calculate_tf_idf(tf, len(postings), len(validation_documents)) for docId, tf in postings.items()})
# tf_idf_doc_index_validation = create_written_doc_index(tf_idf_postings_validation, "tf-idf", data_type="validation")

# sublinear_tf_idf_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId:  calculate_sublinear_tf_idf(tf, len(postings), len(validation_documents)) for docId, tf in postings.items()})
# sublinear_tf_idf_doc_index_validation = create_written_doc_index(sublinear_tf_idf_postings_validation, "sublinear-tf-idf", data_type="validation")

bm25_postings_validation = tf_postings_validation.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), len(validation_documents), doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
bm25_doc_index_validation = create_written_doc_index(bm25_postings_validation, "bm25", data_type="validation")

CPU times: user 6.38 s, sys: 420 ms, total: 6.8 s
Wall time: 13min 10s


Read Validation Set

In [23]:
tf_doc_index_validation = read_written_doc_index("tf", data_type="validation")
sublinear_tf_doc_index_validation = read_written_doc_index("tf-sublinear", data_type="validation")
tf_idf_doc_index_validation = read_written_doc_index("tf-idf", data_type="validation")
sublinear_tf_idf_doc_index_validation = read_written_doc_index("sublinear-tf-idf", data_type="validation")
bm25_doc_index_validation = read_written_doc_index("bm25", data_type="validation")

#### Create Testing Set

In [75]:
%%time

tf_postings_test = test_min_doc_postings_lists
tf_doc_index_test = create_written_doc_index(tf_postings_test, "tf", data_type="test")

sublinear_tf_postings_test = tf_postings_test.mapValues(lambda postings: {docId:  calculate_sublinear_tf(tf) for docId, tf in postings.items()})
sublinear_tf_doc_index_test = create_written_doc_index(sublinear_tf_postings_test, "tf-sublinear", data_type="test")

tf_idf_postings_test = tf_postings_test.mapValues(lambda postings: {docId:  calculate_tf_idf(tf, len(postings), len(test_documents)) for docId, tf in postings.items()})
tf_idf_doc_index_test = create_written_doc_index(tf_idf_postings_test, "tf-idf", data_type="test")

sublinear_tf_idf_postings_test = tf_postings_test.mapValues(lambda postings: {docId:  calculate_sublinear_tf_idf(tf, len(postings), len(test_documents)) for docId, tf in postings.items()})
sublinear_tf_idf_doc_index_test = create_written_doc_index(sublinear_tf_idf_postings_test, "sublinear-tf-idf", data_type="test")

bm25_postings_test = tf_postings_test.mapValues(lambda postings: {docId: calculate_bm25(tf, len(postings), len(test_documents), doc_lengths_dict[docId], avg_doc_length) for docId, tf in postings.items()})
bm25_doc_index_test = create_written_doc_index(bm25_postings_test, "bm25", data_type="test")

CPU times: user 11.3 s, sys: 884 ms, total: 12.2 s
Wall time: 1h 4min 45s


In [None]:
tf_doc_index_test = read_written_doc_index("tf", data_type="test")
sublinear_tf_doc_index_test = read_written_doc_index("tf-sublinear", data_type="test")
tf_idf_doc_index_test = read_written_doc_index("tf-idf", data_type="test")
sublinear_tf_idf_doc_index_test = read_written_doc_index("sublinear-tf-idf", data_type="test")
bm25_doc_index_test = read_written_doc_index("bm25", data_type="test")

In [71]:
# def jsonKV2str(x):
#     if isinstance(x, dict):
#             return {int(k):(int(v) if isinstance(v, unicode) else v) for k,v in x.items()}
#     return x

# output_namee = "hdfs://deka.cip.ifi.lmu.de/svm/new/lskd4.json"
# dd = {"232323":{3:2},"oooidii": {3:4}}
# #sc.parallelize(dd.items()).take(1)
# #sc.parallelize(dd.items()).map(lambda postings: json.dumps(postings)).saveAsTextFile(output_namee)
# sc.parallelize(dd.items()).map(lambda postings: json.dumps(postings)).take(1)
# sc.parallelize(dd.items()).map(lambda postings: json.dumps(postings)).map(lambda postings: json.loads(postings, object_hook=jsonKV2str)).collect()

#map(json.dumps,dd.items() )

[[u'oooidii', {3: 4}], [u'232323', {3: 2}]]

In [None]:
tf_idf_doc_index_validation.take(1)

#### Create Validation Set

In [17]:
%%time
tf_doc_index_val = all_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()
sublinear_tf_doc_index_val = sublinear_tf_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()
tf_id_doc_index_val = tf_id_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()
bm25_doc_index_val = bm25_doc_index.filter(lambda (doc_id, postings): doc_id in validation_documents).cache()

CPU times: user 11.2 s, sys: 332 ms, total: 11.5 s
Wall time: 12.5 s


## Actual Training