In [2]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import time
import pandas as pd
import pyspark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error

In [3]:
IS_SAMPLE = False

In [4]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_SIZE = 3
MIN_DOCUMENTS = 5
TOP_N_FEATURES = 10000

TEST_SET_PERCENTAGE = 0.2
VALIDATION_IN_TRAINING_PERCENTAGE = 0.2
MIN_DOCUMENTS_FOR_TEST = 1
MIN_DOCUMENTS_FOR_VALIDATION = 1

MIN_DOCUMENTS_FOR_TRAINING_SAMPLE = 10
MIN_DOCUMENTS_FOR_TEST_SAMPLE = 1
MIN_DOCUMENTS_FOR_VALIDATION_SAMPLE = 1

SVM_ITERATIONS = 1000
SVM_CONVERGENCE = 0.1
SVM_REG = 0.01

BM25_K = 1.5  # controls power of tf component
BM25_b = 0.75  # controls the BM25 length normalization

RANDOM_SEED = 10000

stemmer = nltk.stem.porter.PorterStemmer().stem

### Text Manipulation functions

In [5]:
def stemtokenizer(text, doc_id):
    """ MAIN FUNCTION to get clean stems out of a text. A list of clean stems are returned """
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    tokens = tokenizer.tokenize(text)
    stems = []  # result
    previous_unigram = None
    for token in tokens:
        stem = token.lower()
        stem = stem.strip(string.punctuation)
        if stem:
            if is_number(stem):
                stem = NUMBER_INDICATOR
            elif is_currency(stem):
                stem = CURRENCY_INDICATOR
            elif is_chemical(stem):
                stem = CHEMICAL_INDICATOR
            elif is_stopword(stem):
                stem = None
            else:
                stem = stemmer(token)
                stem = stem.strip(string.punctuation)
            if stem and len(stem) >= MIN_SIZE:
                # extract uni-grams
                stems.append((stem,{doc_id: 1}))
                # extract bi-grams
                if previous_unigram: stems.append((previous_unigram + " " + stem,{doc_id: 1}))
                previous_unigram = stem
    del tokens
    return stems

def is_stopword(word):
  return word in STOP_WORDS

def is_number(str):
    """ Returns true if given string is a number (float or int)"""
    try:
        float(str.replace(",", ""))
        return True
    except ValueError:
        return False

def is_currency(str):
    return str[0] == "$"

def is_chemical(str):
    return str.count("-") > 3

### Training functions

In [6]:
def merge_postings(postings_list1, postings_list2):
    # key could be either a doc id or a term
    for key in postings_list2:
        if postings_list1.get(key):
            postings_list1[key] += postings_list2[key]
        else:
            postings_list1[key] = postings_list2[key]
    return postings_list1

def get_term_dictionary(terms):
    """
    Maps string terms to indexes in an array
    """
    term_dictionary = {}
    term_array = [None] * len(terms)
    def put(key):
        hashvalue = hashfunction(key, len(term_array))
        if term_array[hashvalue] == None:
            term_array[hashvalue] = key
            return hashvalue
        else:
            nextslot = rehash(hashvalue, len(term_array))
            while term_array[nextslot] != None:
                nextslot = rehash(nextslot, len(term_array))
            if term_array[nextslot] == None:
                term_array[nextslot] = key
                return nextslot
    def hashfunction(key, size):
        return hash(key) % size
    def rehash(oldhash, size):
        return (oldhash + 1) % size
    i = 0
    for term in terms:
        corresponding_index = put(term)
        term_dictionary[term] = corresponding_index
        i+=1
        if i%10000 == 0: print "finished " + str(i)
    return term_dictionary

def get_doc_index(term, postings_list, term_dictionary):
    #return [(doc_id, {term: postings_list[doc_id]}) for doc_id in postings_list]
    return [(doc_id, {term_dictionary[term]: postings_list[doc_id]}) for doc_id in postings_list]

def get_classes(ipc_classification):
    sections = []
    classes = []
    subclasses = []
    for classification in ipc_classification:
        # we do the check because some documents have repetitions
        section_name = classification['section']
        class_name = classification['section'] + "-" + classification['class']
        subclass_name = classification['section'] + "-" + classification['class'] + "-" + classification['subclass']
        if section_name not in sections:
            sections.append(section_name)
        if class_name not in classes:
            classes.append(class_name)
        if subclass_name not in subclasses:
            subclasses.append(subclass_name)
    return {"sections": sections, "classes": classes, "subclasses": subclasses}


def get_training_vector_old(classification, term_list, classifications, classification_key_name, number_of_terms):
    clss = 1 if classification in classifications[classification_key_name] else 0
    return LabeledPoint(clss, SparseVector(number_of_terms, term_list))

def get_training_vector(classification, term_list, classifications, number_of_terms):
    clss = 1 if classification in classifications else 0
    return LabeledPoint(clss, SparseVector(number_of_terms, term_list))


def calculate_sublinear_tf(tf):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return math.log10(1 + tf)


def calculate_tf_idf(tf, df, N):
    # laplace smoothing with +1 in case of term with no documents (useful during testing)
    return tf * math.log10((N+1) / (df + 1))


def calculate_bm25(tf, df, N, d_len, d_avg):
    idf = max(0, math.log10((N-df + 0.5)/(df+0.5))) # in rare cases where the df is over 50% of N, this could become -ve, so we guard against that
    tf_comp = float(((BM25_K + 1) * tf)) / ( BM25_K * ((1-BM25_b) + BM25_b*(float(d_len)/d_avg)) + tf)
    return tf_comp * idf


def calculate_rf(df_relevant, df_non_relevant):
    return math.log( (2 + (float(df_relevant)/max(1, df_non_relevant))), 2)


def calculate_tf_rf(tf, df_relevant, df_non_relevant):
    return tf * calculate_rf(df_relevant, df_non_relevant)


def compare_classifications(x,y):
    len_comp = cmp(len(x), len(y))
    if len_comp == 0:
        return cmp(x,y)
    return len_comp


def create_doc_index(term_index, term_dictionary):
    return term_index \
        .flatMap(lambda (term, postings_list): get_doc_index(term, postings_list, term_dictionary)) \
        .reduceByKey(lambda x, y: merge_postings(x, y))


def get_rf_stats(postings, classification):
    a_plus_c = set(postings.keys())
    a_plus_b = set(classifications_index[classification])
    # first intersection is to get (a), second difference is to get (c) (checkout tf-rf paper for reference)
    a = a_plus_c.intersection(a_plus_b)
    c = a_plus_c.difference(a_plus_b)
    size_a = len(a)
    size_c = len(c)
    return size_a, size_c


def get_rf_postings(classification):
    def get_rf_postings_internal(postings):
        size_a, size_c = get_rf_stats(postings, classification)
        return {docId: calculate_rf(size_a, size_c)
                for docId, tf in postings.items()}
    return get_rf_postings_internal


def get_tf_rf_postings(classification):
    def get_tf_rf_postings_internal(postings):
        size_a, size_c = get_rf_stats(postings, classification)
        return {docId: calculate_tf_rf(tf, size_a, size_c)
                for docId, tf in postings.items()}
    return get_tf_rf_postings_internal


def train_level_old(docs_with_classes, classification, classification_label):
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector_old(classification, term_list, classifications,
                                                                           classification_label, number_of_terms))
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE)
    return training_vectors, svm


def train_level(docs_with_classes, classification, number_of_terms):
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(classification, term_list,
                                                                           classifications, number_of_terms))
    svm = SVMWithSGD.train(training_vectors, iterations=SVM_ITERATIONS, convergenceTol=SVM_CONVERGENCE, regParam=SVM_REG)
    return training_vectors, svm


def get_error(svm, test_vectors):
    labelsAndPreds = test_vectors.map(lambda p: (p.label, svm.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test_vectors.count())
    return trainErr


def train_all(docs_with_classes):
    training_errors = {}
    for section in sections:
        training_vectors, svm = train_level(docs_with_classes, section, "sections")
        train_err = get_error(svm, training_vectors)
        training_errors[section] = train_err
    #
    with open(training_errors_output, 'w') as file:
        file.write(json.dumps(training_errors))
    #
    for clss in classes:
        training_vectors, svm = train_level(docs_with_classes, clss, "classes")
        train_err = get_error(svm, training_vectors)
        training_errors[clss] = train_err
    
    with open(training_errors_output, 'w') as file:
        file.write(json.dumps(training_errors))
    
    for subclass in subclasses:
        training_vectors, svm = train_level(docs_with_classes, subclass, "subclasses")
        train_err = get_error(svm, training_vectors)
        training_errors[subclass] = train_err
    return training_errors


def get_labeled_points_from_doc_index(doc_index, doc_classification_map, number_of_terms):
    docs_with_classes = doc_index.map(lambda (doc_id, terms): (doc_id, (terms, doc_classification_map[doc_id])))
    training_vectors = docs_with_classes.map(
        lambda (doc_id, (term_list, classifications)): get_training_vector(classification, term_list,
                                                                           classifications, number_of_terms))
    return training_vectors


class Evaluator:
    
    def __init__(self, labels, scores, threshold=0.5):
        self.threshold = 0
        self.count = len(self.labels)
        
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0
        
        for (l,s) in zip(labels,scores):
            if self.is_true(l) and self.is_true(s):
                self.tp += 1
            if self.is_true(l) and not self.is_true(s):
                self.fn += 1
            if not self.is_true(l) and self.is_true(s):
                self.fp += 1
            if not self.is_true(l) and not self.is_true(s):
                self.tn += 1
        self.precision = self.get_precision()
        self.recall = self.get_precision()
        self.fa = self.get_f1()
        self.error_rate = self.get_error_rate()
        
    def calculate_contingency(self, label, contingency):
        
        self.tp = 0
        self.fp = 0
        self.fn = 0
        self.tn = 0
        
        for (l,s) in zip(labels,scores):
            if self.is_true(l) and self.is_true(s):
                self.tp += 1
            if self.is_true(l) and not self.is_true(s):
                self.fn += 1
            if not self.is_true(l) and self.is_true(s):
                self.fp += 1
            if not self.is_true(l) and not self.is_true(s):
                self.tn += 1
    
    def is_true(self, label):
        return label > self.threshold
    
    def get_error_rate(self):
        return float(self.tp + self.tn) / len(labels)
    
    def get_precision(self):
        # self.calculate_contingency()
        if self.tp == 0: return 0
        return float(self.tp) / (self.tp + self.fp)
        
    def get_recall(self):
        # self.calculate_contingency()
        if self.tp == 0: return 0
        return float(self.tp) / (self.tp + self.fn)
    
    def get_f1(self):
        return 2 * (self.get_precision() * self.get_recall()) / (self.get_precision() + self.get_recall())

### Input/Output directories

In [7]:
#sc = SparkContext("", "Generate Inverted Index Job")
es_server = "deka.cip.ifi.lmu.de"
es_port = "9200"

save_parent_location = "hdfs://deka.cip.ifi.lmu.de/svm/new/"
if IS_SAMPLE: 
    save_parent_location = save_parent_location + "sample/"

file_name = "sample.json"
test_file_name = "sample.json"
#url = "/media/Work/workspace/thesis/benchmark/output/" + file_name
sample_location = save_parent_location + file_name
sample_test_location = save_parent_location + test_file_name
docs_output = save_parent_location + "docs_output"
postings_list_output = save_parent_location + "postings_list_full.json"
accepted_terms_list_output = save_parent_location + "accepted_terms_list.pkl"
postings_list_chi_selected_output = save_parent_location + "postings_list_{}.json"
classification_index_output = save_parent_location + "classification_index.pkl"
doc_classification_map_output = save_parent_location + "doc_classification_map.pkl"
sections_output = save_parent_location + "sections.pkl"
classes_output = save_parent_location + "classes.pkl"
subclasses_output = save_parent_location + "subclasses.pkl"
classifications_output = save_parent_location + "classifications.pkl"
doc_lengths_map_output = save_parent_location + "doc_lengths_map.pkl"
term_dictionary_output = save_parent_location + "term_dictionary.pkl"
# training, validation and test set lists
training_docs_list_output = save_parent_location + "training_docs_list.pkl"
validation_docs_list_output = save_parent_location + "validation_docs_list.pkl"
test_docs_list_output = save_parent_location + "test_docs_list.pkl"
test_postings_list_output = save_parent_location + "test_postings_list_50000.json"
training_errors_output = save_parent_location + "training_errors.json"
model_output = save_parent_location + "models/" + "iter_" + str(SVM_ITERATIONS) + "_reg_" + str(SVM_REG) + "/"

In [8]:
def get_model_name(method, classification, reg=SVM_REG, no_of_features=TOP_N_FEATURES, iterations=SVM_ITERATIONS):
    return save_parent_location + "models/" + "iter_" + str(iterations) + "_reg_" + str(reg) + "/" + method + "_" + classification + "_model.svm"

#### Load Classification Objects

In [9]:
doc_classification_map = dict(sc.pickleFile(doc_classification_map_output).collect())
doc_count = len(doc_classification_map)
classifications_index = dict(sc.pickleFile(classification_index_output).collect())
sections = sc.pickleFile(sections_output).collect()
classes = sc.pickleFile(classes_output).collect()
subclasses = sc.pickleFile(subclasses_output).collect()
classifications = sc.pickleFile(classifications_output).collect()

In [49]:
classifications_index_set = {k:set(docs) for k,docs in classifications_index.iteritems()}

In [10]:
doc_count

2009750

In [16]:
classifications_index.items()[0]

(u'G-20-B', [u'07433566', u'07896523', u'06985663', u'07116477', u'07218441'])

In [50]:
classifications_index_set.items()[0]

(u'G-20-B', {u'06985663', u'07116477', u'07218441', u'07433566', u'07896523'})

In [17]:
doc_classification_map.items()[10]

(u'07007598', [u'B', u'B-30', u'B-30-B'])

In [18]:
sections

[u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H']

#### Load the training, validation and test document lists

In [11]:
training_documents = sc.pickleFile(training_docs_list_output).collect()
validation_documents = sc.pickleFile(validation_docs_list_output).collect()
test_documents = sc.pickleFile(test_docs_list_output).collect()

In [11]:
len(set(test_documents))

401877

### Calculate Chi

In [60]:
def get_chi_index(term_index, classifications_index_set, subclasses, number_of_docs):
    return term_index.map(lambda (term, postings_list): (term, calculate_chi_squared(postings_list.keys(), classifications_index_set, subclasses, number_of_docs)))

def calculate_chi_squared(document_list, classifications_index_set, subclasses, number_of_docs):
    """
    Chi squared is the ratio of the difference between actual frequency and expected frequency of a term relative to the expected frequency
    summed up across all classes and whether the term appears or not
    Here we calculate the average chi squared score which is one of two options in multi-lable classification (the other being max)
    """
    start_time = time.time()
    chi_score = 0
    N = len(document_list)
    doc_set = set(document_list)
    Nt1 = N # actual collection frequency of having the word
    Nt0 = number_of_docs - N # actual collection frequency of not having the word
    Pt1 = float(N)/ number_of_docs # probability of the term happening
    Pt0 = float(number_of_docs - N)/ number_of_docs # probablility of the term not happening
    #print "Docs Stats: Term present in %d (%.7f), Not Present in %d (%.7f) " % (Nt1, Pt1, Nt0, Pt0)
    end_time = time.time()
    #print "Pre loop: %.4f" % (end_time - start_time)
    start_time = time.time()
    for subclass in subclasses:
        Pc1 = float(len(classifications_index_set[subclass]))/ number_of_docs # probability of the class happening
        Pc0 = 1 - Pc1
        Pt1c1 = float(len(doc_set & classifications_index_set[subclass])) / number_of_docs
        Pt1c0 = Pt1 - Pt1c1
        Pt0c1 = Pc1 - Pt1c1
        Pt0c0 = 1 - Pt1c0 - Pt0c1 - Pt1c1
        
        cat_chi_score = (number_of_docs * math.pow(Pt1c1 * Pt0c0 - Pt1c0 * Pt0c1, 2))/(Pt1 * Pt0 * Pc1 * Pc0)
        #cat_chi_score = (number_of_docs * Pt1c1 * Pt0c0 - Pt1c0 * Pt0c1)/(Pt1 * Pt0 * Pc1 * Pc0)
        # calculate average chi score
        chi_score += Pc1 * cat_chi_score
        #print "subclass %s: %.7f, %.7f, %.7f, %.7f, %.7f, %.7f" % (subclass, Pc1, Pt1c1, Pt1c0, Pt0c1, Pt0c0, chi_score)
    end_time = time.time()
    print "Per call: %.3f" % (end_time - start_time)
    return chi_score

In [30]:
import Queue as Q
import time
import cProfile
from line_profiler import LineProfiler
min_doc_postings_lists_file = "/big/s/shalaby/postings_list_full.json"

In [34]:
time.time()

1467900248.512972

In [61]:
def test_chi():
    start_time = time.time()
    top_terms_pq = Q.PriorityQueue()
    i = 0
    for line in open(min_doc_postings_lists_file, 'r'):
        (term, document_list) = json.loads(line)
        #print "%s: %d" % (term, len(document_list))
        chi_score = calculate_chi_squared(document_list.keys(), classifications_index_set, subclasses, doc_count)
        top_terms_pq.put(-chi_score, term)
        if i % 10 == 0:
            curr_time = time.time()
            print "Duration: %.3f" % (curr_time - start_time)
            start_time = curr_time
        i+=1
        if i> 100: break

In [65]:
test_chi()

Per call: 0.029
Duration: 0.030
Per call: 0.013
Per call: 0.009
Per call: 0.012
Per call: 0.009
Per call: 0.009
Per call: 0.010
Per call: 0.009
Per call: 0.008
Per call: 0.007
Per call: 0.007
Duration: 0.096
Per call: 0.038
Per call: 0.007
Per call: 0.008
Per call: 0.013
Per call: 0.008
Per call: 0.034
Per call: 0.008
Per call: 0.011
Per call: 0.007
Per call: 0.009
Duration: 0.146
Per call: 0.008
Per call: 0.007
Per call: 0.007
Per call: 0.009
Per call: 0.008
Per call: 0.010
Per call: 0.009
Per call: 0.008
Per call: 0.008
Per call: 0.021
Duration: 0.097
Per call: 0.009
Per call: 0.008
Per call: 0.008
Per call: 0.007
Per call: 0.006
Per call: 0.007
Per call: 0.007
Per call: 0.009
Per call: 0.007
Per call: 0.011
Duration: 0.080
Per call: 0.014
Per call: 0.009
Per call: 0.007
Per call: 0.012
Per call: 0.008
Per call: 0.007
Per call: 0.128
Per call: 0.008
Per call: 0.008
Per call: 0.008
Duration: 0.230
Per call: 0.009
Per call: 0.009
Per call: 0.062
Per call: 0.007
Per call: 0.007
Per call

In [62]:
profile = LineProfiler(test_chi, calculate_chi_squared )

In [63]:
profile.run('test_chi()')

Per call: 0.076
Duration: 0.078
Per call: 0.078
Per call: 0.071
Per call: 0.068
Per call: 0.065
Per call: 0.063
Per call: 0.089
Per call: 0.071
Per call: 0.054
Per call: 0.066
Per call: 0.053
Duration: 0.682
Per call: 0.080
Per call: 0.053
Per call: 0.054
Per call: 0.059
Per call: 0.054
Per call: 0.078
Per call: 0.105
Per call: 0.056
Per call: 0.057
Per call: 0.051
Duration: 0.655
Per call: 0.056
Per call: 0.049
Per call: 0.055
Per call: 0.054
Per call: 0.049
Per call: 0.056
Per call: 0.049
Per call: 0.055
Per call: 0.054
Per call: 0.064
Duration: 0.546
Per call: 0.055
Per call: 0.047
Per call: 0.053
Per call: 0.051
Per call: 0.052
Per call: 0.053
Per call: 0.048
Per call: 0.056
Per call: 0.050
Per call: 0.058
Duration: 0.527
Per call: 0.059
Per call: 0.053
Per call: 0.053
Per call: 0.054
Per call: 0.054
Per call: 0.053
Per call: 0.163
Per call: 0.054
Per call: 0.082
Per call: 0.057
Duration: 0.702
Per call: 0.055
Per call: 0.046
Per call: 0.114
Per call: 0.051
Per call: 0.058
Per call

<line_profiler.LineProfiler at 0x7f8a25661ae0>

In [64]:
profile.print_stats()

Timer unit: 1e-06 s

Total time: 3.75098 s
File: <ipython-input-60-c5a7d60d2ab4>
Function: calculate_chi_squared at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
     4                                           def calculate_chi_squared(document_list, classifications_index_set, subclasses, number_of_docs):
     5                                               """
     6                                               Chi squared is the ratio of the difference between actual frequency and expected frequency of a term relative to the expected frequency
     7                                               summed up across all classes and whether the term appears or not
     8                                               Here we calculate the average chi squared score which is one of two options in multi-lable classification (the other being max)
     9                                               """
    10       101          199      2.0      0.0      start_time =

In [18]:
cProfile.run('test_chi()', sort='pcalls')

Duration: 0.42
Duration: 2.51
Duration: 2.37
Duration: 2.30
Duration: 2.19
Duration: 2.30
Duration: 2.49
Duration: 2.45
Duration: 2.44
Duration: 2.39
Duration: 2.13
         374003 function calls in 24.009 seconds

   Ordered by: primitive call count

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   371882    0.097    0.000    0.097    0.000 {len}
      202    0.003    0.000    0.003    0.000 {method 'acquire' of 'thread.lock' objects}
      202    0.000    0.000    0.000    0.000 {method 'end' of '_sre.SRE_Match' objects}
      202    0.001    0.000    0.001    0.000 {method 'match' of '_sre.SRE_Pattern' objects}
      101    0.001    0.000    0.007    0.000 Queue.py:107(put)
      101   23.863    0.236   23.961    0.237 <ipython-input-16-444c60ee5bf9>:4(calculate_chi_squared)
      101    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
      101    0.000    0.000    0.000    0.000 {_heapq.heappush}
      101    0.000    0.000    0.031    0

In [51]:
cProfile.run("2+2", sort="cumulative")

         2 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}




In [12]:
min_doc_postings_lists.count()

44846888

In [90]:
# min_doc_postings_lists = sc.parallelize(min_doc_postings_lists.take(10000))

# term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)


In [91]:
%%capture myout

term_accepted_chi_list_with_scores = min_doc_postings_lists.map(lambda (term, postings_list): (term, calculate_chi_squared(postings_list.keys(), classifications_index, subclasses, doc_count))).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)


In [None]:
term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)
term_accepted_chi_list = map(lambda (x,y): x, term_accepted_chi_list_with_scores)
# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)
min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()
# Save Postings List
# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output)
sc.parallelize(term_accepted_chi_list).saveAsPickleFile(accepted_terms_list_output)

In [None]:
term_accepted_chi_list_with_scores = get_chi_index(min_doc_postings_lists, classifications_index, subclasses, doc_count).takeOrdered(TOP_N_FEATURES, lambda (term,score): -score)
term_accepted_chi_list = map(lambda (x,y): x, term_accepted_chi_list_with_scores)

In [28]:
# Create a sample for testing
# sample_no = 10
# postings_sample = min_doc_postings_lists.take(sample_no)
# terms_with_chi = get_chi_index(sc.parallelize(postings_sample), classifications_index, subclasses, doc_count).takeOrdered(sample_no, lambda (term,score): -score)
# terms_with_chi

[(u'result tire', 120.00464625703135),
 (u'PLIAs describ', 27.38847101545022),
 (u'11.AP.26.157', 24.99762059035747),
 (u'poly(glycoamidoamine may', 11.041338300811203),
 (u'2\xd72\xd72 form', 5.367248809371421),
 (u'number_inidicator fifth-stag', 3.532208801170213),
 (u'erect across', 3.357988592234342),
 (u'stresses depth', 3.2341600269691106),
 (u'ABS dimens', 2.6859473213716405),
 (u'inward workpiec', 1.8592295579347673)]

In [29]:
#map(lambda (term, postings_list): (term, calculate_chi_squared(postings_list.keys(), classifications_index, subclasses, doc_count)), postings_sample)

[(u'result tire', 120.00464625703135),
 (u'inward workpiec', 1.8592295579347673),
 (u'2\xd72\xd72 form', 5.367248809371421),
 (u'PLIAs describ', 27.38847101545022),
 (u'ABS dimens', 2.6859473213716405),
 (u'11.AP.26.157', 24.99762059035747),
 (u'number_inidicator fifth-stag', 3.532208801170213),
 (u'erect across', 3.357988592234342),
 (u'stresses depth', 3.2341600269691106),
 (u'poly(glycoamidoamine may', 11.041338300811203)]

In [18]:
term_accepted_chi_list[:100]

[u'also set',
 u'bodi',
 u'milliseconds',
 u'Societi',
 u'5-7',
 u'jacket',
 u'promoters',
 u'number_inidicator dynam',
 u'partially',
 u'determin process',
 u'Sci number_inidicator',
 u'applic file',
 u'however embodi',
 u'side end',
 u'assembl compris',
 u'image FIG',
 u'devic employ',
 u'appli second',
 u'includ case',
 u'miner oil',
 u'power requir',
 u'decreas',
 u'technic scientif',
 u'passed',
 u'frequent',
 u'invent effect',
 u'dens',
 u'scenarios',
 u'body',
 u'bidirect',
 u'provid complet',
 u'micro',
 u'embodi consid',
 u'radial outward',
 u'applic claim',
 u'clarity',
 u'administ patient',
 u'e.g first',
 u'apparatu accord',
 u'released',
 u'includ frame',
 u'use locat',
 u'system softwar',
 u'detail FIGS',
 u'program provid',
 u'important',
 u'ascorb',
 u'order enabl',
 u'system and/or',
 u'output data',
 u'number_inidicator case',
 u'illustr process',
 u'resin number_inidicator',
 u'slant',
 u'amount power',
 u'art embodi',
 u'infect',
 u'16b',
 u'number_inidicator gradua

#### Recreate term dictionary with just the accepted terms

In [36]:
# gets a bit slower at the end but finishes eventually 
term_dictionary = get_term_dictionary(term_accepted_chi_list)

finished 10000


In [37]:
min_doc_postings_lists = min_doc_postings_lists.filter(lambda (term, postings): term in term_accepted_chi_list).cache()

In [None]:
number_of_terms = min_doc_postings_lists.count()
number_of_terms

#### Save Reduced Postings List

In [39]:
# Save Postings List
# min_doc_postings_lists.map(lambda (term, postings_list): ",".join([term, json.dumps(postings_list)])).repartition(1).saveAsTextFile(postings_list_output)
min_doc_postings_lists.map(lambda postings: json.dumps(postings)).saveAsTextFile(postings_list_chi_selected_output.format(str(TOP_N_FEATURES)))
sc.parallelize(term_dictionary.items()).saveAsPickleFile(term_dictionary_output)
sc.parallelize(term_accepted_chi_list).saveAsPickleFile(accepted_terms_list_output)

#### Load Reduced Postings List

In [17]:
min_doc_postings_lists = sc.textFile(postings_list_chi_selected_output).map(lambda json_postings: json.loads(json_postings))
term_dictionary = dict(sc.pickleFile(term_dictionary_output).collect())