In [6]:
import json
import nltk
from nltk.tokenize import RegexpTokenizer
import string
import math
import time
import pandas as pd
import pyspark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.classification import SVMWithSGD, SVMModel
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import random
from sklearn.metrics import coverage_error
import sklearn.metrics
import cPickle as pickle

In [28]:
IS_SAMPLE = False
TRAINING_SAMPLE_PERCENTAGE = 0.01
VALIDATION_SAMPLE_PERCENTAGE = 0.01
MIN_TRAINING_SAMPLES = 10
MIN_VALIDATION_SAMPLES = 2

In [3]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
NUMBER_INDICATOR = "number_inidicator"
CURRENCY_INDICATOR = "currency_inidicator"
CHEMICAL_INDICATOR = "chemical_inidicator"
MIN_SIZE = 3
MIN_DOCUMENTS = 5

TEST_SET_PERCENTAGE = 0.2
VALIDATION_IN_TRAINING_PERCENTAGE = 0.2
MIN_DOCUMENTS_FOR_TEST = 1
MIN_DOCUMENTS_FOR_VALIDATION = 1

RANDOM_SEED = 10000
random.seed(RANDOM_SEED)

stemmer = nltk.stem.porter.PorterStemmer().stem

### Input/Output directories

In [4]:
#sc = SparkContext("", "Generate Inverted Index Job")
es_server = "deka.cip.ifi.lmu.de"
es_port = "9200"

training_file = '/big/s/shalaby/docs_output.json'

original_parent_save_location = "hdfs://deka.cip.ifi.lmu.de/svm/new/"
save_parent_location = original_parent_save_location
sample_save_parent_location = save_parent_location + "sample/"
if IS_SAMPLE: 
    save_parent_location = save_parent_location + "sample/"

file_name = "sample.json"
test_file_name = "sample.json"
#url = "/media/Work/workspace/thesis/benchmark/output/" + file_name
sample_location = save_parent_location + file_name
sample_test_location = save_parent_location + test_file_name
docs_output = save_parent_location + "docs_output"
postings_list_output = save_parent_location + "postings_list_full.json"

accepted_terms_list_output = original_parent_save_location + "accepted_terms_list_{}.pkl"
accepted_terms_with_scores_list_output = original_parent_save_location + "accepted_terms_with_scores_list_{}.pkl"
postings_list_chi_selected_output = original_parent_save_location + "postings_list_{}.json"
term_df_map_output = original_parent_save_location + "term_df_map_output_{}.json"
doc_index_chi_selected_output = original_parent_save_location + "doc_index_for_postings_{}.json"
term_dictionary_output = original_parent_save_location + "term_dictionary_{}.pkl"


postings_list_training_chi_selected_output = save_parent_location + "training_postings_list_{}.json"
postings_list_validation_chi_selected_output = save_parent_location + "validation_postings_list_{}.json"
postings_list_test_chi_selected_output = save_parent_location + "test_postings_list_{}.json"

# Classification objects, unrelated to sample size
classification_index_output = original_parent_save_location + "classification_index.pkl"
doc_classification_map_output = original_parent_save_location + "doc_classification_map.pkl"
sections_output = original_parent_save_location + "sections.pkl"
classes_output = original_parent_save_location + "classes.pkl"
subclasses_output = original_parent_save_location + "subclasses.pkl"
classifications_output = original_parent_save_location + "classifications.pkl"
doc_lengths_map_output = original_parent_save_location + "doc_lengths_map.pkl"
# training, validation and test set lists
training_docs_list_output = original_parent_save_location + "training_docs_list.pkl"
validation_docs_list_output = original_parent_save_location + "validation_docs_list.pkl"
test_docs_list_output = original_parent_save_location + "test_docs_list.pkl"
sample_training_docs_list_output = sample_save_parent_location + "training_docs_list.pkl"


training_predictions_sections_output = save_parent_location + "training_predictions_sections_list.pkl"
training_labels_sections_list_output = save_parent_location + "training_labels_sections_list.pkl"
valdiation_predictions_sections_output = save_parent_location + "validation_predictions_sections_list.pkl"
validation_labels_sections_list_output = save_parent_location + "validation_labels_sections_list.pkl"


#### Load Classification Objects

In [5]:
doc_classification_map = dict(sc.pickleFile(doc_classification_map_output).collect())
doc_count = len(doc_classification_map)
classifications_index = dict(sc.pickleFile(classification_index_output).collect())
sections = sc.pickleFile(sections_output).collect()
classes = sc.pickleFile(classes_output).collect()
subclasses = sc.pickleFile(subclasses_output).collect()
classifications = sc.pickleFile(classifications_output).collect()

In [12]:
#training_docs_list_file = original_parent_save_location + "training_docs_list.pkl"
training_docs_list_file = "/big/s/shalaby/exported_data/" + "training_documents_" + str(0.004) + "_sample.pkl"
#training_docs_list = sc.pickleFile(training_docs_list_file).collect()
training_docs_list = pickle.load(open(training_docs_list_file))
print len(training_docs_list)

22707


In [14]:
doc_count

2009750

In [7]:
classifications_index.items()[0]

(u'G-20-B', [u'07433566', u'07896523', u'06985663', u'07116477', u'07218441'])

In [8]:
doc_classification_map.items()[10]

(u'07007598', [u'B', u'B-30', u'B-30-B'])

In [9]:
sections

[u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H']

## Creating Training, Validation and Test Splits

In [None]:
# Get min number of documents for any classification
min = 1000
from collections import defaultdict
min_classf = defaultdict(list)
for (classf, documents) in classifications_index.items():
    if len(documents) == 2: 
        min = len(documents)
        min_classf[classf].append(min)
min_classf, min
        

In [35]:
len(min_classf)

760

In [8]:
len(classifications_index)

2235

In [9]:
%%time
training_documents = set()
validation_documents = set()
test_documents = set()
for (classf, documents) in classifications_index.items():
    # only worry about subclasses, classes and sections will be already included
    if(classf in sections or classf in classes): pass
    
    # remove any documents that have already been picked before
    docs_set = set(documents)
    docs_set-=training_documents
    docs_set-=validation_documents
    docs_set-=test_documents
    
    base_test_docs_num = int(len(docs_set)* TEST_SET_PERCENTAGE)
    num_test_docs = base_test_docs_num if base_test_docs_num > 0 else MIN_DOCUMENTS_FOR_TEST if MIN_DOCUMENTS_FOR_TEST < len(docs_set) else 0
    print classf, len(docs_set), num_test_docs
    classif_test_docs = random.sample(docs_set, num_test_docs)
    
    remaining_docs = docs_set.difference(set(classif_test_docs))
    base_validation_docs_num = int(len(remaining_docs)* VALIDATION_IN_TRAINING_PERCENTAGE)
    num_validation_docs = base_validation_docs_num if base_validation_docs_num > 0 else MIN_DOCUMENTS_FOR_VALIDATION if MIN_DOCUMENTS_FOR_VALIDATION < len(remaining_docs) else 0
    classif_validation_docs = random.sample(remaining_docs, num_validation_docs)
    
    classif_training_docs = set(remaining_docs).difference(set(classif_validation_docs))
    
    training_documents.update(classif_training_docs)
    validation_documents.update(classif_validation_docs)
    test_documents.update(classif_test_docs)

5 1
1 0
2 1
8 1
2 1
2 1
5 1
1 0
1 0
3 1
6 1
11 2
4723 944
2 1
2 1
1 0
1 0
1 0
3 1
2 1
2 1
1 0
2 1
2097 419
47133 9426
5554 1110
1 0
1 0
6917 1383
0 0
4158 831
4 1
384 76
1 0
5 1
1 0
1 0
1 0
1 0
1 0
189 37
501 100
936 187
1 0
2 1
3 1
6 1
1 0
4 1
3 1
1 0
1 0
552 110
2 1
2 1
0 0
1 0
1 0
32668 6533
1 0
1 0
1948 389
74 14
813 162
1 0
2 1
1 0
327 65
684 136
138 27
183 36
33 6
56 11
15 3
22 4
248 49
65 13
82 16
1 0
1 0
1 0
354 70
1 0
1 0
418 83
1907 381
1830 366
1218 243
701 140
3 1
15068 3013
59 11
818 163
465 93
5566 1113
17933 3586
1 0
5 1
0 0
1 0
1 0
1 0
1 0
3 1
1 0
2 1
697 139
1496 299
444 88
0 0
1 0
1 0
1 0
456 91
2 1
105 21
40 8
729 145
455 91
175 35
1 0
1 0
3 1
1 0
6 1
1 0
1224 244
2208 441
1 0
2 1
462 92
234954 46990
7 1
1 0
3 1
1 0
4 1
1 0
3 1
1 0
3 1
0 0
1 0
1 0
1 0
1 0
2 1
2 1
1 0
0 0
0 0
11522 2304
33507 6701
1 0
0 0
0 0
1 0
0 0
48 9
2 1
1 0
3428 685
1 0
1 0
1 0
2 1
2 1
1 0
7526 1505
1 0
1 0
4 1
1 0
0 0
1 0
0 0
0 0
0 0
0 0
4 1
1584 316
1356 271
455 91
49 9
2 1
2 1
2243 448
256 51

## Get training divergence from actual section statistics

In [13]:
#for section in sections:
#     classification = section
for clss in classes:
    classification = clss
    actual_number = len(classifications_index[classification])
    actual_percentage = float(actual_number) / doc_count
    training_number = len([training_doc for training_doc in training_documents \
                                     if classification in doc_classification_map[training_doc]])
    training_percentage = float(training_number) / len(training_documents) 
    print "%s: Actual -> %d, %.6f, Training -> %d, %.6f" % (classification, actual_number, actual_percentage, 
                                                            training_number, training_percentage)

A-00: Actual -> 5, 0.0000, Training -> 4, 0.0000
A-01: Actual -> 47218, 0.0235, Training -> 30214, 0.0235
A-02: Actual -> 5, 0.0000, Training -> 1, 0.0000
A-03: Actual -> 5, 0.0000, Training -> 3, 0.0000
A-04: Actual -> 18, 0.0000, Training -> 9, 0.0000
A-05: Actual -> 9, 0.0000, Training -> 6, 0.0000
A-06: Actual -> 28, 0.0000, Training -> 15, 0.0000
A-07: Actual -> 20, 0.0000, Training -> 14, 0.0000
A-10: Actual -> 21, 0.0000, Training -> 15, 0.0000
A-11: Actual -> 10, 0.0000, Training -> 8, 0.0000
A-12: Actual -> 8, 0.0000, Training -> 6, 0.0000
A-13: Actual -> 2, 0.0000, Training -> 2, 0.0000
A-15: Actual -> 1, 0.0000, Training -> 0, 0.0000
A-16: Actual -> 67, 0.0000, Training -> 45, 0.0000
A-18: Actual -> 2, 0.0000, Training -> 1, 0.0000
A-21: Actual -> 1222, 0.0006, Training -> 808, 0.0006
A-22: Actual -> 1034, 0.0005, Training -> 657, 0.0005
A-23: Actual -> 6144, 0.0031, Training -> 3954, 0.0031
A-24: Actual -> 1075, 0.0005, Training -> 713, 0.0006
A-25: Actual -> 2, 0.0000, Tra

#### Save the training, validation and test document lists

In [25]:
sc.parallelize(training_documents).saveAsPickleFile(training_docs_list_output)
sc.parallelize(validation_documents).saveAsPickleFile(validation_docs_list_output)
sc.parallelize(test_documents).saveAsPickleFile(test_docs_list_output)

#### Load the training, validation and test document lists

In [15]:
training_documents = sc.pickleFile(training_docs_list_output).collect()
validation_documents = sc.pickleFile(validation_docs_list_output).collect()
test_documents = sc.pickleFile(test_docs_list_output).collect()

In [16]:
len(set(test_documents))

401877

In [17]:
len(training_documents)

1286325

In [18]:
len(validation_documents)

321473

In [10]:
import cPickle as pickle
pickle.dump(training_documents, open('/big/s/shalaby/exported_data_merged/training_docs_list.pkl', 'w'))

## Creating Sample

In [23]:
%%time
random.seed(RANDOM_SEED)
sample_training_documents = set()
training_documents_set = set(training_documents)
i = 0
for (classf, documents) in classifications_index.items():
    if(classf in sections or classf in classes): pass
    documents = set(documents) & training_documents_set
    if len(documents) > MIN_TRAINING_SAMPLES:
        base_sample_docs_len = int(len(documents)* TRAINING_SAMPLE_PERCENTAGE)
        num_sample_docs = base_sample_docs_len if base_sample_docs_len > 0 else MIN_TRAINING_SAMPLES
        #print "%s: Total %d, sample: %d" % (classf, len(documents), num_sample_docs)
        classif_training_docs = random.sample(documents, num_sample_docs)
        
        sample_training_documents.update(set(classif_training_docs))
    else:
        sample_training_documents.update(documents)
    i+=1
    
    #if i > 100: break
print len(sample_training_documents)
#sc.parallelize(sample_training_documents).saveAsPickleFile(sample_training_docs_list_output)

49789
CPU times: user 3.84 s, sys: 156 ms, total: 4 s
Wall time: 3.98 s


In [24]:
import cPickle as pickle
pickle.dump(list(sample_training_documents), open('/big/s/shalaby/exported_data/training_documents_' + str(TRAINING_SAMPLE_PERCENTAGE) + "_sample.pkl", 'w'))

## Creating Validation Sample

In [29]:
%%time
random.seed(RANDOM_SEED)
sample_validation_documents = set()
validation_documents_set = set(validation_documents)
i = 0
for (classf, documents) in classifications_index.items():
    if(classf in sections or classf in classes): pass
    documents = set(documents) & validation_documents_set
    if len(documents) > MIN_VALIDATION_SAMPLES:
        base_sample_docs_len = int(len(documents)* VALIDATION_SAMPLE_PERCENTAGE)
        num_sample_docs = base_sample_docs_len if base_sample_docs_len > 0 else MIN_VALIDATION_SAMPLES
        #print "%s: Total %d, sample: %d" % (classf, len(documents), num_sample_docs)
        classif_validation_docs = random.sample(documents, num_sample_docs)
        
        sample_validation_documents.update(set(classif_validation_docs))
    else:
        sample_validation_documents.update(documents)
    i+=1
    
    #if i > 100: break
print len(sample_validation_documents)
#sc.parallelize(sample_training_documents).saveAsPickleFile(sample_training_docs_list_output)

12412
CPU times: user 2 s, sys: 68 ms, total: 2.07 s
Wall time: 2.06 s


In [30]:
import cPickle as pickle
pickle.dump(list(sample_validation_documents), open('/big/s/shalaby/exported_data/validation_documents_' + str(VALIDATION_SAMPLE_PERCENTAGE) + "_sample.pkl", 'w'))

## Creating separate file for the sample

In [31]:
%%time
training_docs_set = set(sample_training_documents)
validation_docs_set = set(sample_validation_documents)
with open('/big/s/shalaby/docs_output_training_validation_documents_' + str(TRAINING_SAMPLE_PERCENTAGE), 'w') as sample_file:
    for line in open(training_file):
        (doc_id, text) = eval(line)
        # %time doc_id in training_docs_set or doc_id in validation_docs_set
        if doc_id in training_docs_set or doc_id in validation_docs_set:
            sample_file.write(line + "\n" if not line.endswith("\n") else line)

CPU times: user 17min 22s, sys: 1min 4s, total: 18min 26s
Wall time: 22min 4s


In [59]:
sc.parallelize(sample_training_documents).saveAsPickleFile(sample_save_parent_location + str(TRAINING_SAMPLE_PERCENTAGE) + "_sample.pkl")

In [11]:
sample_training_documents = sc.pickleFile(sample_training_docs_list_output).collect()

In [60]:
training_documents = sample_training_documents

### Section Distribution

In [None]:
for classif in sorted(classifications_index.keys()):
    if len(classif) == 1:
        print "%s : %d, %.3f" % (classif, len(set(classifications_index[classif])), float(len(classifications_index[classif]))/doc_count)

### Section Overlap

In [None]:
%%time
overlap_df = pd.DataFrame({section: [0]*len(sections) for section in sections} , index=sections, columns=sections)
for doc_id in doc_classification_map:
    for classif in doc_classification_map[doc_id]:
        if len(classif) == 1:
            for classif2 in doc_classification_map[doc_id]:
                if len(classif2) == 1:
                    overlap_df[classif][classif2] += 1
overlap_df

In [None]:
mpl.colors.Normalize(1,3)

In [None]:
overlap_df.values

In [None]:
fig = plt.figure(figsize=(16,8), dpi=120)
#ax = fig.add_subplot(111, frameon=True, xticks=[], yticks=[])
vals = overlap_df.values
normal = mpl.colors.Normalize()
normal = mpl.colors.Normalize(vals.min()-1, vals.max()+vals.max()/2)
formatter = lambda x: "{:,d}".format(int(x))

the_table=plt.table(cellText=np.vectorize(formatter)(vals), rowLabels=overlap_df.index, colLabels=overlap_df.columns, 
                    colWidths = [0.1]*(vals.shape[1]+3), loc='center',
                    cellColours=plt.cm.YlGn(normal(vals)))
the_table.set_fontsize(30)
the_table.scale(2, 4)
plt.axis("off")
plt.show()