In [248]:
import nltk
nltk.download('averaged_perceptron_tagger')

import string

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jake/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [249]:
def strip_punc(string):
    """Return ``string`` with the listed punctuation characters removed.

    Args:
        string: Text to clean. The parameter name shadows the ``string``
            module inside this function, so the module is not usable here.

    Returns:
        A new ``str`` containing, in order, every character of ``string``
        that is not one of ``!()-[]{};:'"\\,<>./?@#$%^&*_~``.
    """
    # Raw literal: in a normal literal "\," is an invalid escape sequence, which
    # newer Python versions warn about. The characters are the same as before
    # (a backslash followed by a comma).
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # join() avoids rebuilding the result string once per kept character.
    return "".join(char for char in string if char not in punctuations)

In [250]:
from nltk.stem.porter import *

def stem(a):
    """Return the Porter stem of the word ``a``.

    A new ``nltk.PorterStemmer`` is created for each call.
    """
    stemmer = nltk.PorterStemmer()
    stemmed_word = stemmer.stem(a)
    return stemmed_word


apples bananas carrot


In [272]:
def load_document(docName):
    """Read the file at ``docName`` and return its entire contents as a string.

    The file is opened in text mode and decoded as ISO-8859-1.
    """
    with open(docName, "r", encoding='iso-8859-1') as source:
        contents = source.read()
    return contents

In [252]:
def label_document(document):
    """Split ``document`` on whitespace and part-of-speech tag the pieces.

    Returns whatever ``nltk.pos_tag`` produces for the whitespace-separated
    tokens. Punctuation stays attached to the neighbouring word because only
    ``str.split()`` is used for tokenising.
    """
    tokens = document.split()
    return nltk.pos_tag(tokens)

In [253]:
def construct_grammar():
    """Create the chunk parser used to find numeric phrases.

    The single rule labels as ``NumericalPhrase`` a ``CD`` token followed by
    an optional ``IN``, an optional ``DT``, an optional ``JJ`` and finally an
    ``NN`` or ``NNS`` token.
    """
    return nltk.RegexpParser('NumericalPhrase: {<CD><IN>?<DT>?<JJ>?<NN|NNS>}')

In [254]:
def output_number_examples(document_name, parser):
    """Print and return the ``NumericalPhrase`` chunks found in a document.

    The document is loaded with ``load_document``, tagged with
    ``label_document`` and parsed with ``parser``. For every subtree labelled
    ``'NumericalPhrase'`` its leaves are collected.

    Returns:
        A list with one entry per matching subtree, each entry being that
        subtree's ``leaves()``.
    """
    tagged_tokens = label_document(load_document(document_name))
    parse_tree = parser.parse(tagged_tokens)

    numeric_phrases = []
    for subtree in parse_tree.subtrees():
        if subtree.label() == 'NumericalPhrase':
            numeric_phrases.append(subtree.leaves())

    print(numeric_phrases)
    return numeric_phrases

In [258]:
def output_conflicts(input_phrase):
    """Print nouns that occur with more than one number, then the full mapping.

    Args:
        input_phrase: Iterable of phrases, each a sequence of ``(word, tag)``
            pairs such as those returned by ``output_number_examples``. It is
            traversed twice, so it must be re-iterable (e.g. a list).

    Prints, for every noun stem associated with more than one distinct number
    word, the stem and its set of number words; then a ``#######`` separator
    and the complete ``stem -> set of number words`` dictionary.

    Returns:
        None.
    """
    noun_tags = ('NN', 'NNS')
    my_dict = {}

    # First pass: register every noun stem (in first-seen order) so that nouns
    # which never end up paired with a number still show up with an empty set.
    # setdefault replaces re-creating the whole dictionary for every phrase.
    for phrase in input_phrase:
        for token in phrase:
            if token[1] in noun_tags:
                my_dict.setdefault(stem(strip_punc(token[0])), set())

    # Second pass: attach each phrase's number word to its noun stem.
    # setdefault also covers a phrase with no noun tag, which previously
    # raised KeyError.
    for phrase in input_phrase:
        cd, nn = get_cd_nns_pair(phrase)
        my_dict.setdefault(nn, set()).add(cd)

    for noun, numbers in my_dict.items():
        if len(numbers) > 1:
            print(noun)
            print(numbers)

    print("#######")
    print(my_dict)

In [259]:
def get_cd_nns_pair(one_phrase):
    """Extract the number word and the normalised noun from one tagged phrase.

    Args:
        one_phrase: Sequence of entries whose index 0 is the word and index 1
            is its part-of-speech tag.

    Returns:
        A tuple ``(number, noun)``: ``number`` is the word of the last entry
        tagged ``'CD'`` (unchanged), and ``noun`` is the word of the last entry
        tagged ``'NN'`` or ``'NNS'`` after ``strip_punc`` and ``stem``. A
        missing tag contributes an empty string before normalisation.
    """
    number_word = ""
    noun_word = ""
    for entry in one_phrase:
        tag = entry[1]
        if tag == 'CD':
            number_word = entry[0]
        if tag in ['NN', 'NNS']:
            noun_word = entry[0]

    normalised_noun = stem(strip_punc(noun_word))
    return number_word, normalised_noun
        

In [25]:
def load_model():
    """Index every number-bearing noun chunk in the documents under ``data/``.

    Each sub-directory of ``data`` is treated as one person and each file in
    it as one document. Every document is read as ISO-8859-1 and run through
    the spaCy ``en_core_web_sm`` pipeline. Noun chunks containing at least one
    token whose ``pos_`` is ``'NUM'`` are filed under the lemma of the chunk's
    root token (the "topic").

    Returns:
        A tuple ``(model_tpd, model_ptd)`` of nested dictionaries:

        * ``model_tpd``: topic -> person -> document -> set of chunks
        * ``model_ptd``: person -> topic -> document -> set of chunks

        Both start with a ``'null' -> 'null' -> 'null' -> set()`` placeholder
        entry. The stored chunks are the spaCy span objects themselves.
    """
    import spacy
    import os

    print("Model being built.")

    nlp = spacy.load('en_core_web_sm')

    # Same placeholder entries the maps have always been seeded with.
    model_tpd = {'null': {'null': {'null': set()}}}
    model_ptd = {'null': {'null': {'null': set()}}}

    for person in os.listdir('data'):
        top_path = os.path.join('data', person)
        # Only directories hold documents. The isdir check additionally skips
        # any other stray file in 'data', which would otherwise make
        # os.listdir(top_path) raise.
        if person == '.DS_Store' or not os.path.isdir(top_path):
            continue

        for document in os.listdir(top_path):
            true_path = os.path.join(top_path, document)
            print("Processing: ", true_path)
            with open(true_path, "r", encoding='iso-8859-1') as f:
                docstring = f.read()

            docmodel = nlp(docstring)
            for chunk in docmodel.noun_chunks:
                # Keep only chunks that mention a number.
                if not any(token.pos_ == 'NUM' for token in chunk):
                    continue

                root = chunk.root.lemma_

                # setdefault creates each missing nesting level on demand.
                (model_tpd.setdefault(root, {})
                          .setdefault(person, {})
                          .setdefault(document, set())
                          .add(chunk))
                (model_ptd.setdefault(person, {})
                          .setdefault(root, {})
                          .setdefault(document, set())
                          .add(chunk))

    print("Model loaded.")
    return model_tpd, model_ptd

# Build both indexes, then write the topic -> person -> document map to JSON.
# NOTE(review): model_to_file is defined in a later cell of this notebook, so a
# fresh top-to-bottom run raises NameError here unless that cell is run first.
map_topic_person, map_person_topic = load_model()
model_to_file(map_topic_person)

Model being built.
Processing:  data/Chris Christie/02-09-2016.txt
Processing:  data/Chris Christie/07-15-2015.txt
Processing:  data/Chris Christie/01-20-2016.txt
Processing:  data/Chris Christie/11-14-2015.txt
Processing:  data/Chris Christie/07-19-2016.txt
Processing:  data/Chris Christie/10-12-2015.txt
Processing:  data/Chris Christie/11-24-2015.txt
Processing:  data/Chris Christie/08-22-2015.txt
Processing:  data/Chris Christie/06-30-2015.txt
Processing:  data/Michelle Bachmann/10-07-2011.txt
Processing:  data/Michelle Bachmann/01-03-12.txt
Processing:  data/Michelle Bachmann/10-20-11.txt
Processing:  data/Michelle Bachmann/09-08-11.txt
Processing:  data/Michelle Bachmann/01-04-2012.txt
Processing:  data/Michelle Bachmann/01-03-2012.txt
Processing:  data/Michelle Bachmann/11-20-2011.txt
Processing:  data/Michelle Bachmann/11-20-11.txt
Processing:  data/Michelle Bachmann/10-07-11.txt
Processing:  data/Michelle Bachmann/05-27-2011.txt
Processing:  data/Michelle Bachmann/05-27-11.txt


Processing:  data/Hillary Clinton/09-15-2007.txt
Processing:  data/Hillary Clinton/05-31-07.txt
Processing:  data/Hillary Clinton/04-04-08.txt
Processing:  data/Hillary Clinton/02-09-2008.txt
Processing:  data/Hillary Clinton/04-22-08.txt
Processing:  data/Hillary Clinton/05-21-2007.txt
Processing:  data/Hillary Clinton/11-29-2007.txt
Processing:  data/Hillary Clinton/04-05-2008.txt
Processing:  data/Hillary Clinton/11-28-2007.txt
Processing:  data/Hillary Clinton/04-04-2008.txt
Processing:  data/Hillary Clinton/11-05-07.txt
Processing:  data/Hillary Clinton/03-17-2008.txt
Processing:  data/Hillary Clinton/07-10-2007.txt
Processing:  data/Hillary Clinton/05-06-2008.txt
Processing:  data/Hillary Clinton/05-21-07.txt
Processing:  data/Hillary Clinton/01-26-2008.txt
Processing:  data/Hillary Clinton/03-04-2007.txt
Processing:  data/Hillary Clinton/02-16-08.txt
Processing:  data/Hillary Clinton/02-20-2008.txt
Processing:  data/Hillary Clinton/06-01-2008.txt
Processing:  data/Hillary Clinto

Processing:  data/Martin O'Malley/07-26-15.txt
Processing:  data/Martin O'Malley/05-03-15.txt
Processing:  data/Martin O'Malley/10-30-2015.txt
Processing:  data/Martin O'Malley/07-17-2015.txt
Processing:  data/Martin O'Malley/10-23-15.txt
Processing:  data/Martin O'Malley/06-21-2015.txt
Processing:  data/Martin O'Malley/08-22-2015.txt
Processing:  data/Martin O'Malley/05-03-2015.txt
Processing:  data/Mitt Romney/10-18-12.txt
Processing:  data/Mitt Romney/11-04-2011.txt
Processing:  data/Mitt Romney/10-06-12.txt
Processing:  data/Mitt Romney/09-11-2012.txt
Processing:  data/Mitt Romney/09-10-2012.txt
Processing:  data/Mitt Romney/01-22-08.txt
Processing:  data/Mitt Romney/07-29-12.txt
Processing:  data/Mitt Romney/04-18-12.txt
Processing:  data/Mitt Romney/04-18-07.txt
Processing:  data/Mitt Romney/01-14-08.txt
Processing:  data/Mitt Romney/03-02-07.txt
Processing:  data/Mitt Romney/10-07-2011.txt
Processing:  data/Mitt Romney/04-05-2012.txt
Processing:  data/Mitt Romney/04-04-2012.txt


Processing:  data/Mitt Romney/09-26-2012.txt
Processing:  data/Mitt Romney/09-27-2012.txt
Processing:  data/Mitt Romney/01-10-2012.txt
Processing:  data/Mitt Romney/04-17-12.txt
Processing:  data/Mitt Romney/10-05-12.txt
Processing:  data/Mitt Romney/01-24-12.txt
Processing:  data/Mitt Romney/09-21-12.txt
Processing:  data/Mitt Romney/01-24-2012.txt
Processing:  data/Mitt Romney/02-07-12.txt
Processing:  data/Mitt Romney/09-12-2012.txt
Processing:  data/Mitt Romney/09-13-2012.txt
Processing:  data/Mitt Romney/01-03-2008.txt
Processing:  data/Mitt Romney/06-02-2011.txt
Processing:  data/Mitt Romney/05-05-2007.txt
Processing:  data/Mitt Romney/02-28-2012.txt
Processing:  data/Mitt Romney/02-07-07.txt
Processing:  data/Mitt Romney/09-17-12.txt
Processing:  data/Mitt Romney/09-26-2012-2.txt
Processing:  data/Mitt Romney/08-11-2012.txt
Processing:  data/Mitt Romney/04-13-12.txt
Processing:  data/Mitt Romney/10-01-12.txt
Processing:  data/Mitt Romney/05-08-2012.txt
Processing:  data/Mitt Rom

Processing:  data/John McCain/12-14-1999.txt
Processing:  data/John McCain/12-15-1999.txt
Processing:  data/John McCain/10-17-08.txt
Processing:  data/John McCain/10-04-2008.txt
Processing:  data/John McCain/06-03-2008.txt
Processing:  data/John McCain/04-05-08.txt
Processing:  data/John McCain/06-02-2008.txt
Processing:  data/John McCain/11-16-06-2.txt
Processing:  data/John McCain/03-25-08.txt
Processing:  data/John McCain/02-02-07.txt
Processing:  data/John McCain/09-18-2008.txt
Processing:  data/John McCain/08-11-08.txt
Processing:  data/John McCain/07-01-2008.txt
Processing:  data/John McCain/09-19-2008.txt
Processing:  data/John McCain/09-28-07.txt
Processing:  data/John McCain/07-18-2007.txt
Processing:  data/John McCain/04-21-08.txt
Processing:  data/John McCain/08-11-2008.txt
Processing:  data/John McCain/05-27-08.txt
Processing:  data/John McCain/01-12-08.txt
Processing:  data/John McCain/04-07-2008.txt
Processing:  data/John McCain/09-17-08.txt
Processing:  data/John McCain/

Processing:  data/John McCain/04-05-2008.txt
Processing:  data/John McCain/04-04-2008.txt
Processing:  data/John McCain/11-05-07.txt
Processing:  data/John McCain/11-16-2006.txt
Processing:  data/John McCain/09-27-07.txt
Processing:  data/John McCain/10-22-08.txt
Processing:  data/John McCain/01-19-2008.txt
Processing:  data/John McCain/07-01-08.txt
Processing:  data/John McCain/04-23-2007.txt
Processing:  data/John McCain/09-08-2007.txt
Processing:  data/John McCain/10-14-08.txt
Processing:  data/John McCain/05-07-2008.txt
Processing:  data/John McCain/05-06-2008.txt
Processing:  data/John McCain/05-21-07.txt
Processing:  data/John McCain/07-08-2008.txt
Processing:  data/John McCain/07-09-2008.txt
Processing:  data/John McCain/10-06-08.txt
Processing:  data/John McCain/12-10-07.txt
Processing:  data/John McCain/04-14-08.txt
Processing:  data/John McCain/11-04-2008-2.txt
Processing:  data/John McCain/07-25-08.txt
Processing:  data/John McCain/10-18-08.txt
Processing:  data/John McCain/

Processing:  data/Barack Obama/08-19-08.txt
Processing:  data/Barack Obama/09-20-2008.txt
Processing:  data/Barack Obama/10-02-2008.txt
Processing:  data/Barack Obama/10-03-2008.txt
Processing:  data/Barack Obama/07-10-2008-2.txt
Processing:  data/Barack Obama/01-28-2008.txt
Processing:  data/Barack Obama/01-29-2008.txt
Processing:  data/Barack Obama/10-13-08.txt
Processing:  data/Barack Obama/12-05-07.txt
Processing:  data/Barack Obama/03-27-2008.txt
Processing:  data/Barack Obama/11-02-12.txt
Processing:  data/Barack Obama/07-30-08.txt
Processing:  data/Barack Obama/09-22-2012-2.txt
Processing:  data/Barack Obama/08-23-08.txt
Processing:  data/Barack Obama/11-01-2008.txt
Processing:  data/Barack Obama/08-23-2008.txt
Processing:  data/Barack Obama/10-25-2012-3.txt
Processing:  data/Barack Obama/05-09-2008.txt
Processing:  data/Barack Obama/10-01-08.txt
Processing:  data/Barack Obama/06-23-2007.txt
Processing:  data/Barack Obama/08-21-2012-2.txt
Processing:  data/Barack Obama/04-15-200

Processing:  data/Barack Obama/04-15-08.txt
Processing:  data/Barack Obama/07-23-2008.txt
Processing:  data/Barack Obama/08-06-2008.txt
Processing:  data/Barack Obama/11-02-2012-2.txt
Processing:  data/Barack Obama/11-20-07.txt
Processing:  data/Barack Obama/09-22-2012.txt
Processing:  data/Barack Obama/11-02-2012-3.txt
Processing:  data/Barack Obama/09-02-12.txt
Processing:  data/Barack Obama/03-27-08.txt
Processing:  data/Barack Obama/12-27-07.txt
Processing:  data/Barack Obama/10-31-08.txt
Processing:  data/Barack Obama/07-05-12-2.txt
Processing:  data/Barack Obama/10-26-2008.txt
Processing:  data/Barack Obama/10-27-2008.txt
Processing:  data/Barack Obama/11-03-2007.txt
Processing:  data/Barack Obama/11-02-2007.txt
Processing:  data/Barack Obama/08-21-2007.txt
Processing:  data/Barack Obama/05-13-2008.txt
Processing:  data/Barack Obama/05-12-2008.txt
Processing:  data/Barack Obama/10-23-08.txt
Processing:  data/Barack Obama/07-05-2007.txt
Processing:  data/Barack Obama/06-21-2008.tx

Processing:  data/Barack Obama/10-25-12.txt
Processing:  data/Barack Obama/01-30-2008.txt
Processing:  data/Barack Obama/05-26-08.txt
Processing:  data/Barack Obama/08-09-2012-2.txt
Processing:  data/Barack Obama/07-14-12.txt
Processing:  data/Barack Obama/09-06-2008.txt
Processing:  data/Barack Obama/07-06-12-2.txt
Processing:  data/Barack Obama/10-25-2008.txt
Processing:  data/Barack Obama/10-24-2008.txt
Processing:  data/Barack Obama/06-05-08.txt
Processing:  data/Barack Obama/06-23-2008.txt
Processing:  data/Barack Obama/11-01-12-2.txt
Processing:  data/Barack Obama/08-15-12.txt
Processing:  data/Barack Obama/07-22-07.txt
Processing:  data/Barack Obama/09-13-12.txt
Processing:  data/Barack Obama/06-17-2008.txt
Processing:  data/Barack Obama/06-16-2008.txt
Processing:  data/Barack Obama/07-03-08.txt
Processing:  data/Barack Obama/10-08-2007.txt
Processing:  data/Barack Obama/05-25-2008.txt
Processing:  data/Barack Obama/08-02-08.txt
Processing:  data/Barack Obama/11-07-07.txt
Proces

Processing:  data/John Edwards/01-30-2008.txt
Processing:  data/John Edwards/08-23-07.txt
Processing:  data/John Edwards/08-23-2007.txt
Processing:  data/John Edwards/05-23-07.txt
Processing:  data/John Edwards/11-05-07.txt
Processing:  data/John Edwards/01-26-2008.txt
Processing:  data/John Edwards/01-03-08.txt
Processing:  data/John Edwards/03-15-07.txt
Processing:  data/Scott Walker/07-20-2016.txt
Processing:  data/Scott Walker/09-10-2015.txt
Processing:  data/Scott Walker/07-18-2015.txt
Processing:  data/Scott Walker/07-13-2015.txt
Processing:  data/Scott Walker/09-18-2015.txt
Processing:  data/Scott Walker/09-21-2015.txt
Processing:  data/Scott Walker/08-28-2015.txt
Processing:  data/Rand Paul/04-18-2015.txt
Processing:  data/Rand Paul/01-31-2016.txt
Processing:  data/Rand Paul/04-07-2015.txt
Processing:  data/Rand Paul/02-01-2016.txt
Processing:  data/John Kasich/07-21-2015.txt
Processing:  data/John Kasich/03-01-2016.txt
Processing:  data/John Kasich/03-14-2016.txt
Processing:  

Processing:  data/John Kerry/10-19-2004.txt
Processing:  data/John Kerry/10-18-2004.txt
Processing:  data/John Kerry/07-23-2004.txt
Processing:  data/John Kerry/07-22-2004.txt
Processing:  data/John Kerry/06-29-04-2.txt
Processing:  data/John Kerry/08-07-2004.txt
Processing:  data/John Kerry/07-12-04.txt
Processing:  data/John Kerry/04-23-04.txt
Processing:  data/John Kerry/10-27-2004.txt
Processing:  data/John Kerry/10-26-2004.txt
Processing:  data/John Kerry/10-23-04.txt
Processing:  data/John Kerry/06-21-2004.txt
Processing:  data/John Kerry/05-29-04.txt
Processing:  data/John Kerry/05-27-2004.txt
Processing:  data/John Kerry/09-15-2004-2.txt
Processing:  data/John Kerry/07-29-2004.txt
Processing:  data/John Kerry/06-06-04.txt
Processing:  data/John Kerry/09-15-04.txt
Processing:  data/John Kerry/03-08-2004.txt
Processing:  data/John Kerry/03-09-2004.txt
Processing:  data/John Kerry/07-16-2004.txt
Processing:  data/John Kerry/09-09-04.txt
Processing:  data/John Kerry/05-27-04.txt
Pr

Processing:  data/Rudy Guiliani/12-10-2007.txt
Processing:  data/Rudy Guiliani/05-14-2007.txt
Processing:  data/Rudy Guiliani/10-20-2007.txt
Processing:  data/Rudy Guiliani/10-11-07.txt
Processing:  data/Paul Tsongas/04-31-1991.txt
Processing:  data/Paul Tsongas/04-30-1991.txt
Processing:  data/George H.W Bush/10-26-1992-2.txt
Processing:  data/George H.W Bush/04-14-1992-2.txt
Processing:  data/George H.W Bush/10-30-1992.txt
Processing:  data/George H.W Bush/10-31-1992.txt
Processing:  data/George H.W Bush/09-13-1992.txt
Processing:  data/George H.W Bush/02-29-1992.txt
Processing:  data/George H.W Bush/02-25-1992-2.txt
Processing:  data/George H.W Bush/08-22-1992-2.txt
Processing:  data/George H.W Bush/09-30-1992-2.txt
Processing:  data/George H.W Bush/08-25-1992.txt
Processing:  data/George H.W Bush/08-24-1992.txt
Processing:  data/George H.W Bush/09-27-1992.txt
Processing:  data/George H.W Bush/09-26-1992.txt
Processing:  data/George H.W Bush/10-05-1992.txt
Processing:  data/George H

Processing:  data/Bill Richardson/12-17-2007.txt
Processing:  data/Bill Richardson/12-16-2006.txt
Processing:  data/Bill Richardson/03-27-07.txt
Processing:  data/Bill Richardson/12-29-2007.txt
Processing:  data/Bill Richardson/12-28-2007.txt
Processing:  data/Bill Richardson/10-04-2007.txt
Processing:  data/Bill Richardson/03-14-2007.txt
Processing:  data/Bill Richardson/11-27-07.txt
Processing:  data/Bill Richardson/12-16-06.txt
Processing:  data/Bill Richardson/04-25-07.txt
Processing:  data/Bill Richardson/08-07-07.txt
Processing:  data/Bill Richardson/03-27-2007.txt
Processing:  data/Bill Richardson/12-28-07.txt
Processing:  data/Bill Richardson/08-16-2007.txt
Processing:  data/Bill Richardson/12-07-2007.txt
Processing:  data/Bill Richardson/02-03-07.txt
Processing:  data/Bill Richardson/12-12-07.txt
Processing:  data/Bill Richardson/05-17-07.txt
Processing:  data/Bill Richardson/06-19-2007.txt
Processing:  data/Bill Richardson/07-16-07.txt
Processing:  data/Bill Richardson/06-27-

Processing:  data/William Clinton/09-20-1996.txt
Processing:  data/William Clinton/10-07-96-2.txt
Processing:  data/William Clinton/08-25-1996-2.txt
Processing:  data/William Clinton/11-04-96-2.txt
Processing:  data/William Clinton/08-26-96-2.txt
Processing:  data/William Clinton/11-01-1996.txt
Processing:  data/William Clinton/09-03-96.txt
Processing:  data/William Clinton/11-04-1996-3.txt
Processing:  data/William Clinton/09-11-96.txt
Processing:  data/William Clinton/10-30-1996-2.txt
Processing:  data/William Clinton/08-29-1996.txt
Processing:  data/William Clinton/10-07-1996-2.txt
Processing:  data/William Clinton/08-28-1996.txt
Processing:  data/William Clinton/11-05-96.txt
Processing:  data/William Clinton/09-14-1996.txt
Processing:  data/William Clinton/11-04-1996-2.txt
Processing:  data/William Clinton/08-28-1996-2.txt
Processing:  data/William Clinton/08-31-1996-2.txt
Processing:  data/William Clinton/09-01-96.txt
Processing:  data/William Clinton/08-31-96-2.txt
Processing:  d

Processing:  data/Robert Dole/10-25-1996.txt
Processing:  data/Robert Dole/10-11-1996.txt
Processing:  data/Robert Dole/10-10-1996.txt
Processing:  data/Robert Dole/10-18-96.txt
Processing:  data/Robert Dole/08-31-1996.txt
Processing:  data/Robert Dole/10-22-96.txt
Processing:  data/Robert Dole/07-13-96.txt
Processing:  data/Robert Dole/08-12-96.txt
Processing:  data/Robert Dole/10-30-96.txt
Processing:  data/Robert Dole/11-04-96.txt
Processing:  data/Robert Dole/10-26-96-3.txt
Processing:  data/Robert Dole/02-12-96.txt
Processing:  data/Robert Dole/05-28-1996.txt
Processing:  data/Robert Dole/06-03-96.txt
Processing:  data/Robert Dole/09-10-96.txt
Processing:  data/Robert Dole/10-22-1996.txt
Processing:  data/Robert Dole/10-23-1996.txt
Processing:  data/Robert Dole/04-10-95.txt
Processing:  data/Robert Dole/10-26-96-2.txt
Processing:  data/Robert Dole/06-10-1996.txt
Processing:  data/Robert Dole/07-17-96.txt
Processing:  data/Robert Dole/10-26-96.txt
Processing:  data/Robert Dole/10-2

In [273]:
# Chunk "test.txt" into numeric phrases (printed by output_number_examples),
# then report noun stems that appear with more than one distinct number word.
output_conflicts(output_number_examples("test.txt", construct_grammar()))

[[('trillion', 'CD'), ('in', 'IN'), ('tax', 'NN')], [('trillion', 'CD'), ('tax', 'NN')], [('3', 'CD'), ('years,', 'NN')], [('4', 'CD'), ('years.', 'NN')], [('30', 'CD'), ('years', 'NNS')], [('two', 'CD'), ('decades.', 'NNS')], [('billion', 'CD'), ('in', 'IN'), ('taxpayer', 'NN')], [('100,000', 'CD'), ('new', 'JJ'), ('math', 'NN')], [('trillion', 'CD'), ('through', 'IN'), ('a', 'DT'), ('mix', 'NN')], [('trillion', 'CD'), ('dollars', 'NNS')], [('million', 'CD'), ('new', 'JJ'), ('jobs,', 'NN')], [('billion', 'CD'), ('in', 'IN'), ('corporate', 'JJ'), ('welfare.', 'NN')], [('one', 'CD'), ('way', 'NN')], [('trillion', 'CD'), ('in', 'IN'), ('new', 'JJ'), ('tax', 'NN')], [('trillion', 'CD'), ('in', 'IN'), ('new', 'JJ'), ('defense', 'NN')], [('Four', 'CD'), ('years', 'NNS')], [('one', 'CD'), ('people.', 'NN')], [('million', 'CD'), ('checks.', 'NN')], [('one', 'CD'), ('term.', 'NN')], [('one', 'CD'), ('party.', 'NN')], [('47', 'CD'), ('percent', 'NN')], [('1', 'CD'), ('percent', 'NN')], [('99', 

In [269]:
"""

Current Limitations: NLTK has problems accurately identifying the part of speech of some number words - it does
much better in the digital representation. The singular versions of 

"""

'\n\nCurrent Limitations: NLTK has problems accurately identifying the part of speech of some number words - it does\nmuch better in the digital representation. The singular versions of \n\n'

In [21]:
import json

class SetEncoder(json.JSONEncoder):
    """JSON encoder that also handles sets and objects of a class named ``Span``."""

    def default(self, obj):
        """Convert values the stock encoder rejects.

        Sets become lists, objects whose class name is exactly ``"Span"``
        become their ``str()`` form, and everything else is passed to the
        base implementation (which raises ``TypeError``).
        """
        if isinstance(obj, set):
            return list(obj)

        # Matched by class name, so no import of the Span class is needed here.
        class_name = obj.__class__.__name__
        if class_name == "Span":
            return str(obj)

        return super().default(obj)

def model_to_file(model, path='result.json'):
    """Serialise ``model`` to a JSON file using ``SetEncoder``.

    Args:
        model: Object to dump. Values the standard encoder cannot handle are
            converted by ``SetEncoder``.
        path: Destination file, overwritten if it exists. Defaults to
            ``'result.json'``, the location that was previously hard-coded.

    Returns:
        None.
    """
    print("Writing JSON.")

    with open(path, 'w') as fp:
        json.dump(model, fp, cls=SetEncoder)

    print("Done writing JSON.")


In [22]:
# NOTE(review): `maps` is not defined in any cell visible here, so this looks
# like leftover kernel state — confirm which model was meant to be written.
model_to_file(maps)

Writing JSON.
Done writing JSON.


In [37]:
import os

# Summary: number of distinct topics overall, then per-person topic counts
# relative to the number of documents found for that person under data/.
print("Unique numeric topics spoken about ", len(map_topic_person.keys()))
print("##############################")
print("Topics spoken about per person:")

# person -> number of files in that person's directory ('null' is a placeholder).
person_numbers = {'null': 0}
for person in os.listdir('data'):
    if person == ".DS_Store":
        continue
    document_count = len(os.listdir(os.path.join('data', person)))
    person_numbers[person] = document_count
    print(document_count)

for key, document_total in person_numbers.items():
    if key == 'null':
        continue
    topic_total = len(map_person_topic[key])
    print(key, "spoke about ", topic_total, " specific numeric topics over ", document_total, ", a ratio of ", topic_total / document_total)

Unique numeric topics spoken about  1718
##############################
Topics spoken about per person:
9
16
174
26
10
96
21
250
6
6
27
9
299
2
5
33
570
34
42
7
4
6
28
12
9
7
18
174
3
12
30
58
2
123
60
7
45
30
7
12
15
22
7
92
150
Chris Christie spoke about  52  specific numeric topics over  9 , a ratio of  5.777777777777778
Michelle Bachmann spoke about  62  specific numeric topics over  16 , a ratio of  3.875
Hillary Clinton spoke about  338  specific numeric topics over  174 , a ratio of  1.9425287356321839
Bernie Sanders spoke about  126  specific numeric topics over  26 , a ratio of  4.846153846153846
Ralph Nader spoke about  102  specific numeric topics over  10 , a ratio of  10.2
George W. Bush spoke about  159  specific numeric topics over  96 , a ratio of  1.65625
Martin O'Malley spoke about  76  specific numeric topics over  21 , a ratio of  3.619047619047619
Mitt Romney spoke about  319  specific numeric topics over  250 , a ratio of  1.276
Herman Cain spoke about  32  specif

In [58]:
import collections
import statistics

# Report: per-person topic/document table, spread of the ratios, the 20 topics
# with the most stored phrases, and the total number of stored phrases.
print("Unique numeric topics spoken about ", len(map_topic_person.keys()))
print("##############################")
print("Topics spoken about per person:")

# NOTE(review): the second column is labelled "Number of Numeric Phrases" but the
# value printed is len(map_person_topic[person]), i.e. that person's topic keys.
print("Person\tNumber of Numeric Phrases\tNumber of Document\tRatio")

# person -> number of files in that person's directory ('null' is a placeholder).
person_numbers = {'null': 0}
for person in os.listdir('data'):
    if person == ".DS_Store":
        continue
    person_numbers[person] = len(os.listdir(os.path.join('data', person)))

vals = []
for key, document_total in person_numbers.items():
    if key == 'null':
        continue
    topic_total = len(map_person_topic[key])
    ratio = topic_total / document_total
    vals.append(ratio)
    print(key, "\t", topic_total, "\t", document_total, "\t", ratio)

print("Stddev: ", statistics.stdev(vals))
print("Mean: ", statistics.mean(vals))
print("Median: ", statistics.median(vals))

print("##############################")

print("Most Popular Keys")

# topic -> total number of stored chunks across all people and documents.
topics = collections.Counter({'null' : 0})
for topic, people in map_topic_person.items():
    mention_total = sum(
        len(phrases)
        for documents in people.values()
        for phrases in documents.values()
    )
    topics[topic] += mention_total

for name, mention_total in topics.most_common(20):
    print(name, '\t', mention_total)

print("##############################")
print("Total Mentions")

print(sum(topics.values()))
    
    
    
    

Unique numeric topics spoken about  1718
##############################
Topics spoken about per person:
Person	Number of Numeric Phrases	Number of Document	Ratio
Chris Christie 	 52 	 9 	 5.777777777777778
Michelle Bachmann 	 62 	 16 	 3.875
Hillary Clinton 	 338 	 174 	 1.9425287356321839
Bernie Sanders 	 126 	 26 	 4.846153846153846
Ralph Nader 	 102 	 10 	 10.2
George W. Bush 	 159 	 96 	 1.65625
Martin O'Malley 	 76 	 21 	 3.619047619047619
Mitt Romney 	 319 	 250 	 1.276
Herman Cain 	 32 	 6 	 5.333333333333333
Tim Pawlenty 	 29 	 6 	 4.833333333333333
Rick Perry 	 103 	 27 	 3.814814814814815
Ted Cruz 	 57 	 9 	 6.333333333333333
John McCain 	 362 	 299 	 1.2107023411371238
Christopher Dodd 	 2 	 2 	 1.0
Joseph Lieberman 	 25 	 5 	 5.0
Howard Dean 	 169 	 33 	 5.121212121212121
Barack Obama 	 423 	 570 	 0.7421052631578947
Rick Santorum 	 78 	 34 	 2.2941176470588234
John Edwards 	 101 	 42 	 2.4047619047619047
Scott Walker 	 36 	 7 	 5.142857142857143
Rand Paul 	 40 	 4 	 10.0
J

In [59]:
# Show everything filed under the 'job' topic: one person per line, followed by
# that person's document -> phrases mapping.
job_mentions = map_topic_person['job']
for speaker, phrases_by_document in job_mentions.items():
    print(speaker)
    print(phrases_by_document)

Michelle Bachmann
{'10-20-11.txt': {1.4 million jobs}, '09-08-11.txt': {over 2.5 million jobs, 800,000 jobs, 416,000 government jobs}, '10-20-2011.txt': {1.4 million jobs}, '09-08-2011.txt': {over 2.5 million jobs, 416,000 government jobs, 800,000 jobs}}
Hillary Clinton
{'02-13-08.txt': {3 million manufacturing jobs, five million green collar jobs, 1,600 jobs, five million new, green collar jobs}, '03-04-2008.txt': {two jobs}, '11-05-2007.txt': {at least five million good new jobs, more than 1 million new jobs, some 300,000 good jobs, at least 5 million jobs, 85,000 new jobs}, '04-13-2007.txt': {Twenty-two million new jobs}, '04-01-08.txt': {3 million manufacturing jobs, 13,000 manufacturing jobs, nearly one-in-four manufacturing jobs, at least 5 million additional jobs, at least 3 million jobs}, '10-24-2007.txt': {two jobs}, '11-19-2007.txt': {five million new jobs, an estimated 1.8 million jobs, 22 million new jobs, one job}, '04-01-2008.txt': {at least 3 million jobs, 3 million manu

{'10-18-12.txt': {a million new manufacturing jobs, 23 million new jobs, 23 million new jobs, 12 million jobs, more than half a million new manufacturing jobs, more than 5 million new jobs, 800,000 jobs}, '10-15-2008.txt': {a million jobs, five million new, green jobs, one million jobs}, '07-25-12.txt': {23 million new jobs, more than 4 1/2 million new jobs, 800,000 jobs}, '01-30-08.txt': {a $7 an-hour job}, '05-20-2008.txt': {five million new jobs}, '05-21-2008.txt': {five million new jobs}, '08-13-12-2.txt': {4 1/2 million new jobs, about 7,000 jobs, 37,000 jobs, nearly 23 million new jobs, more than 1 million jobs}, '04-14-08-2.txt': {up to five million new jobs}, '08-02-2012-2.txt': {nearly 23 million new jobs, 4 1/2 million jobs}, '02-13-08.txt': {the next two decades - jobs, up to 5 million new jobs, nearly two million new jobs}, '01-22-08.txt': {$7-an-hour jobs}, '09-27-08.txt': {one million jobs, five million new jobs, the 400 union jobs}, '08-28-12.txt': {Nearly 7,000 good Iow