Parse all reddit data; save resulting parsed spaCy docs; save set of all verb lemmas used

In [7]:
import pandas as pd
import pickle

import spacy
NLP = spacy.load('en')
from nltk.stem import WordNetLemmatizer
LEMMATIZER = WordNetLemmatizer()

import neuralcoref
neuralcoref.add_to_pipe(NLP)

<spacy.lang.en.English at 0x15b8ce3d0>

In [49]:
NLP.pipe_names

['tagger', 'parser', 'ner', 'neuralcoref']

In [29]:
from spacy.tokens import DocBin

In [2]:
dataset_path = '../narrative_detection/narrative_posts_by_trained_classification.csv'
dataset_df = pd.read_csv(dataset_path)

In [6]:
stories = dataset_df['selftext'].tolist()

In [8]:
parsed_stories = {i: NLP(story) for i, story in enumerate(stories)}

In [10]:
parsed_list = [parsed_stories[i] for i in parsed_stories]

In [26]:
type(parsed_list[0])

spacy.tokens.doc.Doc

In [31]:
doc_bin = DocBin()
for doc in parsed_list:
    doc_bin.add(doc)

In [33]:
doc_bytes = doc_bin.to_bytes()

In [34]:
with open('data/parsed_stories.pickle', 'wb') as f:
    pickle.dump(doc_bytes, f)

In [35]:
saved_bytes = pickle.load(open('data/parsed_stories.pickle', 'rb'))
saved_docs = DocBin().from_bytes(saved_bytes)

In [43]:
recovered_docs = saved_docs.get_docs(NLP.vocab)

In [44]:
recovered_docs = [doc for doc in recovered_docs]

In [47]:
for sent in recovered_docs[0].sents:
    for noun_chunk in sent.noun_chunks:
        print(noun_chunk.text)
        print(noun_chunk._.is_coref)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: nlp.add_pipe(nlp.create_pipe('sentencizer')) Alternatively, add the dependency parser, or set sentence boundaries by setting doc[i].is_sent_start.

In [25]:
data = [pickle.dumps(doc) for doc in parsed_list]

NotImplementedError: [E112] Pickling a span is not supported, because spans are only views of the parent Doc and can't exist on their own. A pickled span would always have to include its Doc and Vocab, which has practically no advantage over pickling the parent Doc directly. So instead of pickling the span, pickle the Doc it belongs to or use Span.as_doc to convert the span to a standalone Doc object.

In [12]:
# save parsed stories to file 
pickle.dumps(parsed_list, open('parsed_stories.pkl', 'wb'))

TypeError: '_io.BufferedWriter' object cannot be interpreted as an integer

In [14]:
all_verbs = set()
for story in parsed_stories.values():
    for sent in story.sents:
        for noun_chunk in sent.noun_chunks:  
            if noun_chunk.root.dep_ in ['nsubj', 'dobj']:
                verb = noun_chunk.root.head.text
                verb_lemma = LEMMATIZER.lemmatize(verb, pos='v').lower()
                all_verbs.add(verb_lemma)

In [15]:
lexicon_path = 'FramesAgencyPower/agency_power.csv'
lexicon_df = pd.read_csv(lexicon_path)

# lemmatize verbs in lexicon
lexicon_df['lemma'] = lexicon_df['verb'].apply(lambda x: LEMMATIZER.lemmatize(x, pos='v').lower())

In [16]:
lexicon_verbs = set(lexicon_df['lemma'].tolist())

In [51]:
with open('data/lexicon_verbs_lemmatized.txt', 'w') as f:
    for verb in lexicon_verbs:
        f.write(verb + '\n')

In [19]:
# verbs missing from lexicon
missing_verbs = all_verbs - lexicon_verbs

In [21]:
# save missing verbs to file
with open('missing_verbs.txt', 'w') as f:
    for verb in missing_verbs:
        f.write(verb + '\n')

In [2]:
with open('data/missing_verbs.txt', 'r') as f:
    missing_verbs = f.read().splitlines()

In [3]:
len(missing_verbs)

2625