In [1]:
from collections import defaultdict
from datetime import datetime
import os
import random

import numpy as np
import pandas as pd

# Load spaCy's small English pipeline; `nlp` is extended with coreference below.
import spacy
nlp = spacy.load('en_core_web_sm')

from spacy.lemmatizer import Lemmatizer

# The lemmatizer is built differently depending on the spaCy version; the
# commented-out lines are the spaCy <= 2.1 way, the active lines the 2.2 way.
# from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES # for spaCy 2.1 and earlier
from spacy.lang.en import English   # for spaCy 2.2

# lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) # for spaCy 2.1 and earlier
lemmatizer = English.Defaults.create_lemmatizer()   # for spaCy 2.2

# Append neuralcoref's coreference-resolution component to the pipeline.
# NOTE(review): blacklist=False presumably lets neuralcoref also resolve
# first/second-person pronouns ("i", "my", "you", ...) -- confirm against the
# neuralcoref docs. Passing a component object to add_pipe() looks like the
# spaCy 2.x API, so this cell presumably needs spaCy 2.x -- confirm before upgrading.
import neuralcoref
nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab,blacklist=False),name="neuralcoref")

from main2 import ConnoFramer

<br><br><br><br>

# **Small demo w/ Sap frames** 

In [2]:
# Two short first-person stories used as toy input. Each story is written as
# adjacent string literals (one per sentence), which Python joins into one string.
example_stories = [
    (
        "I was just thinking about walking down the street, when my shoelace snapped. "
        "I had to call my doctor to pick me up. "
        "I felt so bad I also called my friend Katie, who came in her car. "
        "She was a lifesaver. "
        "My friend Jack is nice."
    ),
    (
        "My doctor fixed my shoe. "
        "I thanked him. "
        "Then Susan arrived. "
        "Now she is calling the doctor too."
    ),
]
# One integer id per story, in list order.
text_ids = list(range(len(example_stories)))

In [3]:
# Fresh framer using the 'power' dimension of the Sap lexicon, run over the
# two example stories (one text id per story).
framer = ConnoFramer()
framer.load_sap_lexicon('power')
framer.train(example_stories, text_ids)

100%|██████████| 2/2 [00:00<00:00, 27.34it/s]

2023-02-20 21:08:38 Complete!





In [4]:
# Per-persona power scores summed over all trained stories
# (e.g. 'my doctor' = 1.0 from story 0 + 3.0 from story 1).
framer.get_score_totals()

{'i': -2.0, 'my doctor': 4.0, 'susan': -1.0, 'my': -1.0}

In [5]:
# Echo the first story so the per-document results that follow can be read against it.
example_stories[0]

'I was just thinking about walking down the street, when my shoelace snapped. I had to call my doctor to pick me up. I felt so bad I also called my friend Katie, who came in her car. She was a lifesaver. My friend Jack is nice.'

In [6]:
# Per-persona power scores for the story with text id 0 only.
framer.get_scores_for_doc(0)

{'i': -2.0, 'my doctor': 1.0}

In [7]:
# Counts keyed by (persona, verb lemma) for story 0; going by the method name,
# these are the pairs where the persona is the verb's nominal subject (nsubj).
framer.count_nsubj_for_doc(0)

{('i', 'think'): 1,
 ('i', 'have'): 1,
 ('i', 'feel'): 1,
 ('i', 'call'): 1,
 ('her', 'be'): 1}

In [8]:
# Counts keyed by (persona, verb lemma) for story 0; going by the method name,
# these are the pairs where the persona is the verb's direct object (dobj).
framer.count_dobj_for_doc(0)

{('my doctor', 'call'): 1, ('i', 'pick'): 1}

In [9]:
# Echo the second story so the per-document results that follow can be read against it.
example_stories[1]

'My doctor fixed my shoe. I thanked him. Then Susan arrived. Now she is calling the doctor too.'

In [10]:
# Per-persona power scores for the story with text id 1 only.
# NOTE(review): the narrator is labelled 'my' here but 'i' in story 0 --
# presumably the label comes from the first mention in the coreference
# cluster; confirm in ConnoFramer.
framer.get_scores_for_doc(1)

{'susan': -1.0, 'my': -1.0, 'my doctor': 3.0}

In [11]:
# Subject-side (persona, verb lemma) counts for story 1.
framer.count_nsubj_for_doc(1)

{('susan', 'arrive'): 1,
 ('susan', 'call'): 1,
 ('my', 'thank'): 1,
 ('my doctor', 'fix'): 1}

In [12]:
# Object-side (persona, verb lemma) counts for story 1.
framer.count_dobj_for_doc(1)

{('my doctor', 'thank'): 1, ('my doctor', 'call'): 1}

In [13]:
# Start over with a new framer, this time scoring the same stories with the
# 'agency' dimension of the Sap lexicon.
framer = ConnoFramer()
framer.load_sap_lexicon('agency')
framer.train(example_stories, text_ids)

100%|██████████| 2/2 [00:00<00:00, 34.51it/s]

2023-02-20 21:08:44 Complete!





In [14]:
# Per-persona agency scores summed over all trained stories.
framer.get_score_totals()

{'i': 2.0, 'my doctor': 1.0, 'susan': 1.0, 'my': 1.0}

<br><br><br><br>

# **Small demo w/ Rashkin frames**

In [15]:
# Toy input for this section: two short first-person stories, each written as
# adjacent string literals (one per sentence) that Python joins into one string.
example_stories = [
    (
        "I was just thinking about walking down the street, when my shoelace snapped. "
        "I had to call my doctor to pick me up. "
        "I felt so bad I also called my friend Katie, who came in her car. "
        "She was a lifesaver. "
        "My friend Jack is nice."
    ),
    (
        "My doctor fixed my shoe. "
        "I thanked him. "
        "Then Susan arrived. "
        "Now she is calling the doctor too."
    ),
]
# One integer id per story, in list order.
text_ids = list(range(len(example_stories)))

In [16]:
# New framer scoring the example stories with the 'effect' dimension of the
# Rashkin lexicon.
framer = ConnoFramer()
framer.load_rashkin_lexicon('effect')
framer.train(example_stories, text_ids)

100%|██████████| 2/2 [00:00<00:00, 36.13it/s]

2023-02-20 21:08:46 Complete!





In [17]:
# Per-persona 'effect' scores summed over all trained stories; the recorded
# values are fractional, unlike the whole-number Sap scores earlier in the notebook.
framer.get_score_totals()

{'i': 1.4000000000000001,
 'my doctor': 1.4,
 'susan': 0.33333333333299997,
 'my': 0.666666666667}

In [18]:
# Per-persona 'effect' scores for the story with text id 0 only.
framer.get_scores_for_doc(0)

{'i': 1.4000000000000001, 'my doctor': 0.266666666667}

In [19]:
# Per-persona 'effect' scores for the story with text id 1 only.
framer.get_scores_for_doc(1)

{'my doctor': 1.133333333333,
 'susan': 0.33333333333299997,
 'my': 0.666666666667}

In [20]:
# New framer scoring the example stories with the 'value' dimension of the
# Rashkin lexicon.
framer = ConnoFramer()
framer.load_rashkin_lexicon('value')
framer.train(example_stories, text_ids)

100%|██████████| 2/2 [00:00<00:00, 36.68it/s]

2023-02-20 21:08:47 Complete!





In [21]:
# Per-persona 'value' scores summed over all trained stories.
framer.get_score_totals()

{'i': 3.0,
 'my doctor': 3.2000000000010003,
 'my': 0.866666666667,
 'susan': 1.066666666667}

<br><br><br><br>

# **Bigger demo w/ Sap frames and example dataset**

In [22]:
# Build the demo dataset: sample paragraphs (non-empty lines) from every book
# in the corpus directory and give each sampled paragraph its own integer id.
texts = []
text_ids = []
# NOTE: machine-specific absolute path -- point this at your local copy of the corpus.
stories_path = '/Users/maria/Documents/data/narrativity/litbank/original'   # Litbank corpus here: https://github.com/dbamman/litbank

_paragraphs_per_book = 100

# Private seeded generator: the sample is reproducible across runs without
# touching the global `random` state. (Outputs recorded from an earlier,
# unseeded run will not match exactly.)
_rng = random.Random(0)

j = 0
# os.listdir() returns entries in arbitrary order; sort so that ids are
# assigned to the same paragraphs on every run.
for _file_name in sorted(os.listdir(stories_path)):
    _file_path = os.path.join(stories_path, _file_name)
    if not os.path.isfile(_file_path):
        # Skip sub-directories; open() would raise on them.
        continue

    # `with` closes each book's file handle as soon as it has been read.
    with open(_file_path, 'r') as _file:
        _lines = [_line.strip() for _line in _file if _line.strip()]

    # Randomly sample up to 100 paragraphs from each book. Capping at
    # len(_lines) avoids the ValueError random.sample raises when a book has
    # fewer paragraphs than requested.
    for _line in _rng.sample(_lines, min(_paragraphs_per_book, len(_lines))):
        texts.append(_line)
        text_ids.append(j)
        j += 1

len(texts), len(text_ids)

(10000, 10000)

In [23]:
# Score the sampled corpus paragraphs with the 'power' dimension of the Sap
# lexicon; each paragraph is its own document (one id per entry in `texts`).
framer = ConnoFramer()
framer.load_sap_lexicon('power')
framer.train(texts, text_ids)

100%|██████████| 10000/10000 [01:33<00:00, 106.58it/s]

2023-02-20 21:10:25 Complete!





In [24]:
# Total power score per persona across all sampled paragraphs; the displayed
# length is the number of personas present in those totals.
persona_score_dict = framer.get_score_totals()
len(persona_score_dict)

772

In [25]:
# Rank personas by total score, highest first, and print the top 20 as
# "<score> <tab> <persona>".
_by_score_desc = sorted(persona_score_dict.items(), key=lambda item: item[1], reverse=True)
for _persona, _score in _by_score_desc[:20]:
    print(_score, '\t', _persona)

145.0 	 he
89.0 	 she
89.0 	 you
64.0 	 i
18.0 	 his
11.0 	 her
7.0 	 my
6.0 	 sir
6.0 	 herself
5.0 	 dick
5.0 	 george
4.0 	 him
4.0 	 robert
3.0 	 the doctor
3.0 	 "you
3.0 	 god
3.0 	 tom
3.0 	 fanny
3.0 	 lucilla
3.0 	 zora
