In [1]:
from collections import defaultdict
from datetime import datetime
import os
import random

import numpy as np
import pandas as pd

import spacy
# Load the small English spaCy model; the coreference component below is attached to this pipeline.
nlp = spacy.load('en_core_web_sm')

# NOTE(review): `spacy.lemmatizer` plus the LEMMA_INDEX/LEMMA_EXC/LEMMA_RULES tables
# look like the spaCy 2.x API — confirm the pinned spaCy version before upgrading.
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
# Stand-alone rule-based lemmatizer; it is not referenced again in the cells shown here.
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)

import neuralcoref
# Register the NeuralCoref component on `nlp` under the name "neuralcoref".
# NOTE(review): blacklist=False presumably makes first/second-person pronouns
# ("i", "you", ...) eligible for resolution — confirm against the neuralcoref docs.
nlp.add_pipe(neuralcoref.NeuralCoref(nlp.vocab,blacklist=False),name="neuralcoref")

from main2 import ConnoFramer

In [2]:
# Lexicon CSV handed to `framer.train()` as its first argument.
# Set the CONNO_LEXICON_PATH environment variable to point at your own copy;
# the fallback is the original author's local path, so existing runs are unchanged.
lexicon_path = os.environ.get(
    'CONNO_LEXICON_PATH',
    '/Users/maria/Documents/data/FramesAgencyPower/agency_power.csv',
)

<br><br><br><br>

# Small demo (old)

In [3]:
# Two short first-person stories for the small demo.
example_stories = [
    "I was just thinking about walking down the street, when my shoelace snapped. I had to call my doctor to pick me up. I felt so bad I also called my friend Katie, who came in her car. She was a lifesaver. My friend Jack is nice.",
    "My doctor fixed my shoe. I thanked him. Then Susan arrived. Now she is calling the doctor too.",
]
# Each story's id is simply its position in the list.
text_ids = list(range(len(example_stories)))

In [4]:
# Train a new ConnoFramer on the two example stories, keyed by their text ids.
framer = ConnoFramer()
framer.train(lexicon_path, example_stories, text_ids)

2023-02-06 10:19:36 Complete!


In [5]:
# Positive/negative counts per persona, summed over both example stories.
framer.get_score_totals()

defaultdict(<function main2.ConnoFramer.__score_dataset.<locals>.<lambda>()>,
            {'i': defaultdict(int, {'positive': 1, 'negative': 2}),
             'my doctor': defaultdict(int, {'positive': 4, 'negative': 0}),
             'susan': defaultdict(int, {'positive': 0, 'negative': 1})})

In [6]:
# Per-persona positive/negative counts for the story with text id 1 only.
framer.get_scores_for_doc(1)

defaultdict(<function main2.ConnoFramer.__score_document.<locals>.<lambda>()>,
            {'my doctor': defaultdict(int, {'positive': 3, 'negative': 0}),
             'susan': defaultdict(int, {'negative': 1, 'positive': 0})})

In [7]:
# Counts keyed by (persona, verb lemma) for text id 1.
# NOTE(review): presumably pairs where the persona is the verb's subject (nsubj) — confirm in main2.
framer.count_nsubj_for_doc(1)

defaultdict(int,
            {('my doctor', 'fix'): 1,
             ('susan', 'arrive'): 1,
             ('susan', 'call'): 1})

In [8]:
# Counts keyed by (persona, verb lemma) for text id 1.
# NOTE(review): presumably pairs where the persona is the verb's direct object (dobj) — confirm in main2.
framer.count_dobj_for_doc(1)

defaultdict(int, {('my doctor', 'thank'): 1, ('my doctor', 'call'): 1})

In [9]:
# Per-persona positive/negative counts for the story with text id 0 only.
framer.get_scores_for_doc(0)

defaultdict(<function main2.ConnoFramer.__score_document.<locals>.<lambda>()>,
            {'i': defaultdict(int, {'positive': 1, 'negative': 2}),
             'my doctor': defaultdict(int, {'positive': 1, 'negative': 0})})

In [10]:
# Counts keyed by (persona, verb lemma) for text id 0.
# NOTE(review): presumably pairs where the persona is the verb's subject (nsubj) — confirm in main2.
framer.count_nsubj_for_doc(0)

defaultdict(int,
            {('i', 'think'): 1,
             ('i', 'have'): 1,
             ('i', 'feel'): 1,
             ('i', 'call'): 1})

In [11]:
# Counts keyed by (persona, verb lemma) for text id 0.
# NOTE(review): presumably pairs where the persona is the verb's direct object (dobj) — confirm in main2.
framer.count_dobj_for_doc(0)

defaultdict(int, {('i', 'pick'): 1, ('my doctor', 'call'): 1})

<br><br><br><br>

# Bigger demo (new)

In [20]:
# Build the corpus for the bigger demo: a fixed-size random sample of
# paragraphs (non-empty lines) from every book file in `stories_path`.
# Litbank corpus here: https://github.com/dbamman/litbank
# Set LITBANK_STORIES_PATH to use your own copy; the fallback is the original local path.
stories_path = os.environ.get(
    'LITBANK_STORIES_PATH',
    '/Users/maria/Documents/data/narrativity/litbank/original',
)
PARAGRAPHS_PER_BOOK = 100   # upper bound on paragraphs sampled from each book
SAMPLE_SEED = 0             # fixed so Restart & Run All draws the same sample

# A private generator makes the draw reproducible (and the cell idempotent)
# without reseeding the global `random` state.
_rng = random.Random(SAMPLE_SEED)

texts = []
# sorted(): os.listdir returns entries in arbitrary order, which would
# otherwise change which paragraphs are drawn from one machine to the next.
for _file_name in sorted(os.listdir(stories_path)):
    _file_path = os.path.join(stories_path, _file_name)
    # Skip sub-directories and hidden files (e.g. macOS .DS_Store); they are not books.
    if _file_name.startswith('.') or not os.path.isfile(_file_path):
        continue

    # `with` closes the handle as soon as the book is read; the explicit
    # encoding keeps decoding independent of the platform default.
    with open(_file_path, 'r', encoding='utf-8') as _book:
        _lines = [_line.strip() for _line in _book if _line.strip()]

    # min(): a book with fewer paragraphs than requested contributes all of
    # them instead of making random.sample raise ValueError.
    texts.extend(_rng.sample(_lines, min(PARAGRAPHS_PER_BOOK, len(_lines))))

# One id per sampled paragraph (not per book): ids are the running index 0..len(texts)-1.
text_ids = list(range(len(texts)))

len(texts), len(text_ids)

(10000, 10000)

In [21]:
# Fresh ConnoFramer for the bigger demo; this rebinds `framer`, discarding the small-demo instance.
framer = ConnoFramer()

In [22]:
# Train on the sampled paragraphs; per the output below, progress is logged every 100 texts.
framer.train(lexicon_path, texts, text_ids)

2023-02-06 10:35:13 Processed 0 out of 10000
2023-02-06 10:35:15 Processed 100 out of 10000
2023-02-06 10:35:16 Processed 200 out of 10000
2023-02-06 10:35:17 Processed 300 out of 10000
2023-02-06 10:35:18 Processed 400 out of 10000
2023-02-06 10:35:19 Processed 500 out of 10000
2023-02-06 10:35:20 Processed 600 out of 10000
2023-02-06 10:35:21 Processed 700 out of 10000
2023-02-06 10:35:22 Processed 800 out of 10000
2023-02-06 10:35:23 Processed 900 out of 10000
2023-02-06 10:35:24 Processed 1000 out of 10000
2023-02-06 10:35:25 Processed 1100 out of 10000
2023-02-06 10:35:26 Processed 1200 out of 10000
2023-02-06 10:35:27 Processed 1300 out of 10000
2023-02-06 10:35:28 Processed 1400 out of 10000
2023-02-06 10:35:29 Processed 1500 out of 10000
2023-02-06 10:35:29 Processed 1600 out of 10000
2023-02-06 10:35:30 Processed 1700 out of 10000
2023-02-06 10:35:31 Processed 1800 out of 10000
2023-02-06 10:35:32 Processed 1900 out of 10000
2023-02-06 10:35:33 Processed 2000 out of 10000
2023

In [23]:
# Per-persona positive/negative totals over the whole sampled corpus;
# the displayed length is the number of distinct personas that received a score.
persona_score_dict = framer.get_score_totals()
len(persona_score_dict)

284

In [24]:
# Net score per persona: positive count minus negative count.
persona_sum_dict = {
    _name: _scores['positive'] - _scores['negative']
    for _name, _scores in persona_score_dict.items()
}
len(persona_sum_dict)

284

In [25]:
# Print the 20 personas with the highest net score. Each entry is the persona
# name, then "<net> <positive> <negative>", then a blank line.
_top_personas = sorted(persona_sum_dict, key=persona_sum_dict.get, reverse=True)[:20]
for _persona in _top_personas:
    _scores = persona_score_dict[_persona]
    print(_persona)
    print(persona_sum_dict[_persona], _scores['positive'], _scores['negative'])
    print()

i
179 292 113

you
37 75 38

mary
11 11 0

george
6 7 1

rosamond
3 3 0

jane
3 4 1

mrs. miller
2 2 0

isabella
2 2 0

deborah
2 2 0

tommy
2 3 1

sikes
2 2 0

dick
2 5 3

mr. hale
2 2 0

theobald
2 2 0

miriam
2 2 0

michael
2 2 0

freddy malins
2 2 0

jo
2 3 1

mrs. selwyn
2 2 0

joe
2 3 1

