# Transcripts

In [1]:
import string
from collections import Counter
from collections import defaultdict

import networkx as nx
import pandas as pd
from IPython.display import display
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from textblob import Word

In [2]:
pd.set_option('display.max_rows', 500)

## Loading data

### Transcripts

In [3]:
# Load the pre-processed transcripts. The four listed columns are parsed cell by
# cell with `eval` (presumably they were written to the CSV as Python literals).
# NOTE(review): `eval` executes whatever code a CSV cell contains, so this is only
# acceptable while data/out/transcripts_1.csv is generated locally and trusted.
# `ast.literal_eval` looks like a safe replacement if the cells hold plain
# literals - confirm against the file before switching.
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
# Restrict the frame to these five columns, in this order.
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [4]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [5]:
transcripts_df.shape

(50, 5)

### Concepts

In [6]:
# Names of the concept categories of interest.
# The names in the trailing comment on the first row are categories that are
# currently disabled (the same ones are commented out in wordnet_concepts_dict).
# NOTE(review): `concepts` is not referenced by any other cell visible here.
concepts = [
    'water', 'animal', 'medicine', 'plant', 'food', #'activity', 'place', 'tool', 'clothing'
    'treaty', 'ceremony', 'ancestor',
]

### Specific keywords

In [7]:
# Parse the hand-written keyword file into {category: [keyword, ...]}.
# A non-blank line ending in ':' opens a new category; every other non-blank
# line is a keyword filed under the most recently opened category.
# Category names and keywords are lower-cased.
specific_words = {}
current_category = ''
with open('data/specific_keywords.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines only separate groups.
            continue
        if not line.endswith(':'):
            specific_words[current_category].append(line.lower())
            continue
        current_category = line.split(':')[0].lower()
        specific_words[current_category] = []

### Most frequent words

In [8]:
# Parse the most-frequent-words file into {category: {word: frequency}}.
# The parsing relies on fixed character offsets, so it is tightly coupled to the
# file layout. Presumably the file looks like a pretty-printed list of tuples
# under '# <Category> - ...' headers, e.g.
#     # Nouns - ...
#     [('water', 529),
#      ('food', 501)]
# TODO confirm against data/most_freq_words.txt.
# NOTE(review): any line that is not a header is parsed as an entry, so a blank
# or differently formatted line would raise (IndexError / ValueError).
most_freq_words = defaultdict(dict)
current_category = ''
with open('data/most_freq_words.txt') as f:
    for line in f:
        if line.startswith('#'):
            # Header: keep the text before the first '-', lower-cased, without the
            # first two characters and the last one.
            current_category = line.split('-')[0].lower()[2:-1]
        else:
            # Entry: skip the first three characters, then the word is everything up
            # to the first quote ...
            word = line[3:].split("'")[0].lower()
            # ... and the frequency is what follows that quote, minus two leading and
            # three trailing characters.
            freq = line[3:].split("'")[1][2:-3]
            most_freq_words[current_category].update({word: int(freq)})
# Freeze into a plain dict so later lookups of unknown categories raise KeyError
# instead of silently creating empty entries.
most_freq_words = dict(most_freq_words)

### Proper names

#### People

In [9]:
people = nx.read_gexf('data/out/people.gexf')

#### Others

In [10]:
others = nx.read_gexf('data/out/other.gexf')

## Processing data

In [11]:
def all_hypernyms(ss):
    """Return the transitive closure of ``ss`` under the hypernym relation.

    ``ss`` must provide ``closure(rel)`` and ``hypernyms()`` (as WordNet synsets
    do). The result is whatever ``ss.closure`` returns: every direct and indirect
    hypernym of ``ss``, not including ``ss`` itself unless the closure yields it.
    """
    def direct_hypernyms(synset):
        return synset.hypernyms()

    return ss.closure(direct_hypernyms)

In [12]:
def common_synsets(ss, sss):
    """Return the members of ``sss`` that are ``ss`` itself or one of its hypernyms.

    ``sss`` is any iterable of synsets; the result is a set, empty when ``ss``
    does not descend from (and is not) any of them.
    """
    lineage = {ss, *all_hypernyms(ss)}
    return lineage.intersection(sss)

In [13]:
# One big string with every utterance of every interview, one utterance per line.
# Each INTERVIEW cell is a sequence of (index, name, text) triples; only the text is kept.
all_text = '\n'.join(
    text
    for interview in transcripts_df.INTERVIEW
    for (index, name, text) in interview
)

In [14]:
all_text_lower = all_text.lower()

In [15]:
# Tokens to discard: NLTK's English stop words plus every single ASCII
# punctuation character.
stop = stopwords.words('english')
stop += list(string.punctuation)

In [16]:
def clean_text(text):
    """Lower-case ``text``, tokenize it, and drop stop words and short tokens.

    Tokens come from ``word_tokenize``. A token is kept only when it is longer
    than two characters and is not in the module-level ``stop`` collection.

    Returns the surviving tokens joined by single spaces.
    """
    # `stop` is a list; testing membership against it rescans the list for every
    # token of the corpus. A set gives the same answer with O(1) lookups.
    stop_words = set(stop)
    tokens = word_tokenize(text.lower())
    # The length test already rejects empty tokens, so no separate truthiness
    # check is needed.
    words = [token for token in tokens if len(token) > 2 and token not in stop_words]
    return ' '.join(words)

In [17]:
cleaned_all_text = clean_text(all_text)

### Concepts

In [18]:
# concepts
wordnet_concepts_dict = {
    'water': ['body_of_water.n.01'],
    'animal': ['animal.n.01'],
    'medicine': ['medicine.n.02'],
#     'clothing': ['clothing.n.01'],
#     'tool': ['instrumentality.n.03'],
#     'activity': ['interact.v.01', 'act.v.01'],
#     'place': ['location.n.01', 'building.n.01'],
    'plant': ['plant.n.02'],
    'food': ['food.n.01', 'food.n.02'],
    'treaty': ['treaty.n.01'],
    'ceremony': ['ceremony.n.01', 'ceremony.n.02', 'ceremony.n.03'],
    'ancestor': ['ancestor.n.01'],
}
synsets = [wn.synset(ss) for ls in wordnet_concepts_dict.values() for ss in ls]

In [19]:
def inverse_list_dict(d):
    """Invert a ``{key: [value, ...]}`` mapping into ``{value: key}``.

    When the same value is listed under several keys, the key that comes last
    in ``d``'s iteration order wins.
    """
    return {value: key for key in d for value in d[key]}

In [20]:
inverse_concepts = inverse_list_dict(wordnet_concepts_dict)

In [21]:
inverse_concepts

{'ancestor.n.01': 'ancestor',
 'animal.n.01': 'animal',
 'body_of_water.n.01': 'water',
 'ceremony.n.01': 'ceremony',
 'ceremony.n.02': 'ceremony',
 'ceremony.n.03': 'ceremony',
 'food.n.01': 'food',
 'food.n.02': 'food',
 'medicine.n.02': 'medicine',
 'plant.n.02': 'plant',
 'treaty.n.01': 'treaty'}

In [43]:
%%time

# Walk every token of the cleaned corpus, look up all of its WordNet synsets, and
# file a term under each concept category whose root synset is that synset or one
# of its hypernyms.
# NOTE: the term is appended once per matching synset, so a token whose several
# senses all fall under a category contributes several entries per occurrence.
# NOTE(review): wn.synsets is queried once per token occurrence, not once per
# distinct word, which dominates the run time reported below.
concepts_list = defaultdict(list)
for w in cleaned_all_text.split():
    sss = wn.synsets(w)
    for ss in sss:
        intersect = common_synsets(ss, synsets)
        if intersect:
            il = list(intersect)
            # Sanity checks: at most two roots match, and when two do they share the
            # same lemma (e.g. food.n.01 / food.n.02), so they map to the same
            # category and taking il[0] (arbitrary set order) is safe.
            assert(len(il) <= 2)
            assert(len(il) == 1 or il[0].name().split('.')[0] == il[1].name().split('.')[0])
            ssi = il[0]
            category = inverse_concepts[ssi.name()]
            if category == 'ancestor':
                # In interpersonal relationships, it is interesting to see if they talk about
                # their mother/father (singular, their own) or mothers/fathers (plural, everyone's in general).
                # Also, if they say mother/father (more formal) or mom/dad (more familiar).
                term = w # original word in the transcriptions
            else:
                term = ss.name().split('.')[0] # generic concept
                # Debug trace: print the original words that get mapped to the
                # "cat's-tail" synset name.
                if term == "cat's-tail":
                    print(w)
            concepts_list[category].append(term)

bulrushes
bulrush
bulrushes
bulrushes
bulrushes
bulrushes
bulrushes
bulrushes
bulrushes
bulrush
bulrush
bulrush
CPU times: user 44.9 s, sys: 20 ms, total: 44.9 s
Wall time: 44.9 s


In [46]:
for ss in wn.synsets('bulrushes'):
    print(ss, ss.definition())

Synset('cat's-tail.n.01') tall marsh plant with cylindrical seed heads that explode when mature shedding large quantities of down; its long flat leaves are used for making mats and chair seats; of North America, Europe, Asia and North Africa
Synset('bulrush.n.02') tall rush with soft erect or arching stems found in Eurasia, Australia, New Zealand, and common in North America


In [47]:
for ss in wn.synsets('cattail'):
    print(ss, ss.definition())

Synset('cattail.n.01') tall erect herbs with sword-shaped leaves; cosmopolitan in fresh and salt marshes


In [48]:
for ss in wn.synsets('cat-tail'):
    print(ss, ss.definition())

In [23]:
# Collapse each category's list of terms into a plain {term: occurrences} dict.
concepts_counter = {
    category: dict(Counter(terms))
    for category, terms in concepts_list.items()
}

In [24]:
concepts_counter

{'ancestor': {'ancestor': 2,
  'ancestors': 13,
  'dad': 329,
  'dads': 5,
  'father': 110,
  'fathers': 12,
  'forefathers': 4,
  'gran': 2,
  'grandfather': 75,
  'grandfathers': 12,
  'grandma': 16,
  'grandmas': 1,
  'grandmother': 59,
  'grandmothers': 4,
  'grandpa': 20,
  'grandparent': 1,
  'grandparents': 41,
  'jacob': 2,
  'jacobs': 26,
  'mom': 107,
  'moms': 2,
  'mother': 95,
  'mothers': 2,
  'papa': 1,
  'parent': 6,
  'parents': 58,
  'pop': 11,
  'pops': 1,
  'root': 39,
  'roots': 24,
  'simeon': 3},
 'animal': {'abalone': 1,
  'adult': 26,
  'alligator': 3,
  'american_bison': 3,
  'amphibian': 2,
  'animal': 93,
  'ant': 13,
  'ass': 3,
  'baby': 80,
  'bass': 38,
  'basset': 7,
  'bat': 7,
  'bay': 75,
  'bear': 15,
  'beaver': 8,
  'bee': 103,
  'beef': 3,
  'beetle': 5,
  'big_cat': 9,
  'billy': 8,
  'bird': 40,
  'bivalve': 1,
  'black-backed_gull': 3,
  'blackbird': 44,
  'blue': 16,
  'bluegill': 2,
  'bobcat': 10,
  'bobwhite': 2,
  'bowfin': 7,
  'brood_he

### Specific keywords

In [25]:
specific_words

{'ancestor': ['grandmother', 'grandfather'],
 'animal': ['quail',
  'bob white',
  'muskrat',
  'mink',
  'turtles',
  'snapping turtles',
  'turtle eggs',
  'fish',
  'frogs'],
 'ceremony': ['fasting ceremony'],
 'medicine': [],
 'other': ['strawberry moon', 'blueberry moon', 'moon'],
 'plant': ['phragmites',
  'bull rush',
  'cat tail',
  'pin weed',
  'sweet grass',
  'milk weed',
  'tobacco',
  'prairie grass',
  'tall-grass prairie',
  'meadow',
  'chestnuts',
  'lady slipper',
  'wild rice',
  'birch',
  'horsetail'],
 'water': ['bkejwanong', 'lake st. clair', 'marsh']}

In [26]:
def reduce(ls):
    """Drop every string that contains another, different string of ``ls``.

    Only the "minimal" strings survive: those that have no other element of
    ``ls`` as a substring. This keeps substring counting from double-counting
    (e.g. ``{'frog', 'frogs'}`` becomes ``{'frog'}``).

    ``ls`` may be a set or a list of strings; it is not modified. The result is
    a copy of the same type (list order is preserved).

    The previous implementation removed an element once per substring found, so
    an element containing two different other elements (``'ab'`` with ``'a'``
    and ``'b'``) raised ``KeyError``/``ValueError`` on the second removal.
    """
    kept = ls.copy()
    for candidate in ls:
        # Decide against the original collection so the outcome does not depend
        # on the order in which elements are visited or removed.
        if any(other != candidate and other in candidate for other in ls):
            kept.remove(candidate)
    return kept

In [27]:
# For every keyword, build the list of spellings to search for in the text:
# the keyword as written, with hyphens removed, with hyphens turned into
# spaces, and the textblob plural and singular of each of those.
# The variants are collected in a set (duplicates collapse) and passed through
# `reduce`, which drops any variant that contains another one, so that the
# substring counts taken later are not inflated.
# NOTE(review): textblob's singularize appears to mis-handle words ending in
# 'ss' (the displayed result contains 'prairie gras' and 'sweet gras'); the
# truncated form still matches as a substring, but it is not a real word.
specific_words_forms = {}
for category in specific_words:
    specific_words_forms[category] = {}
    for w in specific_words[category]:
        specific_words_forms[category][w] = list(reduce({
            w,
            w.replace('-', ''),
            w.replace('-', ' '),
            Word(w).pluralize().string,
            Word(w).singularize().string,
            Word(w.replace('-', '')).pluralize().string,
            Word(w.replace('-', ' ')).pluralize().string,
            Word(w.replace('-', '')).singularize().string,
            Word(w.replace('-', ' ')).singularize().string,
        }))

In [28]:
specific_words_forms

{'ancestor': {'grandfather': ['grandfather'], 'grandmother': ['grandmother']},
 'animal': {'bob white': ['bob white'],
  'fish': ['fish'],
  'frogs': ['frog'],
  'mink': ['mink'],
  'muskrat': ['muskrat'],
  'quail': ['quail'],
  'snapping turtles': ['snapping turtle'],
  'turtle eggs': ['turtle egg'],
  'turtles': ['turtle']},
 'ceremony': {'fasting ceremony': ['fasting ceremony', 'fasting ceremonies']},
 'medicine': {},
 'other': {'blueberry moon': ['blueberry moon'],
  'moon': ['moon'],
  'strawberry moon': ['strawberry moon']},
 'plant': {'birch': ['birch'],
  'bull rush': ['bull rush'],
  'cat tail': ['cat tail'],
  'chestnuts': ['chestnut'],
  'horsetail': ['horsetail'],
  'lady slipper': ['lady slipper'],
  'meadow': ['meadow'],
  'milk weed': ['milk weed'],
  'phragmites': ['phragmite'],
  'pin weed': ['pin weed'],
  'prairie grass': ['prairie gras'],
  'sweet grass': ['sweet gras'],
  'tall-grass prairie': ['tallgrass prairie',
   'tall grass prairie',
   'tall-grass prairie']

In [29]:
# Count, for every keyword, how often any of its spelling variants occurs in the
# lower-cased corpus. `str.count` matches substrings, so a variant is also
# counted when it appears inside a longer word.
specific_words_counter = {
    category: {
        keyword: sum(all_text_lower.count(form) for form in forms)
        for keyword, forms in group.items()
    }
    for category, group in specific_words_forms.items()
}

In [30]:
specific_words_counter

{'ancestor': {'grandfather': 91, 'grandmother': 64},
 'animal': {'bob white': 4,
  'fish': 796,
  'frogs': 95,
  'mink': 29,
  'muskrat': 251,
  'quail': 24,
  'snapping turtles': 4,
  'turtle eggs': 3,
  'turtles': 76},
 'ceremony': {'fasting ceremony': 0},
 'medicine': {},
 'other': {'blueberry moon': 0, 'moon': 28, 'strawberry moon': 0},
 'plant': {'birch': 0,
  'bull rush': 3,
  'cat tail': 2,
  'chestnuts': 0,
  'horsetail': 0,
  'lady slipper': 2,
  'meadow': 0,
  'milk weed': 0,
  'phragmites': 67,
  'pin weed': 0,
  'prairie grass': 3,
  'sweet grass': 76,
  'tall-grass prairie': 7,
  'tobacco': 13,
  'wild rice': 2},
 'water': {'bkejwanong': 5, 'lake st. clair': 9, 'marsh': 311}}

### Concepts + specific keywords

In [31]:
def join_two_dicts_aux(d1, d2):
    """Merge the flat mapping ``d2`` into ``d1`` in place and return ``d1``.

    For a key present in both, the larger of the two values is kept; keys only
    in ``d2`` are added unchanged.
    """
    for k in d2:
        if k in d1:
            d1[k] = max(d1[k], d2[k])
        else:
            d1[k] = d2[k]
    return d1

def join_two_dicts(d1, d2):
    """Merge the two-level mapping ``d2`` into ``d1`` in place and return ``d1``.

    Both arguments map a category to an inner ``{term: count}`` mapping. Inner
    mappings of shared categories are merged with ``join_two_dicts_aux``.
    A category only present in ``d2`` is added as a *copy* of its inner mapping,
    so later merges into ``d1`` can never modify ``d2``.
    """
    for k in d2:
        if k in d1:
            d1[k] = join_two_dicts_aux(d1[k], d2[k])
        else:
            # Copy instead of aliasing: the original stored d2[k] itself, so a
            # subsequent merge into d1[k] silently rewrote the caller's data.
            d1[k] = d2[k].copy()
    return d1

def join_dicts(*dicts):
    """Return a new two-level mapping that combines all ``dicts``.

    Categories are unioned; within a category, terms are unioned and the
    largest count seen for a term wins. None of the inputs is modified.
    With no arguments an empty dict is returned.
    """
    r = {}
    for d in dicts:
        r = join_two_dicts(r, d)
    return r

In [32]:
concepts_specific_words_counter = join_dicts(concepts_counter, specific_words_counter)

In [33]:
concepts_specific_words_counter

{'ancestor': {'ancestor': 2,
  'ancestors': 13,
  'dad': 329,
  'dads': 5,
  'father': 110,
  'fathers': 12,
  'forefathers': 4,
  'gran': 2,
  'grandfather': 91,
  'grandfathers': 12,
  'grandma': 16,
  'grandmas': 1,
  'grandmother': 64,
  'grandmothers': 4,
  'grandpa': 20,
  'grandparent': 1,
  'grandparents': 41,
  'jacob': 2,
  'jacobs': 26,
  'mom': 107,
  'moms': 2,
  'mother': 95,
  'mothers': 2,
  'papa': 1,
  'parent': 6,
  'parents': 58,
  'pop': 11,
  'pops': 1,
  'root': 39,
  'roots': 24,
  'simeon': 3},
 'animal': {'abalone': 1,
  'adult': 26,
  'alligator': 3,
  'american_bison': 3,
  'amphibian': 2,
  'animal': 93,
  'ant': 13,
  'ass': 3,
  'baby': 80,
  'bass': 38,
  'basset': 7,
  'bat': 7,
  'bay': 75,
  'bear': 15,
  'beaver': 8,
  'bee': 103,
  'beef': 3,
  'beetle': 5,
  'big_cat': 9,
  'billy': 8,
  'bird': 40,
  'bivalve': 1,
  'black-backed_gull': 3,
  'blackbird': 44,
  'blue': 16,
  'bluegill': 2,
  'bob white': 4,
  'bobcat': 10,
  'bobwhite': 2,
  'bowfi

In [41]:
# Report, part 1: one summary line per category with its number of distinct terms.
for category in concepts_specific_words_counter:
    print(category, '-', len(concepts_specific_words_counter[category]), 'different words.')
print()
print()
# Report, part 2: for each category, show the term counts twice - first in the
# order produced by the DataFrame constructor, then sorted by descending count.
for category in concepts_specific_words_counter:
    print('*' * 44)
    print('* CATEGORY:', category, '-', len(concepts_specific_words_counter[category]), 'different words.')
    print('*' * 44)
    print()
    print('Alphabetical order:')
    # Single-row frame (index 'count') transposed into one row per term.
    # NOTE(review): no explicit sort is applied here, so the "alphabetical" label
    # relies on the ordering pandas gives the dict keys - TODO confirm.
    df = pd.DataFrame(concepts_specific_words_counter[category], index=['count']).T
    display(df)
    print()
    print('Sorted by descending count:')
    df = df.sort_values('count', ascending=False)
    display(df)
    print()
    print()
# Report, part 3: for every term whose count is zero, print how often each of
# its individual words occurs (as a substring) in the lower-cased corpus, to
# help spot alternative spellings.
print('NOT FOUND WORDS:')
for category in concepts_specific_words_counter:
    group = concepts_specific_words_counter[category]
    for w in group:
        if group[w] == 0:
            for p in w.split():
                print(w, '-', p, '-', all_text_lower.count(p))

animal - 257 different words.
ceremony - 19 different words.
plant - 174 different words.
water - 31 different words.
other - 3 different words.
food - 287 different words.
medicine - 14 different words.
ancestor - 31 different words.
treaty - 4 different words.


********************************************
* CATEGORY: animal - 257 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
abalone,1
adult,26
alligator,3
american_bison,3
amphibian,2
animal,93
ant,13
ass,3
baby,80
bass,38



Sorted by descending count:


Unnamed: 0,count
fish,796
kid,352
duck,322
muskrat,251
horse,207
world,187
deer,163
homo,152
young,128
bee,103




********************************************
* CATEGORY: ceremony - 19 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
burial,10
ceremony,58
church_service,67
circumstance,4
dedication,3
exercise,20
fasting ceremony,0
funeral,9
immersion,7
initiation,1



Sorted by descending count:


Unnamed: 0,count
church_service,67
ceremony,58
vigil,53
exercise,20
opening,20
office,16
service,14
none,14
wedding,12
burial,10




********************************************
* CATEGORY: plant - 174 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
american_ginseng,2
annual,11
apple,89
aquatic,4
arbor,2
ash,77
asparagus,2
bamboo,8
banana,1
barley,1



Sorted by descending count:


Unnamed: 0,count
shrub,353
tree,261
corn,230
plant,154
grass,111
elder,90
apple,89
potato,88
ash,77
sweet grass,76




********************************************
* CATEGORY: water - 31 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
bay,75
bkejwanong,5
body_of_water,529
branch,22
brook,36
canal,2
channel,20
deep,28
ditch,61
drink,48



Sorted by descending count:


Unnamed: 0,count
body_of_water,529
marsh,311
river,199
lake,130
rivulet,85
bay,75
ditch,61
pond,55
main,54
strait,50




********************************************
* CATEGORY: other - 3 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
blueberry moon,0
moon,28
strawberry moon,0



Sorted by descending count:


Unnamed: 0,count
moon,28
blueberry moon,0
strawberry moon,0




********************************************
* CATEGORY: food - 287 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
alcohol,9
apple,89
applesauce,2
asparagus,2
bacon,3
banana,1
bannock,1
banquet,180
barley,1
bartlett,1



Sorted by descending count:


Unnamed: 0,count
water,529
food,501
fish,450
duck,322
cut,207
banquet,180
side,161
tea,150
ice,129
egg_white,125




********************************************
* CATEGORY: medicine - 14 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
acoustic,3
amobarbital_sodium,16
antidote,1
draft,2
hit,49
lotion,5
medicine,144
ointment,2
pill,2
powder,4



Sorted by descending count:


Unnamed: 0,count
medicine,144
hit,49
remedy,42
specific,40
amobarbital_sodium,16
tonic,7
lotion,5
powder,4
acoustic,3
prescription_drug,3




********************************************
* CATEGORY: ancestor - 31 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
ancestor,2
ancestors,13
dad,329
dads,5
father,110
fathers,12
forefathers,4
gran,2
grandfather,91
grandfathers,12



Sorted by descending count:


Unnamed: 0,count
dad,329
father,110
mom,107
mother,95
grandfather,91
grandmother,64
parents,58
grandparents,41
root,39
jacobs,26




********************************************
* CATEGORY: treaty - 4 different words.
********************************************

Alphabetical order:


Unnamed: 0,count
alliance,3
convention,2
peace,2
treaty,33



Sorted by descending count:


Unnamed: 0,count
treaty,33
alliance,3
convention,2
peace,2




NOT FOUND WORDS:
fasting ceremony - fasting - 1
fasting ceremony - ceremony - 12
pin weed - pin - 245
pin weed - weed - 98
meadow - meadow - 0
milk weed - milk - 41
milk weed - weed - 98
horsetail - horsetail - 0
birch - birch - 0
chestnuts - chestnuts - 0
blueberry moon - blueberry - 0
blueberry moon - moon - 28
strawberry moon - strawberry - 11
strawberry moon - moon - 28


### Proper names

In [None]:
# Hand-maintained lookup from a name as it may appear in the transcripts (first
# name, surname, nickname or full name) to one canonical label for that person.
# - A '???' in the label marks a part of the name that is ambiguous or unknown
#   (e.g. a bare surname shared by several people).
# - Commented-out entries are names that are currently disabled.
# NOTE(review): this dict is not referenced by any other cell visible here.
all_interviewees_names_dict = {
    'Aimee': 'Aimee Johnson',
    'Aimee Johnson': 'Aimee Johnson',
#     'Andrew': 'Andrew Peters',
#     'Andrew Peters': 'Andrew Peters',
    'Anita': 'Anita Smith',
    'Anita Smith': 'Anita Smith',
    'Apollo': 'Apollo Blackeagle',
    'Apollo Blackeagle': 'Apollo Blackeagle',
    'Aquash': 'Mickey Aquash',
    'Archie': 'Archie',
    'Baxter': 'Eli Baxter',
    'Becky': 'Becky',
    'Bill': 'Bill Sands',
    'Bill Sands': 'Bill Sands',
    'Blackbird': 'Jennie Blackbird',
    'Blackeagle': 'Apollo Blackeagle',
    'Brenda': 'Brenda Wheat',
    'Brenda Wheat': 'Brenda Wheat',
    'Cal': 'Cal',
    'Cameron': 'Cameron',
    'Carl': 'Carl Smith',
    'Carl Smith (Resource Protection Officer)': 'Carl Smith',
    'Carmen': 'Carmen Wrightman',
    'Carmen Wrightman': 'Carmen Wrightman',
#     'Carrie': 'Carrie Isaac',
#     'Carrie Isaac': 'Carrie Isaac',
#     'Charles': 'Charles Wright',
#     'Charles Wright': 'Charles Wright',
    'Cheryl': 'Cheryl',
    'Chief': 'Chief Gilbert',
    'Chief Gilbert': 'Chief Gilbert',
    'Chief Joseph Gilbert': 'Chief Gilbert',
    'Chris': 'Chris Riley',
    'Chris Riley': 'Chris Riley',
    'Daniel': 'Daniel',
#     'Darren': 'Darren',
    'Day': '??? Day',
    'Dean': 'Dean Jacobs',
    'Dean Jacobs': 'Dean Jacobs',
    'Dot': 'Dot Peters',
    'Dot Peters': 'Dot Peters',
#     'Doug': 'Doug',
#     'Doug (Resource Protection Officer)': 'Doug',
    'Elaine': 'Elaine Jacobs',
    'Elaine Jacobs': 'Elaine Jacobs',
    'Eli': 'Eli Baxter',
    'Eli Baxter': 'Eli Baxter',
    'Eliza': 'Eliza John',
    'Eliza John': 'Eliza John',
    'Eric': 'Eric Isaac',
    'Eric Isaac': 'Eric Isaac',
#     'Frank': 'Frank',
#     'Georgina': 'Georgina',
    'Gilbert': 'Chief Gilbert',
    'Greg': 'Greg Isaac',
    'Greg Isaac': 'Greg Isaac',
    'Gus': 'Gus',
    'Harold': 'Harold Peters',
    'Harold Peters': 'Harold Peters',
    'Hoeksma': 'Mel Hoeksma',
    'Isaac': '??? Isaac',
    'Isabelle': 'Isabelle',
    'Jacobs': '??? Jacobs',
#     'Jane': 'Jane Jacobs',
#     'Jane Jacobs': 'Jane Jacobs',
    'Jasper': 'Jasper John',
    'Jasper John': 'Jasper John',
    'Jean': 'Jean Wrightman',
    'Jean Wrightman': 'Jean Wrightman',
    'Jen': 'Jennie Blackbird',
    'Jennie': 'Jennie Blackbird',
    'Jennie Blackbird': 'Jennie Blackbird',
#     'Jerome': 'Jerome',
#     'Jerry': 'Jerry',
    'Jessica': 'Jessica',
    'Joanne': 'Joanne Day',
    'Joanne Day': 'Joanne Day',
#     'Joe': 'Joe Isaac',
#     'Joe Isaac': 'Joe Isaac',
    'John': 'John',
    'Johnson': '??? Johnson',
    'Jones': '??? Jones',
    'Joseph': 'Chief Gilbert',
#     'Julia': 'Julia',
    'Karen': 'Karen Lalleen',
    'Karen Lalleen': 'Karen Lalleen',
    'Kenneth': 'Kenneth',
    'Kennon': 'Kennon Johnson',
    'Kennon Johnson': 'Kennon Johnson',
    'Kevin': 'Kevin Smith',
    'Kevin Smith': 'Kevin Smith',
    'Lalleen': 'Karen Lalleen',
    'Lee': 'Lee White',
    'Lee White': 'Lee White',
    'Linda': 'Linda White',
    'Linda White': 'Linda White',
    'Liz': 'Lizzie Isaac',
    'Lizzie': 'Lizzie Isaac',
    'Lizzie Isaac': 'Lizzie Isaac',
    'Lloyd': 'Lloyd Day',
    'Lloyd Day': 'Lloyd Day',
    'Lois': 'Lois Wrightman',
    'Lois Wrightman': 'Lois Wrightman',
    'Lyndsay': 'Lyndsay Sword',
    'Lyndsay Sword': 'Lyndsay Sword',
#     'Mark': 'Mark',
    'Mel': 'Mel Hoeksma',
    'Mel Hoeksma': 'Mel Hoeksma',
    'Mickey': 'Mickey Aquash',
    'Mickey Aquash': 'Mickey Aquash',
#     'Morris': 'Morris Wrightman',
#     'Morris Wrightman': 'Morris Wrightman',
    'Myrna': 'Myrna',
    'Naomi': 'Naomi Williams',
    'Naomi Williams': 'Naomi Williams',
    'PD': 'Puppydog',
    'Pat': 'Pat Riley',
    'Pat Riley': 'Pat Riley',
    'Patricia': 'Patricia',
    'Patty': 'Patty Isaac',
    'Patty Isaac': 'Patty Isaac',
#     'Paul': 'Paul',
    'Peters': '??? Peters',
    'Puppydog': 'Puppydog',
    'Rachel': 'Rachel',
    'Ralph': 'Ralph ???',
    'Ralph Johnson': 'Ralph Johnson',
    'Ralph Jones': 'Ralph Jones',
    'Riley': '??? Riley',
    'Rita': 'Rita Sands',
    'Rita Sands': 'Rita Sands',
#     'Ron': 'Ron',
    'Rose': 'Rose',
    'Sands': '??? Sands',
#     'Sarah': 'Sarah',
    'Shirley': 'Shirley',
    'Smith': '??? Smith',
    'Stanley': 'Stanley',
    'Stuart': 'Stuart',
    'Suzie': 'Suzie ???',
    'Suzie Isaac': 'Suzie Isaac',
    'Suzie Jones': 'Suzie Jones',
    'Sword': 'Lyndsay Sword',
    'Terry': 'Terry Sands',
    'Terry Sands': 'Terry Sands',
#     'Tom': 'Tom',
    'Vernon': 'Vernon Jones',
    'Vernon Jones': 'Vernon Jones',
    'Wheat': 'Brenda Wheat',
    'White': '??? White',
    'Williams': 'Naomi Williams',
#     'Wright': 'Charles Wright',
    'Wrightman': '??? Wrightman',
}

In [None]:
# NOTE(review): this cell cannot run as written. It looks like a fragment pasted
# from inside a deeper loop: the lines after the first are indented without an
# enclosing block (IndentationError), and `pos_tag`, `cp`, `nltk` and `text` are
# not imported or defined in any cell visible here. `names` is built but not
# used within this cell. Either restore the surrounding code or delete the cell.
tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]

#### People

In [None]:
# Accumulate, for every node that is the second endpoint of an edge in the
# people graph, the sum of the 'weight' attributes of those edges.
# `Graph.edges_iter` was removed in NetworkX 2.0; `edges(data=True)` yields the
# same (u, v, data) triples and exists in both 1.x and 2.x.
people_counter = defaultdict(int)
for n1, n2, data in people.edges(data=True):
    # Weights are converted with int() in case the GEXF reader returns them as
    # another type.
    people_counter[n2] += int(data['weight'])

In [None]:
# Wrap the per-person totals under a single 'people' category so the result has
# the same {category: {term: count}} layout as the other counters.
# NOTE(review): this rebinds `people_counter` to a different shape, so the cell
# is not idempotent - re-running it alone wraps the already wrapped dict again.
people_counter = {'people': Counter(people_counter)}

In [None]:
people_counter

In [None]:
# NOTE(review): `all_found_terms` is not defined in any cell visible here, so on
# a fresh kernel this raises NameError unless it is created elsewhere - confirm.
len(all_found_terms)

In [None]:
all_text.count('Vernon Jones')

In [None]:
all_text_lower.count('Vernon Jones'.lower())

In [None]:
all_text.count('Vernon')

In [None]:
all_text_lower.count('Vernon'.lower())

In [None]:
# Sanity check: compare each person's graph-derived total (c1) with a plain
# substring count of the name in the corpus (c2).
#   c - number of names whose two counts agree
#   d - number of names whose counts differ (each one is printed)
#   e - initialised but never updated in this cell
# Names made of several words are matched case-insensitively, while single-word
# names are matched case-sensitively against the original text.
c=d=e=0
for w in people_counter['people']:
    c1 = people_counter['people'][w]
    if len(w.split()) > 1:
        c2 = all_text_lower.count(w.lower())
    else:
        c2 = all_text.count(w)
    if c1==c2:
        c+=1
    else:
        d+=1
        print(w, c1, c2)

In [None]:
c

In [None]:
d

In [None]:
e

#### Others

In [None]:
# Accumulate, for every node that is the second endpoint of an edge in the
# "other" proper-names graph, the sum of the 'weight' attributes of those edges.
# `Graph.edges_iter` was removed in NetworkX 2.0; `edges(data=True)` yields the
# same (u, v, data) triples and exists in both 1.x and 2.x.
others_counter = defaultdict(int)
for n1, n2, data in others.edges(data=True):
    # Weights are converted with int() in case the GEXF reader returns them as
    # another type.
    others_counter[n2] += int(data['weight'])

In [None]:
# Wrap the totals under a single 'other' category so the result has the same
# {category: {term: count}} layout as the other counters.
# NOTE(review): this rebinds `others_counter` to a different shape, so the cell
# is not idempotent - re-running it alone wraps the already wrapped dict again.
others_counter = {'other': Counter(others_counter)}

In [None]:
others_counter

In [None]:
len(all_found_terms)

In [None]:
# others
# TODO: search for these names verbatim in the text.
# (This note was a bare Spanish sentence in a code cell, which is a SyntaxError.)

### Most frequent words

In [None]:
most_freq_words

In [None]:
# Classify the most frequent words that have not been seen yet into the concept
# categories, keeping the frequency read from the most-frequent-words file.
# NOTE(review): `all_found_terms` is used as a set here (`in`, `.add`) but is not
# defined in any cell visible here - confirm where it is created.
# NOTE(review): the `else` belongs to `if intersect:` inside the synset loop, so
# a word is also written to 'other' whenever any one of its synsets fails to
# match, even if another synset matched a category. A word with no synsets at
# all is recorded nowhere. Confirm whether "no synset matched" was intended.
most_freq_words_counter = defaultdict(dict)
for pos in most_freq_words:
    group = most_freq_words[pos]
    for w in group:
        if w not in all_found_terms:
            sss = wn.synsets(w)
            for ss in sss:
                intersect = common_synsets(ss, synsets)
                if intersect:
                    all_found_terms.add(w)
                    for ssi in intersect:
                        category = inverse_concepts[ssi.name()]
                        if category == 'ancestor':
                            # In interpersonal relationships, it is interesting to see if they talk about
                            # their mother/father (singular, their own) or mothers/fathers (plural, everyone's in general).
                            # Also, if they say mother/father (more formal) or mom/dad (more familiar).
                            term = w # original word in the transcriptions
                        else:
                            term = ss.name().split('.')[0] # generic concept
                        # Assignment, not accumulation: a later word mapping to the
                        # same term overwrites the earlier frequency.
                        most_freq_words_counter[category][term] = most_freq_words[pos][w]
                else:
                    most_freq_words_counter['other'][w] = most_freq_words[pos][w] # original word in 'other'

In [None]:
for k in most_freq_words_counter:
    most_freq_words_counter[k] = Counter(most_freq_words_counter[k])

In [None]:
most_freq_words_counter

In [None]:
len(all_found_terms)