# Transcripts

In [1]:
import string
from collections import Counter
from collections import defaultdict

import networkx as nx
import pandas as pd
from IPython.display import display
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from textblob import Word

In [2]:
pd.set_option('display.max_rows', 500)

## Loading data

### Transcripts

In [3]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [4]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [5]:
transcripts_df.shape

(50, 5)

### Concepts

In [6]:
concepts = [
    'water', 'animal', 'medicine', 'plant', 'food', #'activity', 'place', 'tool', 'clothing'
    'treaty', 'ceremony', 'ancestor',
]

### Specific keywords

In [7]:
specific_words = {}
current_category = ''
with open('data/specific_keywords.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            if line.endswith(':'):
                current_category = line.split(':')[0].lower()
                specific_words[current_category] = []
            else:
                specific_words[current_category].append(line.lower())

### Most frequent words

In [8]:
most_freq_words = defaultdict(dict)
current_category = ''
with open('data/most_freq_words.txt') as f:
    for line in f:
        if line.startswith('#'):
            current_category = line.split('-')[0].lower()[2:-1]
        else:
            word = line[3:].split("'")[0].lower()
            freq = line[3:].split("'")[1][2:-3]
            most_freq_words[current_category].update({word: int(freq)})
most_freq_words = dict(most_freq_words)

### Proper names

#### People

In [9]:
people = nx.read_gexf('data/out/people.gexf')

#### Others

In [10]:
others = nx.read_gexf('data/out/other.gexf')

## Processing data

In [11]:
def all_hypernyms(ss):
    return ss.closure(lambda x: x.hypernyms())

In [12]:
def common_synsets(ss, sss):
    hypers = set(all_hypernyms(ss))
    hypers.add(ss)
    return hypers.intersection(sss)

In [13]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

In [14]:
stop = stopwords.words('english')
stop.extend(string.punctuation)

In [15]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stop and token and len(token) > 2]
    text = ' '.join(words)
    return text

In [16]:
cleaned_all_text = clean_text(all_text)

### Concepts

In [17]:
# concepts
wordnet_concepts_dict = {
    'water': ['body_of_water.n.01'],
    'animal': ['animal.n.01'],
    'medicine': ['medicine.n.02'],
#     'clothing': ['clothing.n.01'],
#     'tool': ['instrumentality.n.03'],
#     'activity': ['interact.v.01', 'act.v.01'],
#     'place': ['location.n.01', 'building.n.01'],
    'plant': ['plant.n.02'],
    'food': ['food.n.01', 'food.n.02'],
    'treaty': ['treaty.n.01'],
    'ceremony': ['ceremony.n.01', 'ceremony.n.02', 'ceremony.n.03'],
    'ancestor': ['ancestor.n.01'],
}
synsets = [wn.synset(ss) for ls in wordnet_concepts_dict.values() for ss in ls]

In [18]:
def inverse_list_dict(d):
    i = {}
    for k in d:
        for v in d[k]:
            i[v] = k
    return i

In [19]:
inverse_concepts = inverse_list_dict(wordnet_concepts_dict)

In [20]:
inverse_concepts

{'ancestor.n.01': 'ancestor',
 'animal.n.01': 'animal',
 'body_of_water.n.01': 'water',
 'ceremony.n.01': 'ceremony',
 'ceremony.n.02': 'ceremony',
 'ceremony.n.03': 'ceremony',
 'food.n.01': 'food',
 'food.n.02': 'food',
 'medicine.n.02': 'medicine',
 'plant.n.02': 'plant',
 'treaty.n.01': 'treaty'}

In [21]:
%%time

concepts_list = defaultdict(list)
for w in cleaned_all_text.split():
    categories = set()
    sss = wn.synsets(w)
    for ss in sss:
        intersect = common_synsets(ss, synsets)
        if intersect:
            il = list(intersect)
            assert(len(il) <= 2)
            assert(len(il) == 1 or il[0].name().split('.')[0] == il[1].name().split('.')[0])
            for ssi in intersect:
                category = inverse_concepts[ssi.name()]
                if category not in categories:
                    concepts_list[category].append(w)
                    categories.add(category)
concepts_counter = {}
for category in concepts_list:
    concepts_counter[category] = dict(Counter(concepts_list[category]))

CPU times: user 46.6 s, sys: 144 ms, total: 46.7 s
Wall time: 46.7 s


In [22]:
concepts_counter

{'ancestor': {'ancestor': 2,
  'ancestors': 13,
  'dad': 329,
  'dads': 5,
  'father': 55,
  'fathers': 6,
  'forefathers': 4,
  'gran': 2,
  'grandfather': 75,
  'grandfathers': 12,
  'grandma': 16,
  'grandmas': 1,
  'grandmother': 59,
  'grandmothers': 4,
  'grandpa': 20,
  'grandparent': 1,
  'grandparents': 41,
  'jacob': 2,
  'jacobs': 26,
  'mom': 107,
  'moms': 2,
  'mother': 95,
  'mothers': 2,
  'papa': 1,
  'parent': 6,
  'parents': 58,
  'pop': 11,
  'pops': 1,
  'root': 39,
  'roots': 24,
  'simeon': 3},
 'animal': {'abalone': 1,
  'adult': 11,
  'adults': 15,
  'alligator': 3,
  'amphibian': 1,
  'amphibians': 1,
  'animal': 19,
  'animals': 74,
  'ant': 7,
  'ants': 6,
  'assess': 3,
  'babies': 11,
  'baby': 29,
  'bass': 38,
  'basset': 7,
  'bat': 7,
  'bay': 73,
  'bays': 2,
  'bear': 14,
  'bears': 1,
  'beaver': 6,
  'beavers': 2,
  'bee': 74,
  'beef': 3,
  'bees': 29,
  'beetle': 3,
  'beetles': 2,
  'billy': 8,
  'bird': 6,
  'birds': 34,
  'bivalves': 1,
  'bla

### Specific keywords

In [23]:
specific_words

{'ancestor': ['grandmother', 'grandfather'],
 'animal': ['quail',
  'bob white',
  'muskrat',
  'mink',
  'turtles',
  'snapping turtles',
  'turtle eggs',
  'fish',
  'frogs'],
 'ceremony': ['fasting ceremony'],
 'medicine': [],
 'other': ['strawberry moon', 'blueberry moon', 'moon'],
 'plant': ['phragmites',
  'bull rush',
  'cat tail',
  'pin weed',
  'sweet grass',
  'milk weed',
  'tobacco',
  'prairie grass',
  'tall-grass prairie',
  'meadow',
  'chestnuts',
  'lady slipper',
  'wild rice',
  'birch',
  'horsetail'],
 'water': ['bkejwanong', 'lake st. clair', 'marsh']}

In [24]:
def all_forms(w):
    return list({
        w,
        w.replace(' ', ''),
        w.replace('-', ''),
        w.replace('-', ' '),
        Word(w).pluralize().string,
        Word(w).singularize().string,
        Word(w.replace(' ', '')).pluralize().string,
        Word(w.replace('-', '')).pluralize().string,
        Word(w.replace('-', ' ')).pluralize().string,
        Word(w.replace(' ', '')).singularize().string,
        Word(w.replace('-', '')).singularize().string,
        Word(w.replace('-', ' ')).singularize().string,
    })

In [25]:
specific_words_forms = {}
for category in specific_words:
    specific_words_forms[category] = {}
    for w in specific_words[category]:
        specific_words_forms[category][w] = all_forms(w)

In [26]:
specific_words_forms

{'ancestor': {'grandfather': ['grandfather', 'grandfathers'],
  'grandmother': ['grandmothers', 'grandmother']},
 'animal': {'bob white': ['bob whites', 'bobwhites', 'bob white', 'bobwhite'],
  'fish': ['fish'],
  'frogs': ['frogss', 'frog', 'frogs'],
  'mink': ['mink', 'minks'],
  'muskrat': ['muskrat', 'muskrats'],
  'quail': ['quail', 'quails'],
  'snapping turtles': ['snapping turtles',
   'snappingturtles',
   'snapping turtle',
   'snapping turtless',
   'snappingturtle',
   'snappingturtless'],
  'turtle eggs': ['turtle egg',
   'turtle eggs',
   'turtleegg',
   'turtleeggs',
   'turtle eggss',
   'turtleeggss'],
  'turtles': ['turtle', 'turtless', 'turtles']},
 'ceremony': {'fasting ceremony': ['fasting ceremony',
   'fasting ceremonies',
   'fastingceremony',
   'fastingceremonies']},
 'medicine': {},
 'other': {'blueberry moon': ['blueberry moon',
   'blueberrymoon',
   'blueberrymoons',
   'blueberry moons'],
  'moon': ['moon', 'moons'],
  'strawberry moon': ['strawberry moo

In [27]:
def count_sublists(l1, l2):
    return sum(1 for i in range(len(l2)-len(l1)+1) if l1 == l2[i:i+len(l1)])

In [28]:
%%time

specific_words_counter = {}
for category in specific_words_forms:
    specific_words_counter[category] = {}
    group = specific_words_forms[category]
    for k in group:
        subtotal = 0
        for w in group[k]:
            count = count_sublists(w.split(), cleaned_all_text.split())
            if count > 0:
                specific_words_counter[category][w] = count
            subtotal += count
        if subtotal == 0:
            specific_words_counter[category][k] = 0

CPU times: user 6.52 s, sys: 12 ms, total: 6.54 s
Wall time: 6.54 s


In [29]:
specific_words_counter

{'ancestor': {'grandfather': 75,
  'grandfathers': 12,
  'grandmother': 59,
  'grandmothers': 4},
 'animal': {'bob white': 4,
  'bobwhite': 1,
  'bobwhites': 1,
  'fish': 449,
  'frog': 21,
  'frogs': 38,
  'mink': 24,
  'minks': 3,
  'muskrat': 141,
  'muskrats': 105,
  'quail': 24,
  'snapping turtle': 1,
  'snapping turtles': 3,
  'turtle': 37,
  'turtle eggs': 3,
  'turtles': 39},
 'ceremony': {'fasting ceremony': 0},
 'medicine': {},
 'other': {'blueberry moon': 0, 'moon': 7, 'moons': 1, 'strawberry moon': 0},
 'plant': {'birch': 0,
  'bull rushes': 3,
  'cat tails': 2,
  'cattail': 2,
  'cattails': 16,
  'chestnuts': 0,
  'horsetail': 0,
  'lady slipper': 2,
  'lady slippers': 2,
  'meadow': 0,
  'milkweed': 17,
  'milkweeds': 1,
  'phragmites': 65,
  'pin weed': 0,
  'prairie grass': 3,
  'sweet grass': 72,
  'sweetgrass': 12,
  'tall grass prairie': 4,
  'tall grass prairies': 2,
  'tallgrass prairie': 1,
  'tobacco': 13,
  'wild rice': 2},
 'water': {'bkejwanong': 5, 'lake st.

In [30]:
for category in specific_words_counter:
    group1 = specific_words_counter[category]
    if category in concepts_counter:
        group2 = concepts_counter[category]
        for w in group1:
            if w in group2:
                if group1[w] != group2[w]:
                    print('ERROR:', category, w, group1[w], group2[w])
            else:
                print('>>>', category, w, group1[w])
    else:
        for w in group1:
            print('***', category, w, group1[w])

>>> ceremony fasting ceremony 0
>>> water marshes 56
>>> water bkejwanong 5
>>> water marsh 235
>>> water lake st. clair 8
*** other moon 7
*** other blueberry moon 0
*** other strawberry moon 0
*** other moons 1
>>> animal snapping turtles 3
>>> animal bob white 4
>>> animal snapping turtle 1
>>> animal turtle eggs 3
>>> plant horsetail 0
>>> plant lady slippers 2
>>> plant birch 0
>>> plant sweetgrass 12
>>> plant wild rice 2
>>> plant lady slipper 2
>>> plant tallgrass prairie 1
>>> plant bull rushes 3
>>> plant phragmites 65
>>> plant tall grass prairies 2
>>> plant chestnuts 0
>>> plant pin weed 0
>>> plant meadow 0
>>> plant cat tails 2
>>> plant prairie grass 3
>>> plant tall grass prairie 4
>>> plant sweet grass 72


### Concepts + specific keywords

In [31]:
def join_two_dicts(d1, d2):
    r = d1
    for k in r:
        if k in d2:
            r[k].update(d2[k])
    for k in d2:
        if k not in r:
            r[k] = d2[k]
    return r

def join_dicts(*dicts):
    r = {}
    for d in dicts:
        r = join_two_dicts(r, d)
    return r

In [32]:
concepts_specific_words_counter = join_dicts(concepts_counter, specific_words_counter)

In [33]:
concepts_specific_words_counter

{'ancestor': {'ancestor': 2,
  'ancestors': 13,
  'dad': 329,
  'dads': 5,
  'father': 55,
  'fathers': 6,
  'forefathers': 4,
  'gran': 2,
  'grandfather': 75,
  'grandfathers': 12,
  'grandma': 16,
  'grandmas': 1,
  'grandmother': 59,
  'grandmothers': 4,
  'grandpa': 20,
  'grandparent': 1,
  'grandparents': 41,
  'jacob': 2,
  'jacobs': 26,
  'mom': 107,
  'moms': 2,
  'mother': 95,
  'mothers': 2,
  'papa': 1,
  'parent': 6,
  'parents': 58,
  'pop': 11,
  'pops': 1,
  'root': 39,
  'roots': 24,
  'simeon': 3},
 'animal': {'abalone': 1,
  'adult': 11,
  'adults': 15,
  'alligator': 3,
  'amphibian': 1,
  'amphibians': 1,
  'animal': 19,
  'animals': 74,
  'ant': 7,
  'ants': 6,
  'assess': 3,
  'babies': 11,
  'baby': 29,
  'bass': 38,
  'basset': 7,
  'bat': 7,
  'bay': 73,
  'bays': 2,
  'bear': 14,
  'bears': 1,
  'beaver': 6,
  'beavers': 2,
  'bee': 74,
  'beef': 3,
  'bees': 29,
  'beetle': 3,
  'beetles': 2,
  'billy': 8,
  'bird': 6,
  'birds': 34,
  'bivalves': 1,
  'bla

In [34]:
# Correctness
for category in concepts_specific_words_counter:
    group = concepts_specific_words_counter[category]
    for w in group:
        v = group[w]
        if category in concepts_counter:
            group1 = concepts_counter[category]
            if w in group1:
                v1 = group1[w]
                assert(v == v1)
        if category in specific_words_counter:
            group2 = specific_words_counter[category]
            if w in group2:
                v2 = group2[w]
                assert(v == v2)

In [35]:
# Completeness
for category in concepts_counter:
    group = concepts_specific_words_counter[category]
    group1 = concepts_counter[category]
    for w in group1:
        v = group[w]
        v1 = group1[w]
        assert(v == v1)
for category in specific_words_counter:
    group = concepts_specific_words_counter[category]
    group2 = specific_words_counter[category]
    for w in group2:
        v = group[w]
        v2 = group2[w]
        assert(v == v2)

In [36]:
def reduce_dict(dct):
    reductions = {}
    dict1 = dct.copy()
    changed = True
    while changed:
        dict2 = dict1.copy()
        brk = False
        for x in dict2:
            forms = all_forms(x)
            for y in forms:
                if y != x and y in dict2:
                    print('Joining', x, '(', dict2[x], ') and', y, '(', dict2[y], ')...')
                    if dict2[x] >= dict2[y]:
                        reductions[x] = {x: dict1[x], y: dict1[y]}
                        dict1[x] += dict1[y]
                        del(dict1[y])
                        print('... keeping', x, ', deleting', y, '.')
                    else:
                        reductions[y] = {y: dict1[y], x: dict1[x]}
                        dict1[y] += dict1[x]
                        del(dict1[x])
                        print('... keeping', y, ', deleting', x, '.')
                    brk = True
                if brk:
                    break
            if brk:
                break
        changed = len(dict1) != len(dict2)
    return (dict1, reductions)

In [37]:
reduced_counter = {}
reductions = {}
for category in concepts_specific_words_counter:
    reduced_counter[category], reductions[category] = reduce_dict(concepts_specific_words_counter[category])

Joining treat ( 12 ) and treats ( 2 )...
... keeping treat , deleting treats .
Joining pie ( 13 ) and pies ( 12 )...
... keeping pie , deleting pies .
Joining chop ( 13 ) and chops ( 2 )...
... keeping chop , deleting chops .
Joining strawberry ( 11 ) and strawberries ( 11 )...
... keeping strawberry , deleting strawberries .
Joining stock ( 5 ) and stocks ( 3 )...
... keeping stock , deleting stocks .
Joining dishes ( 9 ) and dish ( 2 )...
... keeping dishes , deleting dish .
Joining shoulders ( 6 ) and shoulder ( 7 )...
... keeping shoulder , deleting shoulders .
Joining berries ( 43 ) and berry ( 6 )...
... keeping berries , deleting berry .
Joining cherries ( 9 ) and cherry ( 6 )...
... keeping cherries , deleting cherry .
Joining stews ( 2 ) and stew ( 4 )...
... keeping stew , deleting stews .
Joining geese ( 42 ) and goose ( 41 )...
... keeping geese , deleting goose .
Joining candy ( 4 ) and candies ( 3 )...
... keeping candy , deleting candies .
Joining bird ( 6 ) and birds ( 

... keeping grandfather , deleting grandfathers .
Joining mothers ( 2 ) and mother ( 95 )...
... keeping mother , deleting mothers .
Joining seals ( 1 ) and seal ( 4 )...
... keeping seal , deleting seals .
Joining bob white ( 4 ) and bobwhites ( 1 )...
... keeping bob white , deleting bobwhites .
Joining bob white ( 5 ) and bobwhite ( 1 )...
... keeping bob white , deleting bobwhite .
Joining hen ( 6 ) and hens ( 5 )...
... keeping hen , deleting hens .
Joining bulls ( 1 ) and bull ( 14 )...
... keeping bull , deleting bulls .
Joining stock ( 5 ) and stocks ( 3 )...
... keeping stock , deleting stocks .
Joining babies ( 11 ) and baby ( 29 )...
... keeping baby , deleting babies .
Joining crow ( 5 ) and crows ( 2 )...
... keeping crow , deleting crows .
Joining permit ( 2 ) and permits ( 2 )...
... keeping permit , deleting permits .
Joining men ( 40 ) and man ( 79 )...
... keeping man , deleting men .
Joining fox ( 37 ) and foxes ( 1 )...
... keeping fox , deleting foxes .
Joining rac

Joining watermelons ( 2 ) and watermelon ( 6 )...
... keeping watermelon , deleting watermelons .
Joining corn ( 114 ) and corns ( 1 )...
... keeping corn , deleting corns .
Joining oaks ( 2 ) and oak ( 21 )...
... keeping oak , deleting oaks .
Joining flower ( 8 ) and flowers ( 15 )...
... keeping flowers , deleting flower .
Joining cucumbers ( 3 ) and cucumber ( 1 )...
... keeping cucumbers , deleting cucumber .
Joining bamboo ( 7 ) and bamboos ( 1 )...
... keeping bamboo , deleting bamboos .
Joining pears ( 13 ) and pear ( 5 )...
... keeping pears , deleting pear .
Joining pumpkins ( 3 ) and pumpkin ( 7 )...
... keeping pumpkin , deleting pumpkins .
Joining violets ( 2 ) and violet ( 2 )...
... keeping violets , deleting violet .
Joining onions ( 13 ) and onion ( 2 )...
... keeping onions , deleting onion .
Joining boxes ( 11 ) and box ( 43 )...
... keeping box , deleting boxes .
Joining elms ( 2 ) and elm ( 26 )...
... keeping elm , deleting elms .
Joining torches ( 1 ) and torch (

In [38]:
for category in reduced_counter:
    for w in reduced_counter[category]:
        if w not in reductions[category]:
            reductions[category][w] = {}

In [39]:
reductions

{'ancestor': {'ancestors': {'ancestor': 2, 'ancestors': 13},
  'dad': {'dad': 329, 'dads': 5},
  'father': {'father': 55, 'fathers': 6},
  'forefathers': {},
  'gran': {},
  'grandfather': {'grandfather': 75, 'grandfathers': 12},
  'grandma': {'grandma': 16, 'grandmas': 1},
  'grandmother': {'grandmother': 59, 'grandmothers': 4},
  'grandpa': {},
  'grandparents': {'grandparent': 1, 'grandparents': 41},
  'jacobs': {'jacob': 2, 'jacobs': 26},
  'mom': {'mom': 107, 'moms': 2},
  'mother': {'mother': 95, 'mothers': 2},
  'papa': {},
  'parents': {'parent': 6, 'parents': 58},
  'pop': {'pop': 11, 'pops': 1},
  'root': {'root': 39, 'roots': 24},
  'simeon': {}},
 'animal': {'abalone': {},
  'adults': {'adult': 11, 'adults': 15},
  'alligator': {},
  'amphibian': {'amphibian': 1, 'amphibians': 1},
  'animals': {'animal': 19, 'animals': 74},
  'ant': {'ant': 7, 'ants': 6},
  'assess': {},
  'baby': {'babies': 11, 'baby': 29},
  'bass': {},
  'basset': {},
  'bat': {},
  'bay': {'bay': 73, 'b

In [40]:
for category in reduced_counter:
    print(category, '-', len(reduced_counter[category]), 'different words.')
print()
print()
for category in reduced_counter:
    print('*' * 42)
    print('* CATEGORY:', category, '-', len(reduced_counter[category]), 'different words.')
    print('*' * 42)
    print()
    print('Alphabetical order:')
    df = pd.DataFrame(reduced_counter[category], index=['count']).T
    df['word forms'] = df.apply(lambda x: reductions[category][x.name], axis=1)
    display(df)
    print()
    print('Sorted by descending count:')
    df = df.sort_values('count', ascending=False)
    display(df)
    print()
    print()
print('NOT FOUND WORDS:')
for category in reduced_counter:
    group = reduced_counter[category]
    for w in group:
        if group[w] == 0:
            for p in w.split():
                print(w, '-', p, '-', cleaned_all_text.count(p))

food - 302 different words.
ancestor - 18 different words.
medicine - 18 different words.
water - 31 different words.
other - 3 different words.
treaty - 4 different words.
animal - 248 different words.
plant - 163 different words.
ceremony - 21 different words.


******************************************
* CATEGORY: food - 302 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
alcohol,9,{}
apples,89,"{'apples': 59, 'apple': 30}"
applesauce,2,{}
asparagus,2,{}
bacon,3,{}
bananas,1,{}
bannock,1,{}
barley,1,{}
bartlett,1,{}
bass,38,{}



Sorted by descending count:


Unnamed: 0,count,word forms
water,529,"{'waters': 33, 'water': 496}"
fish,450,"{'fishes': 1, 'fish': 449}"
ducks,322,"{'ducks': 185, 'duck': 137}"
food,247,"{'food': 226, 'foods': 21}"
cut,207,"{'cuts': 8, 'cut': 199}"
feast,162,"{'feast': 104, 'feasts': 58}"
side,161,"{'sides': 16, 'side': 145}"
ice,129,{}
white,125,"{'white': 121, 'whites': 4}"
corn,115,"{'corns': 1, 'corn': 114}"




******************************************
* CATEGORY: ancestor - 18 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
ancestors,15,"{'ancestor': 2, 'ancestors': 13}"
dad,334,"{'dads': 5, 'dad': 329}"
father,61,"{'father': 55, 'fathers': 6}"
forefathers,4,{}
gran,2,{}
grandfather,87,"{'grandfather': 75, 'grandfathers': 12}"
grandma,17,"{'grandma': 16, 'grandmas': 1}"
grandmother,63,"{'grandmothers': 4, 'grandmother': 59}"
grandpa,20,{}
grandparents,42,"{'grandparent': 1, 'grandparents': 41}"



Sorted by descending count:


Unnamed: 0,count,word forms
dad,334,"{'dads': 5, 'dad': 329}"
mom,109,"{'mom': 107, 'moms': 2}"
mother,97,"{'mother': 95, 'mothers': 2}"
grandfather,87,"{'grandfather': 75, 'grandfathers': 12}"
parents,64,"{'parents': 58, 'parent': 6}"
root,63,"{'root': 39, 'roots': 24}"
grandmother,63,"{'grandmothers': 4, 'grandmother': 59}"
father,61,"{'father': 55, 'fathers': 6}"
grandparents,42,"{'grandparent': 1, 'grandparents': 41}"
jacobs,28,"{'jacob': 2, 'jacobs': 26}"




******************************************
* CATEGORY: medicine - 18 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
acoustic,3,{}
antidote,1,{}
application,5,{}
blue,16,{}
cure,7,{}
draft,2,{}
hit,49,"{'hit': 47, 'hits': 2}"
medication,1,{}
medicine,143,"{'medicine': 74, 'medicines': 69}"
pills,1,{}



Sorted by descending count:


Unnamed: 0,count,word forms
medicine,143,"{'medicine': 74, 'medicines': 69}"
hit,49,"{'hit': 47, 'hits': 2}"
specific,40,{}
remedies,35,"{'remedies': 18, 'remedy': 17}"
blue,16,{}
cure,7,{}
tonic,5,"{'tonics': 2, 'tonic': 3}"
application,5,{}
powder,4,{}
prescription,3,{}




******************************************
* CATEGORY: water - 31 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
bay,75,"{'bays': 2, 'bay': 73}"
bkejwanong,5,{}
branches,22,"{'branch': 6, 'branches': 16}"
canal,2,{}
channel,20,"{'channels': 1, 'channel': 19}"
creek,36,"{'creeks': 8, 'creek': 28}"
crossing,1,{}
deep,28,{}
ditches,61,"{'ditches': 33, 'ditch': 28}"
drink,48,"{'drinks': 2, 'drink': 46}"



Sorted by descending count:


Unnamed: 0,count,word forms
water,529,"{'waters': 33, 'water': 496}"
marsh,291,"{'marshes': 56, 'marsh': 235}"
river,199,"{'rivers': 17, 'river': 182}"
lake,130,"{'lake': 109, 'lakes': 21}"
run,85,"{'run': 72, 'runs': 13}"
bay,75,"{'bays': 2, 'bay': 73}"
ditches,61,"{'ditches': 33, 'ditch': 28}"
main,54,{}
sounds,50,"{'sounds': 30, 'sound': 20}"
pond,50,"{'ponds': 14, 'pond': 36}"




******************************************
* CATEGORY: other - 3 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
blueberry moon,0,{}
moon,8,"{'moon': 7, 'moons': 1}"
strawberry moon,0,{}



Sorted by descending count:


Unnamed: 0,count,word forms
moon,8,"{'moon': 7, 'moons': 1}"
blueberry moon,0,{}
strawberry moon,0,{}




******************************************
* CATEGORY: treaty - 4 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
alliance,3,{}
conventions,2,{}
peace,2,{}
treaty,33,"{'treaties': 11, 'treaty': 22}"



Sorted by descending count:


Unnamed: 0,count,word forms
treaty,33,"{'treaties': 11, 'treaty': 22}"
alliance,3,{}
conventions,2,{}
peace,2,{}




******************************************
* CATEGORY: animal - 248 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
abalone,1,{}
adults,26,"{'adult': 11, 'adults': 15}"
alligator,3,{}
amphibian,2,"{'amphibian': 1, 'amphibians': 1}"
animals,93,"{'animals': 74, 'animal': 19}"
ant,13,"{'ants': 6, 'ant': 7}"
assess,3,{}
baby,40,"{'babies': 11, 'baby': 29}"
bass,38,{}
basset,7,{}



Sorted by descending count:


Unnamed: 0,count,word forms
fish,450,"{'fishes': 1, 'fish': 449}"
kids,352,"{'kid': 87, 'kids': 265}"
ducks,322,"{'ducks': 185, 'duck': 137}"
muskrat,246,"{'muskrat': 141, 'muskrats': 105}"
horses,207,"{'horses': 131, 'horse': 76}"
deer,163,{}
young,127,{}
man,122,"{'mans': 3, 'man': 119}"
bee,103,"{'bee': 74, 'bees': 29}"
animals,93,"{'animals': 74, 'animal': 19}"




******************************************
* CATEGORY: plant - 163 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
annual,11,{}
apples,89,"{'apples': 59, 'apple': 30}"
aquatic,4,{}
arbor,2,"{'arbor': 1, 'arbors': 1}"
ash,77,"{'ash': 74, 'ashes': 3}"
asparagus,2,{}
bamboo,8,"{'bamboo': 7, 'bamboos': 1}"
bananas,1,{}
barley,1,{}
basswood,11,{}



Sorted by descending count:


Unnamed: 0,count,word forms
bush,353,"{'bushes': 21, 'bush': 332}"
trees,261,"{'trees': 151, 'tree': 110}"
plants,154,"{'plants': 94, 'plant': 60}"
corn,115,"{'corns': 1, 'corn': 114}"
grass,111,"{'grasses': 4, 'grass': 107}"
elders,90,"{'elders': 76, 'elder': 14}"
apples,89,"{'apples': 59, 'apple': 30}"
potatoes,88,"{'potato': 18, 'potatoes': 70}"
sweet grass,84,"{'sweetgrass': 12, 'sweet grass': 72}"
ash,77,"{'ash': 74, 'ashes': 3}"




******************************************
* CATEGORY: ceremony - 21 different words.
******************************************

Alphabetical order:


Unnamed: 0,count,word forms
burial,10,"{'burial': 7, 'burials': 3}"
ceremonial,6,{}
ceremony,17,"{'ceremonies': 5, 'ceremony': 12}"
church,67,"{'churches': 8, 'church': 59}"
circumstance,4,"{'circumstances': 2, 'circumstance': 2}"
dedication,3,{}
exercise,20,"{'exercise': 18, 'exercises': 2}"
fasting ceremony,0,{}
funeral,9,"{'funerals': 4, 'funeral': 5}"
immersion,7,{}



Sorted by descending count:


Unnamed: 0,count,word forms
church,67,"{'churches': 8, 'church': 59}"
watch,53,{}
exercise,20,"{'exercise': 18, 'exercises': 2}"
opening,20,{}
ceremony,17,"{'ceremonies': 5, 'ceremony': 12}"
office,16,"{'office': 13, 'offices': 3}"
none,14,{}
service,14,"{'service': 9, 'services': 5}"
weddings,12,"{'weddings': 6, 'wedding': 6}"
burial,10,"{'burial': 7, 'burials': 3}"




NOT FOUND WORDS:
blueberry moon - blueberry - 0
blueberry moon - moon - 28
strawberry moon - strawberry - 11
strawberry moon - moon - 28
meadow - meadow - 0
pin weed - pin - 245
pin weed - weed - 98
birch - birch - 0
horsetail - horsetail - 0
chestnuts - chestnuts - 0
fasting ceremony - fasting - 1
fasting ceremony - ceremony - 12


In [46]:
cleaned_all_text.count('lake st. clair')

9

In [48]:
cleaned_all_text[:4000]

'ok. recording sitting aimee johnson talking today customary law project involved worked heritage centre attached explaining yesterday similar theme direction environmental policy future based concerns knowledge foresight thoughts might better shape going happen future right ok. first discussions already considerable thought traditions customs connected anishinaabeg territory lands waters experience knowledge people related territory positive way way enriching ways guess… messed question laughs ask let think better way ask completely sucked going anyway good job pro transcribe right anyway customs aware connect people land waters would like see way continue connect people land waters hmm well think something seen really really diminished concept use storytelling actually passing knowledge hunch able find able find direct evidence quite time stories stories legends clearly used teaching tool probably also really deep scientific knowledge processes life work world interacts people intera

In [44]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [54]:
count_sublists('lake st. clair'.split(), cleaned_all_text.split())

8

### Proper names

In [None]:
all_interviewees_names_dict = {
    'Aimee': 'Aimee Johnson',
    'Aimee Johnson': 'Aimee Johnson',
#     'Andrew': 'Andrew Peters',
#     'Andrew Peters': 'Andrew Peters',
    'Anita': 'Anita Smith',
    'Anita Smith': 'Anita Smith',
    'Apollo': 'Apollo Blackeagle',
    'Apollo Blackeagle': 'Apollo Blackeagle',
    'Aquash': 'Mickey Aquash',
    'Archie': 'Archie',
    'Baxter': 'Eli Baxter',
    'Becky': 'Becky',
    'Bill': 'Bill Sands',
    'Bill Sands': 'Bill Sands',
    'Blackbird': 'Jennie Blackbird',
    'Blackeagle': 'Apollo Blackeagle',
    'Brenda': 'Brenda Wheat',
    'Brenda Wheat': 'Brenda Wheat',
    'Cal': 'Cal',
    'Cameron': 'Cameron',
    'Carl': 'Carl Smith',
    'Carl Smith (Resource Protection Officer)': 'Carl Smith',
    'Carmen': 'Carmen Wrightman',
    'Carmen Wrightman': 'Carmen Wrightman',
#     'Carrie': 'Carrie Isaac',
#     'Carrie Isaac': 'Carrie Isaac',
#     'Charles': 'Charles Wright',
#     'Charles Wright': 'Charles Wright',
    'Cheryl': 'Cheryl',
    'Chief': 'Chief Gilbert',
    'Chief Gilbert': 'Chief Gilbert',
    'Chief Joseph Gilbert': 'Chief Gilbert',
    'Chris': 'Chris Riley',
    'Chris Riley': 'Chris Riley',
    'Daniel': 'Daniel',
#     'Darren': 'Darren',
    'Day': '??? Day',
    'Dean': 'Dean Jacobs',
    'Dean Jacobs': 'Dean Jacobs',
    'Dot': 'Dot Peters',
    'Dot Peters': 'Dot Peters',
#     'Doug': 'Doug',
#     'Doug (Resource Protection Officer)': 'Doug',
    'Elaine': 'Elaine Jacobs',
    'Elaine Jacobs': 'Elaine Jacobs',
    'Eli': 'Eli Baxter',
    'Eli Baxter': 'Eli Baxter',
    'Eliza': 'Eliza John',
    'Eliza John': 'Eliza John',
    'Eric': 'Eric Isaac',
    'Eric Isaac': 'Eric Isaac',
#     'Frank': 'Frank',
#     'Georgina': 'Georgina',
    'Gilbert': 'Chief Gilbert',
    'Greg': 'Greg Isaac',
    'Greg Isaac': 'Greg Isaac',
    'Gus': 'Gus',
    'Harold': 'Harold Peters',
    'Harold Peters': 'Harold Peters',
    'Hoeksma': 'Mel Hoeksma',
    'Isaac': '??? Isaac',
    'Isabelle': 'Isabelle',
    'Jacobs': '??? Jacobs',
#     'Jane': 'Jane Jacobs',
#     'Jane Jacobs': 'Jane Jacobs',
    'Jasper': 'Jasper John',
    'Jasper John': 'Jasper John',
    'Jean': 'Jean Wrightman',
    'Jean Wrightman': 'Jean Wrightman',
    'Jen': 'Jennie Blackbird',
    'Jennie': 'Jennie Blackbird',
    'Jennie Blackbird': 'Jennie Blackbird',
#     'Jerome': 'Jerome',
#     'Jerry': 'Jerry',
    'Jessica': 'Jessica',
    'Joanne': 'Joanne Day',
    'Joanne Day': 'Joanne Day',
#     'Joe': 'Joe Isaac',
#     'Joe Isaac': 'Joe Isaac',
    'John': 'John',
    'Johnson': '??? Johnson',
    'Jones': '??? Jones',
    'Joseph': 'Chief Gilbert',
#     'Julia': 'Julia',
    'Karen': 'Karen Lalleen',
    'Karen Lalleen': 'Karen Lalleen',
    'Kenneth': 'Kenneth',
    'Kennon': 'Kennon Johnson',
    'Kennon Johnson': 'Kennon Johnson',
    'Kevin': 'Kevin Smith',
    'Kevin Smith': 'Kevin Smith',
    'Lalleen': 'Karen Lalleen',
    'Lee': 'Lee White',
    'Lee White': 'Lee White',
    'Linda': 'Linda White',
    'Linda White': 'Linda White',
    'Liz': 'Lizzie Isaac',
    'Lizzie': 'Lizzie Isaac',
    'Lizzie Isaac': 'Lizzie Isaac',
    'Lloyd': 'Lloyd Day',
    'Lloyd Day': 'Lloyd Day',
    'Lois': 'Lois Wrightman',
    'Lois Wrightman': 'Lois Wrightman',
    'Lyndsay': 'Lyndsay Sword',
    'Lyndsay Sword': 'Lyndsay Sword',
#     'Mark': 'Mark',
    'Mel': 'Mel Hoeksma',
    'Mel Hoeksma': 'Mel Hoeksma',
    'Mickey': 'Mickey Aquash',
    'Mickey Aquash': 'Mickey Aquash',
#     'Morris': 'Morris Wrightman',
#     'Morris Wrightman': 'Morris Wrightman',
    'Myrna': 'Myrna',
    'Naomi': 'Naomi Williams',
    'Naomi Williams': 'Naomi Williams',
    'PD': 'Puppydog',
    'Pat': 'Pat Riley',
    'Pat Riley': 'Pat Riley',
    'Patricia': 'Patricia',
    'Patty': 'Patty Isaac',
    'Patty Isaac': 'Patty Isaac',
#     'Paul': 'Paul',
    'Peters': '??? Peters',
    'Puppydog': 'Puppydog',
    'Rachel': 'Rachel',
    'Ralph': 'Ralph ???',
    'Ralph Johnson': 'Ralph Johnson',
    'Ralph Jones': 'Ralph Jones',
    'Riley': '??? Riley',
    'Rita': 'Rita Sands',
    'Rita Sands': 'Rita Sands',
#     'Ron': 'Ron',
    'Rose': 'Rose',
    'Sands': '??? Sands',
#     'Sarah': 'Sarah',
    'Shirley': 'Shirley',
    'Smith': '??? Smith',
    'Stanley': 'Stanley',
    'Stuart': 'Stuart',
    'Suzie': 'Suzie ???',
    'Suzie Isaac': 'Suzie Isaac',
    'Suzie Jones': 'Suzie Jones',
    'Sword': 'Lyndsay Sword',
    'Terry': 'Terry Sands',
    'Terry Sands': 'Terry Sands',
#     'Tom': 'Tom',
    'Vernon': 'Vernon Jones',
    'Vernon Jones': 'Vernon Jones',
    'Wheat': 'Brenda Wheat',
    'White': '??? White',
    'Williams': 'Naomi Williams',
#     'Wright': 'Charles Wright',
    'Wrightman': '??? Wrightman',
}

In [None]:
tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]

#### People

In [None]:
people_counter = defaultdict(int)
for n1, n2, data in people.edges_iter(data=True):
    people_counter[n2] += int(data['weight'])

In [None]:
people_counter = {'people': Counter(people_counter)}

In [None]:
people_counter

In [None]:
len(all_found_terms)

In [None]:
all_text.count('Vernon Jones')

In [None]:
all_text_lower.count('Vernon Jones'.lower())

In [None]:
all_text.count('Vernon')

In [None]:
all_text_lower.count('Vernon'.lower())

In [None]:
c=d=e=0
for w in people_counter['people']:
    c1 = people_counter['people'][w]
    if len(w.split()) > 1:
        c2 = all_text_lower.count(w.lower())
    else:
        c2 = all_text.count(w)
    if c1==c2:
        c+=1
    else:
        d+=1
        print(w, c1, c2)

In [None]:
c

In [None]:
d

In [None]:
e

#### Others

In [None]:
others_counter = defaultdict(int)
for n1, n2, data in others.edges_iter(data=True):
    others_counter[n2] += int(data['weight'])

In [None]:
others_counter = {'other': Counter(others_counter)}

In [None]:
others_counter

In [None]:
len(all_found_terms)

In [None]:
# others
Buscar tal cual en el texto

### Most frequent words

In [None]:
most_freq_words

In [None]:
most_freq_words_counter = defaultdict(dict)
for pos in most_freq_words:
    group = most_freq_words[pos]
    for w in group:
        if w not in all_found_terms:
            sss = wn.synsets(w)
            for ss in sss:
                intersect = common_synsets(ss, synsets)
                if intersect:
                    all_found_terms.add(w)
                    for ssi in intersect:
                        category = inverse_concepts[ssi.name()]
                        if category == 'ancestor':
                            # In interpersonal relationships, it is intersting to see if they talk about
                            # their mother/father (singular, their own) mothers/fathers (plural, everyone's in general).
                            # Also, if they say mother/father (more formal) or mom/dad (more familiar).
                            term = w # original word in the transcriptions
                        else:
                            term = ss.name().split('.')[0] # generic concept
                        most_freq_words_counter[category][term] = most_freq_words[pos][w]
                else:
                    most_freq_words_counter['other'][w] = most_freq_words[pos][w] # original word in 'other'

In [None]:
for k in most_freq_words_counter:
    most_freq_words_counter[k] = Counter(most_freq_words_counter[k])

In [None]:
most_freq_words_counter

In [None]:
len(all_found_terms)