# Transcripts

In [61]:
import pickle
from collections import Counter
from collections import defaultdict

import networkx as nx
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

## Loading data

In [2]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [4]:
transcripts_df.shape

(50, 5)

In [5]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


## Processing data

In [6]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

In [7]:
with open('data/out/all_text.txt', 'w') as f:
    f.write(all_text)

### Keywords

In [8]:
MIN_LENGTH = 2

#### All

In [9]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stopwords.words('english') and len(token) > MIN_LENGTH]
    text = ' '.join(words)
    return text

In [10]:
%%time

cleaned_all_text = clean_text(all_text)

CPU times: user 1min 1s, sys: 3.86 s, total: 1min 5s
Wall time: 1min 5s


In [11]:
all_words = cleaned_all_text.split()

In [12]:
counter_all = Counter(all_words)

In [13]:
counter_all.most_common(100)

[('yea', 2440),
 ('like', 2210),
 ('would', 1934),
 ('know', 1891),
 ('people', 1362),
 ('used', 1308),
 ('yeah', 1307),
 ('hmm', 1243),
 ('one', 1236),
 ('get', 1151),
 ('remember', 1135),
 ('time', 1088),
 ('think', 1026),
 ('well', 1013),
 ('back', 1012),
 ('things', 985),
 ('got', 953),
 ('way', 870),
 ('going', 868),
 ('lot', 857),
 ('right', 848),
 ('could', 735),
 ('use', 693),
 ('see', 672),
 ('something', 658),
 ('say', 623),
 ('good', 603),
 ('come', 600),
 ('around', 569),
 ('thing', 560),
 ('said', 512),
 ('stuff', 499),
 ('water', 496),
 ('went', 494),
 ('laughs', 492),
 ('take', 473),
 ('always', 472),
 ('really', 469),
 ('even', 466),
 ('community', 461),
 ('little', 459),
 ('fish', 449),
 ('kind', 439),
 ('put', 437),
 ('big', 437),
 ('make', 390),
 ('never', 383),
 ('want', 375),
 ('still', 361),
 ('maybe', 357),
 ('recall', 355),
 ('hunting', 353),
 ('anything', 345),
 ('old', 345),
 ('years', 343),
 ('bush', 332),
 ('dad', 329),
 ('much', 327),
 ('area', 317),
 ('eve

#### By POS tag

In [14]:
%%time

tagged_all_text = pos_tag(word_tokenize(all_text))

CPU times: user 28.7 s, sys: 88 ms, total: 28.8 s
Wall time: 28.8 s


##### Adjectives

In [15]:
%%time

adjectives = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > MIN_LENGTH and pos.startswith('J')]

CPU times: user 55.5 s, sys: 3.85 s, total: 59.3 s
Wall time: 59.4 s


In [16]:
counter_adjs = Counter(adjectives)

In [17]:
counter_adjs.most_common(100)

[('good', 593),
 ('little', 444),
 ('big', 428),
 ('old', 332),
 ('different', 307),
 ('much', 242),
 ('many', 229),
 ('whole', 191),
 ('right', 181),
 ('long', 176),
 ('first', 168),
 ('fish', 152),
 ('last', 139),
 ('able', 129),
 ('next', 126),
 ('certain', 125),
 ('young', 124),
 ('great', 120),
 ('indian', 118),
 ('particular', 111),
 ('wild', 101),
 ('sure', 97),
 ('real', 91),
 ('high', 90),
 ('environmental', 85),
 ('important', 83),
 ('sweet', 82),
 ('inaudible', 80),
 ('hard', 79),
 ('new', 76),
 ('nice', 76),
 ('black', 73),
 ('white', 71),
 ('small', 71),
 ('traditional', 68),
 ('younger', 66),
 ('bad', 66),
 ('uh…', 63),
 ('open', 57),
 ('natural', 56),
 ('customary', 56),
 ('older', 55),
 ('commercial', 55),
 ('main', 54),
 ('full', 53),
 ('best', 50),
 ('yea', 50),
 ('better', 49),
 ('common', 49),
 ('past', 46),
 ('native', 45),
 ('and…', 44),
 ('invasive', 44),
 ('red', 43),
 ('public', 42),
 ('social', 41),
 ('fresh', 40),
 ('early', 40),
 ('dry', 39),
 ('late', 38),


##### Nouns

In [18]:
%%time

nouns = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > MIN_LENGTH and pos.startswith('N')]

CPU times: user 55.6 s, sys: 3.74 s, total: 59.3 s
Wall time: 59.4 s


In [19]:
counter_nouns = Counter(nouns)

In [20]:
counter_nouns.most_common(100)

[('yea', 1734),
 ('people', 1358),
 ('hmm', 1231),
 ('time', 1088),
 ('things', 985),
 ('way', 870),
 ('lot', 857),
 ('something', 642),
 ('thing', 560),
 ('water', 496),
 ('community', 461),
 ('stuff', 458),
 ('kind', 439),
 ('well', 346),
 ('years', 343),
 ('anything', 335),
 ('bush', 320),
 ('area', 317),
 ('island', 309),
 ('dad', 305),
 ('today', 299),
 ('everything', 280),
 ('day', 275),
 ('kids', 259),
 ('everybody', 259),
 ('house', 258),
 ('year', 257),
 ('land', 256),
 ('home', 242),
 ('family', 242),
 ('walpole', 242),
 ('road', 241),
 ('place', 239),
 ('did', 236),
 ('fish', 228),
 ('food', 226),
 ('right', 210),
 ('stories', 209),
 ('marsh', 202),
 ('part', 202),
 ('days', 201),
 ('somebody', 201),
 ('environment', 201),
 ('yeah', 200),
 ('name', 198),
 ('money', 195),
 ('guys', 187),
 ('laughs', 187),
 ('person', 184),
 ('river', 180),
 ('ducks', 179),
 ('wood', 172),
 ('use', 166),
 ('school', 166),
 ('hunting', 161),
 ('times', 160),
 ('areas', 159),
 ('hall', 158),
 ('

##### Verbs

In [21]:
%%time

verbs = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > MIN_LENGTH and pos.startswith('V')]

CPU times: user 55.3 s, sys: 4 s, total: 59.3 s
Wall time: 59.3 s


In [22]:
counter_verbs = Counter(verbs)

In [23]:
counter_verbs.most_common(100)

[('know', 1867),
 ('used', 1307),
 ('remember', 1089),
 ('get', 1075),
 ('think', 992),
 ('got', 942),
 ('going', 868),
 ('see', 655),
 ('say', 616),
 ('come', 552),
 ('use', 516),
 ('said', 512),
 ('went', 492),
 ('take', 468),
 ('put', 426),
 ('make', 382),
 ('want', 371),
 ('recall', 327),
 ('laughs', 289),
 ('came', 259),
 ('made', 251),
 ('told', 212),
 ('coming', 206),
 ('look', 205),
 ('done', 201),
 ('getting', 197),
 ('guess', 197),
 ('bring', 195),
 ('need', 192),
 ('took', 190),
 ('hunting', 189),
 ('tell', 186),
 ('eat', 176),
 ('knew', 173),
 ('talking', 172),
 ('help', 169),
 ('give', 168),
 ('live', 163),
 ('call', 154),
 ('work', 152),
 ('keep', 147),
 ('lived', 143),
 ('says', 143),
 ('wanted', 140),
 ('talk', 140),
 ('thought', 138),
 ('start', 137),
 ('seen', 136),
 ('find', 129),
 ('mean', 128),
 ('goes', 127),
 ('gone', 126),
 ('looking', 126),
 ('called', 125),
 ('hunt', 125),
 ('hear', 123),
 ('happened', 122),
 ('cut', 119),
 ('taking', 114),
 ('saying', 114),
 

### Graphs

In [24]:
def first_names(names):
    return [x.split()[0] for x in names]

In [25]:
def last_names(names):
    r = []
    for x in names:
        if '(' in x:
            x = x.split('(')[0].strip()
        xs = x.split()
        if len(xs) > 1:
            ys = xs[1:]
            r.extend(ys)
    return r

In [26]:
interviewees = list(set(y for x in transcripts_df.INTERVIEWEES for y in x))

In [27]:
interviewees_first_names = first_names(interviewees)

In [28]:
interviewees_last_names = last_names(interviewees)

In [29]:
aliases = list(set(z for x in transcripts_df.ALIASES for y in x for z in y))

In [30]:
all_interviewees_names = set(interviewees + interviewees_first_names + interviewees_last_names + aliases)

In [31]:
sorted(all_interviewees_names)

['Aimee',
 'Aimee Johnson',
 'Anita',
 'Anita Smith',
 'Apollo',
 'Apollo Blackeagle',
 'Aquash',
 'Archie',
 'Baxter',
 'Becky',
 'Bill',
 'Bill Sands',
 'Blackbird',
 'Blackeagle',
 'Brenda',
 'Brenda Wheat',
 'Cal',
 'Cameron',
 'Carl',
 'Carl Smith (Resource Protection Officer)',
 'Carmen',
 'Carmen Wrightman',
 'Cheryl',
 'Chief',
 'Chief Gilbert',
 'Chief Joseph Gilbert',
 'Chris',
 'Chris Riley',
 'Daniel',
 'Day',
 'Dean',
 'Dean Jacobs',
 'Dot',
 'Dot Peters',
 'Elaine',
 'Elaine Jacobs',
 'Eli',
 'Eli Baxter',
 'Eliza',
 'Eliza John',
 'Eric',
 'Eric Isaac',
 'Gilbert',
 'Greg',
 'Greg Isaac',
 'Gus',
 'Harold',
 'Harold Peters',
 'Hoeksma',
 'Isaac',
 'Isabelle',
 'Jacobs',
 'Jasper',
 'Jasper John',
 'Jean',
 'Jean Wrightman',
 'Jen',
 'Jennie',
 'Jennie Blackbird',
 'Jessica',
 'Joanne',
 'Joanne Day',
 'John',
 'Johnson',
 'Jones',
 'Joseph',
 'Karen',
 'Karen Lalleen',
 'Kenneth',
 'Kennon',
 'Kennon Johnson',
 'Kevin',
 'Kevin Smith',
 'Lalleen',
 'Lee',
 'Lee White',
 

In [32]:
all_interviewees_names_dict = {
    'Aimee': 'Aimee Johnson',
    'Aimee Johnson': 'Aimee Johnson',
#     'Andrew': 'Andrew Peters',
#     'Andrew Peters': 'Andrew Peters',
    'Anita': 'Anita Smith',
    'Anita Smith': 'Anita Smith',
    'Apollo': 'Apollo Blackeagle',
    'Apollo Blackeagle': 'Apollo Blackeagle',
    'Aquash': 'Mickey Aquash',
    'Archie': 'Archie',
    'Baxter': 'Eli Baxter',
    'Becky': 'Becky',
    'Bill': 'Bill Sands',
    'Bill Sands': 'Bill Sands',
    'Blackbird': 'Jennie Blackbird',
    'Blackeagle': 'Apollo Blackeagle',
    'Brenda': 'Brenda Wheat',
    'Brenda Wheat': 'Brenda Wheat',
    'Cal': 'Cal',
    'Cameron': 'Cameron',
    'Carl': 'Carl Smith',
    'Carl Smith (Resource Protection Officer)': 'Carl Smith',
    'Carmen': 'Carmen Wrightman',
    'Carmen Wrightman': 'Carmen Wrightman',
#     'Carrie': 'Carrie Isaac',
#     'Carrie Isaac': 'Carrie Isaac',
#     'Charles': 'Charles Wright',
#     'Charles Wright': 'Charles Wright',
    'Cheryl': 'Cheryl',
    'Chief': 'Chief Gilbert',
    'Chief Gilbert': 'Chief Gilbert',
    'Chief Joseph Gilbert': 'Chief Gilbert',
    'Chris': 'Chris Riley',
    'Chris Riley': 'Chris Riley',
    'Daniel': 'Daniel',
#     'Darren': 'Darren',
    'Day': '??? Day',
    'Dean': 'Dean Jacobs',
    'Dean Jacobs': 'Dean Jacobs',
    'Dot': 'Dot Peters',
    'Dot Peters': 'Dot Peters',
#     'Doug': 'Doug',
#     'Doug (Resource Protection Officer)': 'Doug',
    'Elaine': 'Elaine Jacobs',
    'Elaine Jacobs': 'Elaine Jacobs',
    'Eli': 'Eli Baxter',
    'Eli Baxter': 'Eli Baxter',
    'Eliza': 'Eliza John',
    'Eliza John': 'Eliza John',
    'Eric': 'Eric Isaac',
    'Eric Isaac': 'Eric Isaac',
#     'Frank': 'Frank',
#     'Georgina': 'Georgina',
    'Gilbert': 'Chief Gilbert',
    'Greg': 'Greg Isaac',
    'Greg Isaac': 'Greg Isaac',
    'Gus': 'Gus',
    'Harold': 'Harold Peters',
    'Harold Peters': 'Harold Peters',
    'Hoeksma': 'Mel Hoeksma',
    'Isaac': '??? Isaac',
    'Isabelle': 'Isabelle',
    'Jacobs': '??? Jacobs',
#     'Jane': 'Jane Jacobs',
#     'Jane Jacobs': 'Jane Jacobs',
    'Jasper': 'Jasper John',
    'Jasper John': 'Jasper John',
    'Jean': 'Jean Wrightman',
    'Jean Wrightman': 'Jean Wrightman',
    'Jen': 'Jennie Blackbird',
    'Jennie': 'Jennie Blackbird',
    'Jennie Blackbird': 'Jennie Blackbird',
#     'Jerome': 'Jerome',
#     'Jerry': 'Jerry',
    'Jessica': 'Jessica',
    'Joanne': 'Joanne Day',
    'Joanne Day': 'Joanne Day',
#     'Joe': 'Joe Isaac',
#     'Joe Isaac': 'Joe Isaac',
    'John': 'John',
    'Johnson': '??? Johnson',
    'Jones': '??? Jones',
    'Joseph': 'Chief Gilbert',
#     'Julia': 'Julia',
    'Karen': 'Karen Lalleen',
    'Karen Lalleen': 'Karen Lalleen',
    'Kenneth': 'Kenneth',
    'Kennon': 'Kennon Johnson',
    'Kennon Johnson': 'Kennon Johnson',
    'Kevin': 'Kevin Smith',
    'Kevin Smith': 'Kevin Smith',
    'Lalleen': 'Karen Lalleen',
    'Lee': 'Lee White',
    'Lee White': 'Lee White',
    'Linda': 'Linda White',
    'Linda White': 'Linda White',
    'Liz': 'Lizzie Isaac',
    'Lizzie': 'Lizzie Isaac',
    'Lizzie Isaac': 'Lizzie Isaac',
    'Lloyd': 'Lloyd Day',
    'Lloyd Day': 'Lloyd Day',
    'Lois': 'Lois Wrightman',
    'Lois Wrightman': 'Lois Wrightman',
    'Lyndsay': 'Lyndsay Sword',
    'Lyndsay Sword': 'Lyndsay Sword',
#     'Mark': 'Mark',
    'Mel': 'Mel Hoeksma',
    'Mel Hoeksma': 'Mel Hoeksma',
    'Mickey': 'Mickey Aquash',
    'Mickey Aquash': 'Mickey Aquash',
#     'Morris': 'Morris Wrightman',
#     'Morris Wrightman': 'Morris Wrightman',
    'Myrna': 'Myrna',
    'Naomi': 'Naomi Williams',
    'Naomi Williams': 'Naomi Williams',
    'PD': 'Puppydog',
    'Pat': 'Pat Riley',
    'Pat Riley': 'Pat Riley',
    'Patricia': 'Patricia',
    'Patty': 'Patty Isaac',
    'Patty Isaac': 'Patty Isaac',
#     'Paul': 'Paul',
    'Peters': '??? Peters',
    'Puppydog': 'Puppydog',
    'Rachel': 'Rachel',
    'Ralph': 'Ralph ???',
    'Ralph Johnson': 'Ralph Johnson',
    'Ralph Jones': 'Ralph Jones',
    'Riley': '??? Riley',
    'Rita': 'Rita Sands',
    'Rita Sands': 'Rita Sands',
#     'Ron': 'Ron',
    'Rose': 'Rose',
    'Sands': '??? Sands',
#     'Sarah': 'Sarah',
    'Shirley': 'Shirley',
    'Smith': '??? Smith',
    'Stanley': 'Stanley',
    'Stuart': 'Stuart',
    'Suzie': 'Suzie ???',
    'Suzie Isaac': 'Suzie Isaac',
    'Suzie Jones': 'Suzie Jones',
    'Sword': 'Lyndsay Sword',
    'Terry': 'Terry Sands',
    'Terry Sands': 'Terry Sands',
#     'Tom': 'Tom',
    'Vernon': 'Vernon Jones',
    'Vernon Jones': 'Vernon Jones',
    'Wheat': 'Brenda Wheat',
    'White': '??? White',
    'Williams': 'Naomi Williams',
#     'Wright': 'Charles Wright',
    'Wrightman': '??? Wrightman',
}

In [33]:
grammar = 'PROPER_NOUN: {<NNP>+}'
cp = nltk.RegexpParser(grammar)

#### People

In [34]:
%%time

g = nx.DiGraph()
interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            if name not in interviewers_names:
                name = all_interviewees_names_dict[name]
                if not g.has_node(name):
                    g.add_node(name)
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if all_interviewees_names.intersection(names):
                            proper_noun = ' '.join(names)
                            if proper_noun in all_interviewees_names_dict and \
                               '???' not in all_interviewees_names_dict[proper_noun]:
                                proper_noun = all_interviewees_names_dict[proper_noun]
                            if proper_noun != name:
                                if not g.has_edge(name, proper_noun):
                                    g.add_edge(name, proper_noun, weight=0)
                                g[name][proper_noun]['weight'] += 1

CPU times: user 22.4 s, sys: 96 ms, total: 22.5 s
Wall time: 22.5 s


#### Other

In [35]:
with open('data/wordsEn.txt') as f:
    english_words = set(word.strip().lower() for word in f)

In [94]:
def clean_ellipsis(text):
    text = text.replace('…', ' ')
    return text

In [94]:
def clean_words(words):
    cleaned_words = []
    for w in words:
        if word.endswith('…'):
            w = w[:-1]
    cleaned_words.append(w)
    
    return text

def clean_name(name):
    return ' '.join([clean_word(w) for w in name.split()])

def test2(name):
    return len(name) > 2 and not name.isupper() and test1(name.split())

def test1(names):
    return
        names and \
        not all_interviewees_names.intersection(names) and \
        (len(names) > 1 or names[0].lower() not in english_words)

In [100]:
' '.join(['','','','x','y'])

'   x y'

In [95]:
%%time

h = nx.DiGraph()
interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            if name not in interviewers_names:
                name = all_interviewees_names_dict[name]
                if not h.has_node(name):
                    h.add_node(name, type='interviewee')
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if test1(names):
                            proper_noun = ' '.join(names)
                            proper_noun = clean_name(proper_noun)
                            if test2(proper_noun):
                                if proper_noun != name:
                                    if not h.has_node(proper_noun):
                                        h.add_node(proper_noun, type='other')
                                    if not h.has_edge(name, proper_noun):
                                        h.add_edge(name, proper_noun, weight=0)
                                    h[name][proper_noun]['weight'] += 1

CPU times: user 22.1 s, sys: 80 ms, total: 22.2 s
Wall time: 22.2 s


In [None]:
black_list = [
    '-yea-', 'A-O-K', 'A.D.D', 'AA', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 
    'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 
    'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 
    'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 'qqq', 
    
]

In [96]:
nodes_deg_dict = defaultdict(int)
for n, data in h.nodes_iter(data=True):
    if data['type'] == 'other':
#         name = clean_name(n)
#         if test(name):
        nodes_deg_dict[n] += h.degree(n)
nodes_deg_dict = dict(nodes_deg_dict)

In [97]:
len(nodes_deg_dict)

922

In [98]:
for k, v in sorted(nodes_deg_dict.items()):
    print(k, '-', v)

-yea- - 2
Aambe - 1
Aboriginal Title Claim - 1
Ackland - 3
Adam Calicut - 2
Agricultural Drainage Ditches - 1
Aileen - 1
Akii-Kwe - 1
Aklyn - 2
Alan Deleary - 1
Alan Delery - 1
Albany River - 1
Albert Kewayosh - 1
Aldra Shipman - 1
Aldrin Dodge - 2
Alec - 1
Alex Rhodes - 1
Alex Shipman - 1
Alfred Soney - 1
Algonac - 14
Alice Brigham - 2
Alice Warner - 1
All-righty - 2
Allen Kiyoshk - 1
Altiman - 4
Altiman Rd - 1
Altiman Road - 3
Altman - 1
Alue - 1
Alyssa - 1
Alyssa Murphy - 1
Amherstberg - 1
Amherstburg - 1
Anchor Bay - 1
Anderson Lodge - 1
Andrew Medlar - 1
Andrew Miller - 1
Angie - 1
Anglican Church - 7
Anglican Hall - 1
Anglican Parish - 1
Anglican Parish Hall - 1
Anishi-zhewago - 1
Anishibaabeg Maatisowin - 1
Anishinaabe - 3
Anishinaabeg - 3
Anishinaabemowin - 2
Anishinaabg - 1
Anishinaabge - 1
Anishna - 1
Anishnaabe - 3
Anishnabe - 2
Anziiyaan - 1
Arlita Nahdee - 1
Art Miskokomon - 1
Ashkebee - 1
Asnaa - 1
Atrazine - 1
Atsokan - 1
Attapiskatt First Nation - 1
Attawapiskatt - 1
Au

In [99]:


SyntaxError: invalid character in identifier (<ipython-input-99-a5cc69ddca5d>, line 1)

In [53]:
len(sorted(nodes))

1022

In [58]:
Counter([('x',2),('x',1)])

Counter({('x', 1): 1, ('x', 2): 1})

In [60]:
sum(('x',2),('x',1))

TypeError: can only concatenate tuple (not "str") to tuple

In [55]:
'A-O-K'.isupper()

False

In [46]:
sorted(nodes)

[('-yea-', 2),
 ('A-O-K', 2),
 ('A.D.D', 1),
 ('AA', 1),
 ('AHHHH', 1),
 ('ANCC', 1),
 ('Aambe', 1),
 ('Aboriginal Title Claim', 1),
 ('Ackland', 3),
 ('Adam Calicut', 2),
 ('Agricultural Drainage Ditches', 1),
 ('Aileen', 1),
 ('Akii-Kwe', 1),
 ('Aklyn', 2),
 ('Alan Deleary', 1),
 ('Alan Delery', 1),
 ('Albany River', 1),
 ('Albert Kewayosh', 1),
 ('Aldra Shipman…', 1),
 ('Aldrin Dodge', 2),
 ('Alec', 1),
 ('Alex Rhodes', 1),
 ('Alex Shipman', 1),
 ('Alfred Soney', 1),
 ('Algonac', 14),
 ('Alice Brigham', 2),
 ('Alice Warner', 1),
 ('All-righty', 2),
 ('Allen Isaac…', 1),
 ('Allen Kiyoshk', 1),
 ('Altiman', 4),
 ('Altiman Rd', 1),
 ('Altiman Road', 3),
 ('Altman', 1),
 ('Alue', 1),
 ('Alyssa', 1),
 ('Alyssa Murphy', 1),
 ('Amherstberg', 1),
 ('Amherstburg', 1),
 ('Anchor Bay', 1),
 ('Anderson Lodge', 1),
 ('Andrew Medlar', 1),
 ('Andrew Miller', 1),
 ('And…and', 1),
 ('Angie', 1),
 ('Anglican Church', 7),
 ('Anglican Hall', 1),
 ('Anglican Parish', 1),
 ('Anglican Parish Hall', 1),
 (

In [49]:
sorted(nodes)[-30:]

[('Wilsey', 2),
 ('Wilsie Kewayosh', 1),
 ('Windsor Environment Canada', 1),
 ('Windsor Star', 1),
 ('Winston', 3),
 ('Winter Gathering…', 1),
 ('Wood Bee', 3),
 ('Wood Bees', 1),
 ('Woodcutting Bee', 2),
 ('World Health Organization', 1),
 ('World War', 2),
 ('World War II', 1),
 ('Yea Bassett', 2),
 ('Yea Chiefs Road', 1),
 ('Yea Quopkaing', 1),
 ('Yea Winston', 1),
 ('Yea uh', 1),
 ('Yea…', 1),
 ('Yep uh', 1),
 ('Zaagewin', 1),
 ('Zebra Mussel', 1),
 ('Zhinkaaza', 1),
 ('Zhoon', 3),
 ('Zhoon ya', 1),
 ('Zhooskon', 1),
 ('Zooters', 1),
 ('knock knock', 1),
 ('yea uh', 1),
 ('yea yep', 2),
 ('zhooshkwaa', 1)]

## Saving data

In [37]:
with open('data/out/counter_all.pickle', 'wb') as f:
    pickle.dump(counter_all, f)

In [38]:
with open('data/out/counter_adjs.pickle', 'wb') as f:
    pickle.dump(counter_adjs, f)

In [39]:
with open('data/out/counter_nouns.pickle', 'wb') as f:
    pickle.dump(counter_nouns, f)

In [40]:
with open('data/out/counter_verbs.pickle', 'wb') as f:
    pickle.dump(counter_verbs, f)

In [41]:
nx.write_gexf(g, 'data/out/people.gexf')

In [42]:
nx.write_gexf(h, 'data/out/other.gexf')