# Transcripts

In [1]:
import pickle
from collections import Counter
from collections import defaultdict
from string import punctuation

import networkx as nx
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

## Loading data

In [2]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [4]:
transcripts_df.shape

(50, 5)

In [5]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


## Processing data

In [6]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

### Keywords

In [7]:
MIN_LENGTH = 2

#### All

In [8]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stopwords.words('english') and len(token) > MIN_LENGTH]
    text = ' '.join(words)
    return text

In [9]:
%%time

cleaned_all_text = clean_text(all_text)

CPU times: user 1min 3s, sys: 3.85 s, total: 1min 7s
Wall time: 1min 7s


In [10]:
all_words = cleaned_all_text.split()

In [11]:
counter_all = Counter(all_words)

In [12]:
counter_all.most_common(100)

[('yea', 2440),
 ('like', 2210),
 ('would', 1934),
 ('know', 1891),
 ('people', 1362),
 ('used', 1308),
 ('yeah', 1307),
 ('hmm', 1243),
 ('one', 1236),
 ('get', 1151),
 ('remember', 1135),
 ('time', 1088),
 ('think', 1026),
 ('well', 1013),
 ('back', 1012),
 ('things', 985),
 ('got', 953),
 ('way', 870),
 ('going', 868),
 ('lot', 857),
 ('right', 848),
 ('could', 735),
 ('use', 693),
 ('see', 672),
 ('something', 658),
 ('say', 623),
 ('good', 603),
 ('come', 600),
 ('around', 569),
 ('thing', 560),
 ('said', 512),
 ('stuff', 499),
 ('water', 496),
 ('went', 494),
 ('laughs', 492),
 ('take', 473),
 ('always', 472),
 ('really', 469),
 ('even', 466),
 ('community', 461),
 ('little', 459),
 ('fish', 449),
 ('kind', 439),
 ('big', 437),
 ('put', 437),
 ('make', 390),
 ('never', 383),
 ('want', 375),
 ('still', 361),
 ('maybe', 357),
 ('recall', 355),
 ('hunting', 353),
 ('anything', 345),
 ('old', 345),
 ('years', 343),
 ('bush', 332),
 ('dad', 329),
 ('much', 327),
 ('area', 317),
 ('eve

#### By POS tag

In [13]:
%%time

tagged_all_text = pos_tag(word_tokenize(all_text))

CPU times: user 29.7 s, sys: 76 ms, total: 29.7 s
Wall time: 29.9 s


##### Adjectives

In [14]:
%%time

adjectives = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > MIN_LENGTH and pos.startswith('J')]

CPU times: user 57.3 s, sys: 3.87 s, total: 1min 1s
Wall time: 1min 1s


In [15]:
counter_adjs = Counter(adjectives)

In [16]:
counter_adjs.most_common(100)

[('good', 593),
 ('little', 444),
 ('big', 428),
 ('old', 332),
 ('different', 307),
 ('much', 242),
 ('many', 229),
 ('whole', 191),
 ('right', 181),
 ('long', 176),
 ('first', 168),
 ('fish', 152),
 ('last', 139),
 ('able', 129),
 ('next', 126),
 ('certain', 125),
 ('young', 124),
 ('great', 120),
 ('indian', 118),
 ('particular', 111),
 ('wild', 101),
 ('sure', 97),
 ('real', 91),
 ('high', 90),
 ('environmental', 85),
 ('important', 83),
 ('sweet', 82),
 ('inaudible', 80),
 ('hard', 79),
 ('new', 76),
 ('nice', 76),
 ('black', 73),
 ('white', 71),
 ('small', 71),
 ('traditional', 68),
 ('younger', 66),
 ('bad', 66),
 ('uh…', 63),
 ('open', 57),
 ('customary', 56),
 ('natural', 56),
 ('commercial', 55),
 ('older', 55),
 ('main', 54),
 ('full', 53),
 ('yea', 50),
 ('best', 50),
 ('common', 49),
 ('better', 49),
 ('past', 46),
 ('native', 45),
 ('and…', 44),
 ('invasive', 44),
 ('red', 43),
 ('public', 42),
 ('social', 41),
 ('early', 40),
 ('fresh', 40),
 ('dry', 39),
 ('rid', 38),
 

##### Nouns

In [17]:
%%time

nouns = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > MIN_LENGTH and pos.startswith('N')]

CPU times: user 58.4 s, sys: 3.97 s, total: 1min 2s
Wall time: 1min 2s


In [18]:
counter_nouns = Counter(nouns)

In [19]:
counter_nouns.most_common(100)

[('yea', 1734),
 ('people', 1358),
 ('hmm', 1231),
 ('time', 1088),
 ('things', 985),
 ('way', 870),
 ('lot', 857),
 ('something', 642),
 ('thing', 560),
 ('water', 496),
 ('community', 461),
 ('stuff', 458),
 ('kind', 439),
 ('well', 346),
 ('years', 343),
 ('anything', 335),
 ('bush', 320),
 ('area', 317),
 ('island', 309),
 ('dad', 305),
 ('today', 299),
 ('everything', 280),
 ('day', 275),
 ('kids', 259),
 ('everybody', 259),
 ('house', 258),
 ('year', 257),
 ('land', 256),
 ('walpole', 242),
 ('home', 242),
 ('family', 242),
 ('road', 241),
 ('place', 239),
 ('did', 236),
 ('fish', 228),
 ('food', 226),
 ('right', 210),
 ('stories', 209),
 ('part', 202),
 ('marsh', 202),
 ('environment', 201),
 ('days', 201),
 ('somebody', 201),
 ('yeah', 200),
 ('name', 198),
 ('money', 195),
 ('laughs', 187),
 ('guys', 187),
 ('person', 184),
 ('river', 180),
 ('ducks', 179),
 ('wood', 172),
 ('use', 166),
 ('school', 166),
 ('hunting', 161),
 ('times', 160),
 ('areas', 159),
 ('hall', 158),
 ('

##### Verbs

In [20]:
%%time

verbs = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > MIN_LENGTH and pos.startswith('V')]

CPU times: user 57.3 s, sys: 3.94 s, total: 1min 1s
Wall time: 1min 1s


In [21]:
counter_verbs = Counter(verbs)

In [22]:
counter_verbs.most_common(100)

[('know', 1867),
 ('used', 1307),
 ('remember', 1089),
 ('get', 1075),
 ('think', 992),
 ('got', 942),
 ('going', 868),
 ('see', 655),
 ('say', 616),
 ('come', 552),
 ('use', 516),
 ('said', 512),
 ('went', 492),
 ('take', 468),
 ('put', 426),
 ('make', 382),
 ('want', 371),
 ('recall', 327),
 ('laughs', 289),
 ('came', 259),
 ('made', 251),
 ('told', 212),
 ('coming', 206),
 ('look', 205),
 ('done', 201),
 ('guess', 197),
 ('getting', 197),
 ('bring', 195),
 ('need', 192),
 ('took', 190),
 ('hunting', 189),
 ('tell', 186),
 ('eat', 176),
 ('knew', 173),
 ('talking', 172),
 ('help', 169),
 ('give', 168),
 ('live', 163),
 ('call', 154),
 ('work', 152),
 ('keep', 147),
 ('says', 143),
 ('lived', 143),
 ('talk', 140),
 ('wanted', 140),
 ('thought', 138),
 ('start', 137),
 ('seen', 136),
 ('find', 129),
 ('mean', 128),
 ('goes', 127),
 ('looking', 126),
 ('gone', 126),
 ('called', 125),
 ('hunt', 125),
 ('hear', 123),
 ('happened', 122),
 ('cut', 118),
 ('saying', 114),
 ('taking', 114),
 

### Graphs

In [23]:
def first_names(names):
    return [x.split()[0] for x in names]

In [24]:
def last_names(names):
    r = []
    for x in names:
        if '(' in x:
            x = x.split('(')[0].strip()
        xs = x.split()
        if len(xs) > 1:
            ys = xs[1:]
            r.extend(ys)
    return r

In [25]:
interviewees = list(set(y for x in transcripts_df.INTERVIEWEES for y in x))

In [26]:
interviewees_first_names = first_names(interviewees)

In [27]:
interviewees_last_names = last_names(interviewees)

In [28]:
aliases = list(set(z for x in transcripts_df.ALIASES for y in x for z in y))

In [29]:
all_interviewees_names = set(interviewees + interviewees_first_names + interviewees_last_names + aliases)

In [30]:
sorted(all_interviewees_names)

['Aimee',
 'Aimee Johnson',
 'Anita',
 'Anita Smith',
 'Apollo',
 'Apollo Blackeagle',
 'Aquash',
 'Archie',
 'Baxter',
 'Becky',
 'Bill',
 'Bill Sands',
 'Blackbird',
 'Blackeagle',
 'Brenda',
 'Brenda Wheat',
 'Cal',
 'Cameron',
 'Carl',
 'Carl Smith (Resource Protection Officer)',
 'Carmen',
 'Carmen Wrightman',
 'Cheryl',
 'Chief',
 'Chief Gilbert',
 'Chief Joseph Gilbert',
 'Chris',
 'Chris Riley',
 'Daniel',
 'Day',
 'Dean',
 'Dean Jacobs',
 'Dot',
 'Dot Peters',
 'Elaine',
 'Elaine Jacobs',
 'Eli',
 'Eli Baxter',
 'Eliza',
 'Eliza John',
 'Eric',
 'Eric Isaac',
 'Gilbert',
 'Greg',
 'Greg Isaac',
 'Gus',
 'Harold',
 'Harold Peters',
 'Hoeksma',
 'Isaac',
 'Isabelle',
 'Jacobs',
 'Jasper',
 'Jasper John',
 'Jean',
 'Jean Wrightman',
 'Jen',
 'Jennie',
 'Jennie Blackbird',
 'Jessica',
 'Joanne',
 'Joanne Day',
 'John',
 'Johnson',
 'Jones',
 'Joseph',
 'Karen',
 'Karen Lalleen',
 'Kenneth',
 'Kennon',
 'Kennon Johnson',
 'Kevin',
 'Kevin Smith',
 'Lalleen',
 'Lee',
 'Lee White',
 

In [31]:
all_interviewees_names_dict = {
    'Aimee': 'Aimee Johnson',
    'Aimee Johnson': 'Aimee Johnson',
#     'Andrew': 'Andrew Peters',
#     'Andrew Peters': 'Andrew Peters',
    'Anita': 'Anita Smith',
    'Anita Smith': 'Anita Smith',
    'Apollo': 'Apollo Blackeagle',
    'Apollo Blackeagle': 'Apollo Blackeagle',
    'Aquash': 'Mickey Aquash',
    'Archie': 'Archie',
    'Baxter': 'Eli Baxter',
    'Becky': 'Becky',
    'Bill': 'Bill Sands',
    'Bill Sands': 'Bill Sands',
    'Blackbird': 'Jennie Blackbird',
    'Blackeagle': 'Apollo Blackeagle',
    'Brenda': 'Brenda Wheat',
    'Brenda Wheat': 'Brenda Wheat',
    'Cal': 'Cal',
    'Cameron': 'Cameron',
    'Carl': 'Carl Smith',
    'Carl Smith (Resource Protection Officer)': 'Carl Smith',
    'Carmen': 'Carmen Wrightman',
    'Carmen Wrightman': 'Carmen Wrightman',
#     'Carrie': 'Carrie Isaac',
#     'Carrie Isaac': 'Carrie Isaac',
#     'Charles': 'Charles Wright',
#     'Charles Wright': 'Charles Wright',
    'Cheryl': 'Cheryl',
    'Chief': 'Chief Gilbert',
    'Chief Gilbert': 'Chief Gilbert',
    'Chief Joseph Gilbert': 'Chief Gilbert',
    'Chris': 'Chris Riley',
    'Chris Riley': 'Chris Riley',
    'Daniel': 'Daniel',
#     'Darren': 'Darren',
    'Day': '??? Day',
    'Dean': 'Dean Jacobs',
    'Dean Jacobs': 'Dean Jacobs',
    'Dot': 'Dot Peters',
    'Dot Peters': 'Dot Peters',
#     'Doug': 'Doug',
#     'Doug (Resource Protection Officer)': 'Doug',
    'Elaine': 'Elaine Jacobs',
    'Elaine Jacobs': 'Elaine Jacobs',
    'Eli': 'Eli Baxter',
    'Eli Baxter': 'Eli Baxter',
    'Eliza': 'Eliza John',
    'Eliza John': 'Eliza John',
    'Eric': 'Eric Isaac',
    'Eric Isaac': 'Eric Isaac',
#     'Frank': 'Frank',
#     'Georgina': 'Georgina',
    'Gilbert': 'Chief Gilbert',
    'Greg': 'Greg Isaac',
    'Greg Isaac': 'Greg Isaac',
    'Gus': 'Gus',
    'Harold': 'Harold Peters',
    'Harold Peters': 'Harold Peters',
    'Hoeksma': 'Mel Hoeksma',
    'Isaac': '??? Isaac',
    'Isabelle': 'Isabelle',
    'Jacobs': '??? Jacobs',
#     'Jane': 'Jane Jacobs',
#     'Jane Jacobs': 'Jane Jacobs',
    'Jasper': 'Jasper John',
    'Jasper John': 'Jasper John',
    'Jean': 'Jean Wrightman',
    'Jean Wrightman': 'Jean Wrightman',
    'Jen': 'Jennie Blackbird',
    'Jennie': 'Jennie Blackbird',
    'Jennie Blackbird': 'Jennie Blackbird',
#     'Jerome': 'Jerome',
#     'Jerry': 'Jerry',
    'Jessica': 'Jessica',
    'Joanne': 'Joanne Day',
    'Joanne Day': 'Joanne Day',
#     'Joe': 'Joe Isaac',
#     'Joe Isaac': 'Joe Isaac',
    'John': 'John',
    'Johnson': '??? Johnson',
    'Jones': '??? Jones',
    'Joseph': 'Chief Gilbert',
#     'Julia': 'Julia',
    'Karen': 'Karen Lalleen',
    'Karen Lalleen': 'Karen Lalleen',
    'Kenneth': 'Kenneth',
    'Kennon': 'Kennon Johnson',
    'Kennon Johnson': 'Kennon Johnson',
    'Kevin': 'Kevin Smith',
    'Kevin Smith': 'Kevin Smith',
    'Lalleen': 'Karen Lalleen',
    'Lee': 'Lee White',
    'Lee White': 'Lee White',
    'Linda': 'Linda White',
    'Linda White': 'Linda White',
    'Liz': 'Lizzie Isaac',
    'Lizzie': 'Lizzie Isaac',
    'Lizzie Isaac': 'Lizzie Isaac',
    'Lloyd': 'Lloyd Day',
    'Lloyd Day': 'Lloyd Day',
    'Lois': 'Lois Wrightman',
    'Lois Wrightman': 'Lois Wrightman',
    'Lyndsay': 'Lyndsay Sword',
    'Lyndsay Sword': 'Lyndsay Sword',
#     'Mark': 'Mark',
    'Mel': 'Mel Hoeksma',
    'Mel Hoeksma': 'Mel Hoeksma',
    'Mickey': 'Mickey Aquash',
    'Mickey Aquash': 'Mickey Aquash',
#     'Morris': 'Morris Wrightman',
#     'Morris Wrightman': 'Morris Wrightman',
    'Myrna': 'Myrna',
    'Naomi': 'Naomi Williams',
    'Naomi Williams': 'Naomi Williams',
    'PD': 'Puppydog',
    'Pat': 'Pat Riley',
    'Pat Riley': 'Pat Riley',
    'Patricia': 'Patricia',
    'Patty': 'Patty Isaac',
    'Patty Isaac': 'Patty Isaac',
#     'Paul': 'Paul',
    'Peters': '??? Peters',
    'Puppydog': 'Puppydog',
    'Rachel': 'Rachel',
    'Ralph': 'Ralph ???',
    'Ralph Johnson': 'Ralph Johnson',
    'Ralph Jones': 'Ralph Jones',
    'Riley': '??? Riley',
    'Rita': 'Rita Sands',
    'Rita Sands': 'Rita Sands',
#     'Ron': 'Ron',
    'Rose': 'Rose',
    'Sands': '??? Sands',
#     'Sarah': 'Sarah',
    'Shirley': 'Shirley',
    'Smith': '??? Smith',
    'Stanley': 'Stanley',
    'Stuart': 'Stuart',
    'Suzie': 'Suzie ???',
    'Suzie Isaac': 'Suzie Isaac',
    'Suzie Jones': 'Suzie Jones',
    'Sword': 'Lyndsay Sword',
    'Terry': 'Terry Sands',
    'Terry Sands': 'Terry Sands',
#     'Tom': 'Tom',
    'Vernon': 'Vernon Jones',
    'Vernon Jones': 'Vernon Jones',
    'Wheat': 'Brenda Wheat',
    'White': '??? White',
    'Williams': 'Naomi Williams',
#     'Wright': 'Charles Wright',
    'Wrightman': '??? Wrightman',
}

In [32]:
grammar = 'PROPER_NOUN: {<NNP>+}'
cp = nltk.RegexpParser(grammar)

#### People

In [33]:
%%time

g = nx.DiGraph()
interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            if name not in interviewers_names:
                name = all_interviewees_names_dict[name]
                if not g.has_node(name):
                    g.add_node(name)
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if all_interviewees_names.intersection(names):
                            proper_noun = ' '.join(names)
                            if proper_noun in all_interviewees_names_dict and \
                               '???' not in all_interviewees_names_dict[proper_noun]:
                                proper_noun = all_interviewees_names_dict[proper_noun]
                            if proper_noun != name:
                                if not g.has_edge(name, proper_noun):
                                    g.add_edge(name, proper_noun, weight=0)
                                g[name][proper_noun]['weight'] += 1

CPU times: user 22.5 s, sys: 72 ms, total: 22.6 s
Wall time: 22.6 s


#### Other

##### Continents

In [34]:
continents_list = ['Europe', 'America', 'Asia', 'Africa']

##### Countries

In [35]:
with open('data/places/countries.txt') as f:
    countries_list = f.readlines()

##### Canadian main places

In [36]:
with open('data/places/canadian_main_places.txt') as f:
    ca_main_places_list = [p.strip() for p in f.readlines()]

##### Other Canadian places

In [37]:
with open('data/places/canadian_places.txt') as f:
    ca_other_places_list = [p.strip() for p in f.readlines()]

##### Canadian cities

In [38]:
ca_cities_list = []

with open('data/places/canadian_cities.txt') as f:
    for line in f:
        if line.strip():
            city = line.split('\t')[0]
            city = city.replace('(part)', '')
            if '[' in city:
                city = city.split('[')[0]
            ca_cities_list.append(city.strip())

##### Canadian towns

In [39]:
ca_towns_list = []

with open('data/places/canadian_towns.txt') as f:
    for line in f:
        if line.strip():
            town = line.split('\t')[0]
            town = town.replace('–', '')
            if '[' in town:
                town = town.split('[')[0]
            if '(' in town:
                town = town.split('(')[0]
            ca_towns_list.append(town.strip())

##### US states

In [40]:
with open('data/places/us_states.txt') as f:
    us_states_list = [p.strip() for p in f.readlines()]

##### US cities

In [41]:
us_cities_list = []

with open('data/places/us_cities.txt') as f:
    for line in f:
        if line.strip():
            city = line.split('\t')[1]
            city = city.replace('–', '')
            if '[' in city:
                city = city.split('[')[0]
            us_cities_list.append(city.strip())

##### Michigan cities and towns

In [42]:
mi_cities_towns_list = []

with open('data/places/mi_cities_towns.txt') as f:
    for line in f:
        if line.strip():
            city = line.split('\t')[0]
            if ',' in city:
                city = city.split(',')[0]
            mi_cities_towns_list.append(city.strip())

##### New York cities and towns

In [43]:
ny_cities_towns_list = []

with open('data/places/ny_cities_towns.txt') as f:
    for line in f:
        if line.strip():
            city = line.split('\t')[0]
            if '(' in city:
                city = city.split('(')[0]
            ny_cities_towns_list.append(city.strip())

##### All places

In [44]:
all_places_list = \
    continents_list + countries_list + ca_main_places_list + ca_other_places_list + ca_cities_list + \
    ca_towns_list + us_states_list + us_cities_list + mi_cities_towns_list + ny_cities_towns_list
all_places_lower_list = [p.lower() for p in all_places_list]

##### English words

In [45]:
with open('data/wordsEn.txt') as f:
    english_words = set(word.strip().lower() for word in f)

##### Changes

In [46]:
changes_dict = {
    'AHHHH': '',
    'Alan Delery': 'Alan Deleary',
    'All-righty': '',
    'Altiman Rd': 'Altiman Road',
    'Altiman Road': 'Altiman Road',
    'Altman': 'Altiman',
    'Amherstberg': 'Amherstburg',
    'Anglican Hall': 'Parish Hall',
    'Anglican Parish': 'Parish Hall',
    'Anglican Parish Hall': 'Parish Hall',
    'Anishibaabeg Maatisowin': 'Anishinaabe Maatisowin',
    'Anishinaabeg': 'Anishinaabe',
    'Anishinaabg': 'Anishinaabe',
    'Anishinaabge': 'Anishinaabe',
    'Anishna': 'Anishinaabe',
    'Anishnaabe': 'Anishinaabe',
    'Anishnabe': 'Anishinaabe',
    'Attapiskatt First Nation': 'Attawapiskat First Nation',
    'Attawapiskatt': 'Attawapiskat First Nation',
    'Austin Rd': 'Austin Road',
    'Basset Island': 'Bassett Island',
    'Became menstrual': 'Bassett Island',
    'Cecelia': 'Cecilia',
    'Cecile': 'Cecilia',
    'Cedrick': 'Cedric',
    'Chematagon': 'Chematogan',
    'Chematogon': 'Chematogan',
    'Chematogon Bay': 'Chematogan Bay',
    'Chester Armstrong Well': 'Chester Armstrong',
    'Chimey': 'Chimmy',
    'Clifford Roy': 'Cliff Roy',
    'Clint Yeah': 'Clint',
    'DUNK DUNK DUNK': 'DUNK DUNK',
    'Dave Did': 'Dave',
    'Dave Do': 'Dave',
    'Dredgecut': 'Dredge Cut',
    'Enh': '',
    'Ethel Kick': 'Ethel Kicknosway',
    'First Nation': 'First Nations',
    'Frank Clown Dawson': 'Franklin Dawson',
    'Franny': 'Frannie',
    'Fred Hall': 'Fred Hulls',
    'George uh': 'George',
    'Gooselake': 'Goose Lake',
    'Gzeh-mnidoo': 'Gzhe-mnidoo',
    'He/we': '',
    'Hello Aklyn': 'Aklyn',
    'Heriatge Centre': 'Heritage Centre',
    'Heritage Center': 'Heritage Centre',
    'Hickory Ridges': 'Hickory Ridge',
    'Highbanks yea': 'Highbanks',
    'Hm': '',
    'Hmm': '',
    'Hmm-mm': '',
    'I-I-I': 'I-I',
    'If-if': '',
    'Igotta': '',
    'It-it': '',
    'Jan Longboats': 'Jan Longboat',
    'Jeffrey': 'Jeffery',
    'Joe Bidore Bay': 'Joe Bedore Bay',
    'Joe Crows': 'Joe Crow',
    'Kewayosh-': 'Kewayosh',
    'Kicknosways': 'Kicknosway',
    'Laughs Nobody': '',
    'Laughs So': '',
    'Laughs Walpole Island': 'Walpole Island',
    'Laughs Yea': '',
    'Laughs Yea Robert Kiyoshk': 'Robert Kiyoshk',
    'Mid-winter Pow-wow': 'Mid-winter Pow-Wow',
    'Mitchells Bay': 'Mitchell Bay',
    'Mm': '',
    'Mm-hmm': '',
    'Mrs': '',
    'Munja': 'Moonja',
    'Nahdees': 'Nahdee',
    'Native Crafts.': 'Native Crafts',
    'New Years': 'New Year',
    'New Years Eve': 'New Year',
    'New Years Feast': 'New Year',
    'Nishnaabs': 'Nishnaabe',
    'Nishnob': 'Nishnaabe',
    'Nishnobs': 'Nishnaabe',
    'Odawas': 'Odawa',
    'Oh God': '',
    'Ojibwe Park': 'Ojibway Park',
    'Ok Dorothy': 'Dorothy',
    'Ok Henry': 'Henry',
    'POW WOW': 'Pow-Wow',
    'POW WOWS': 'Pow-Wow',
    'POW-Wow': 'Pow-Wow',
    'Parrish Hall': 'Parish Hall',
    'Potawatomis': 'Potawatomi',
    'Pottawatomi': 'Potawatomi',
    'Pottawatomi Island': 'Potawatomi Island',
    'Pow-Wows': 'Pow-Wow',
    'Pow-wow': 'Pow-Wow',
    'Rama Powwow': 'Rama Pow-Wow',
    'Rodeo': 'Rodeo',
    'Rommel': 'Romall',
    'Rommels': 'Romall',
    'Rumall': 'Romall',
    'Russel Osagee': 'Russell Osagee',
    'SHHHHHHH': '',
    'Sam Weegee': 'Sam Weejii',
    'Sam Weegy': 'Sam Weejii',
    'Sam Weeji': 'Sam Weejii',
    'Sam Wiiji': 'Sam Weejii',
    'Sam Wiji': 'Sam Weejii',
    'Same Wiji': 'Sam Weejii',
    'Sanjgwan': 'Sanjgwon',
    'Shingauk': 'Shingwauk',
    'Shinguak': 'Shingwauk',
    'Shingwak': 'Shingwauk',
    'Shingwuak': 'Shingwauk',
    'Shobs': 'Shob',
    'Shogie': 'Shoggie',
    'Skeesic': 'Skeezik',
    'Skiizhiig': 'Skeezik',
    'So Potawatomi': 'Potawatomi',
    'Soo': '',
    'St. Ann': 'St. Anne',
    'Strawberry Soc': 'Strawberry Social',
    'T.V.': 'T.V',
    'Thank-you': '',
    'Trans Canada Pipeline': 'Transport Canada',
    'Twila': 'Twyla',
    'U.S.': 'U.S',
    'Uh Aldra Brown': 'Aldra Brown',
    'Uh Eldron': 'Eldron',
    'Uh Henry': 'Henry',
    'Um': '',
    'Umm': '',
    'Walpole Island.': 'Walpole Island',
    'Weekaan': 'Wiikenh',
    'Wood Bees': 'Wood Bee',
    'Woodcutting Bee': 'Wood Bee',
    'World War II': 'World War',
    'Yea Bassett': 'Bassett',
    'Yea Chiefs Road': 'Chiefs Road',
    'Yea Quopkaing': 'Quopkaing',
    'Yea Winston': 'Winston',
    'Yea uh': '',
    'Yep uh': '',
    'Zhoon ya': 'Zhoon',
    'knock knock': '',
    'yea uh': '',
    'yea yep': '',
}

##### Auxiliar functions

In [47]:
def fix_ellipsis(text):
    text = text.replace('…', '...')
    return text

In [48]:
def test(names):
    return names and \
           (' '.join(names).lower() in all_places_lower_list or \
           not all_interviewees_names.intersection(names) and \
           (len(names) > 1 or names[0].lower().strip(punctuation + ' ') not in english_words)
           )

##### Graph

In [49]:
%%time

h = nx.DiGraph()
interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            text = fix_ellipsis(text)
            if name not in interviewers_names:
                name = all_interviewees_names_dict[name]
                if not h.has_node(name):
                    h.add_node(name, type='interviewee')
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if test(names):
                            proper_noun = ' '.join(names)
                            proper_noun = changes_dict.get(proper_noun, proper_noun)
                            if proper_noun != name:
                                if not h.has_node(proper_noun):
                                    h.add_node(proper_noun, type='other')
                                if not h.has_edge(name, proper_noun):
                                    h.add_edge(name, proper_noun, weight=0)
                                h[name][proper_noun]['weight'] += 1

CPU times: user 23.2 s, sys: 48 ms, total: 23.2 s
Wall time: 23.3 s


In [50]:
i = h.copy()
for n in i.nodes():
    if len(n) <= MIN_LENGTH:
        i.remove_node(n)

In [51]:
i.number_of_nodes()

953

In [52]:
i.number_of_edges()

1545

## Saving data

In [53]:
with open('data/out/all_text.txt', 'w') as f:
    f.write(all_text)

In [54]:
with open('data/out/counter_all.pickle', 'wb') as f:
    pickle.dump(counter_all, f)

In [55]:
with open('data/out/counter_adjs.pickle', 'wb') as f:
    pickle.dump(counter_adjs, f)

In [56]:
with open('data/out/counter_nouns.pickle', 'wb') as f:
    pickle.dump(counter_nouns, f)

In [57]:
with open('data/out/counter_verbs.pickle', 'wb') as f:
    pickle.dump(counter_verbs, f)

In [58]:
nx.write_gexf(g, 'data/out/people.gexf')

In [59]:
nx.write_gexf(i, 'data/out/other.gexf')