# Transcripts

In [1]:
from collections import Counter

import networkx as nx
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

## Loading data

In [2]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [4]:
transcripts_df.shape

(50, 5)

In [5]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


## Processing data

In [6]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

In [7]:
with open('data/out/all_text.txt', 'w') as f:
    f.write(all_text)

### Keywords

#### All

In [8]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stopwords.words('english') and len(token) > 2]
    text = ' '.join(words)
    return text

In [9]:
%%time

cleaned_all_text = clean_text(all_text)

CPU times: user 1min 25s, sys: 5.13 s, total: 1min 30s
Wall time: 1min 30s


In [10]:
all_words = cleaned_all_text.split()

In [11]:
counter_all = Counter(all_words)

In [12]:
counter_all.most_common(100)

[('like', 2817),
 ('would', 2627),
 ('know', 2450),
 ('yea', 2440),
 ('people', 1995),
 ('yeah', 1924),
 ('go', 1883),
 ('used', 1824),
 ('one', 1702),
 ('remember', 1549),
 ('get', 1545),
 ('time', 1537),
 ('uh', 1517),
 ('think', 1465),
 ('things', 1404),
 ('well', 1390),
 ('back', 1364),
 ('got', 1273),
 ('going', 1250),
 ('hmm', 1245),
 ('way', 1204),
 ('lot', 1182),
 ('right', 1116),
 ('could', 990),
 ('something', 930),
 ('see', 905),
 ('say', 856),
 ('use', 839),
 ('come', 818),
 ('good', 812),
 ('thing', 787),
 ('around', 771),
 ('said', 766),
 ('us', 741),
 ('oh', 706),
 ('community', 689),
 ('water', 677),
 ('really', 673),
 ('went', 664),
 ('stuff', 656),
 ('always', 643),
 ('even', 639),
 ('take', 633),
 ('laughs', 631),
 ('fish', 624),
 ('little', 598),
 ('kind', 596),
 ('put', 585),
 ('big', 577),
 ('recall', 545),
 ('hunting', 535),
 ('um', 520),
 ('never', 519),
 ('want', 516),
 ('make', 509),
 ('still', 489),
 ('years', 483),
 ('today', 483),
 ('old', 477),
 ('dad', 47

#### By POS tag

In [13]:
%%time

tagged_all_text = pos_tag(word_tokenize(all_text))

CPU times: user 38.1 s, sys: 80 ms, total: 38.1 s
Wall time: 38.4 s


##### Adjectives

In [14]:
%%time

adjectives = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > 1 and pos.startswith('J')]

CPU times: user 1min 20s, sys: 5 s, total: 1min 25s
Wall time: 1min 25s


In [15]:
counter_adjs = Counter(adjectives)

In [16]:
counter_adjs.most_common(100)

[('good', 799),
 ('little', 579),
 ('big', 563),
 ('uh', 550),
 ('old', 462),
 ('different', 383),
 ('much', 324),
 ('many', 308),
 ('um', 273),
 ('whole', 273),
 ('long', 245),
 ('right', 228),
 ('first', 222),
 ('fish', 200),
 ('last', 199),
 ('certain', 181),
 ('able', 179),
 ('particular', 171),
 ('next', 168),
 ('young', 166),
 ('great', 166),
 ('indian', 153),
 ('wild', 146),
 ('high', 129),
 ('sure', 125),
 ('real', 119),
 ('important', 114),
 ('environmental', 114),
 ('hard', 112),
 ('black', 107),
 ('new', 104),
 ('nice', 102),
 ('white', 99),
 ('sweet', 96),
 ('inaudible', 92),
 ('bad', 91),
 ('small', 90),
 ('customary', 89),
 ('natural', 88),
 ('traditional', 87),
 ('younger', 85),
 ('main', 80),
 ('common', 75),
 ('best', 75),
 ('commercial', 73),
 ('open', 73),
 ('older', 69),
 ('native', 69),
 ('full', 66),
 ('past', 65),
 ('social', 64),
 ('uh…', 63),
 ('better', 63),
 ('red', 60),
 ('fresh', 59),
 ('rid', 58),
 ('public', 58),
 ('late', 58),
 ('large', 56),
 ('early', 

##### Nouns

In [17]:
%%time

nouns = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > 1 and pos.startswith('N')]

CPU times: user 1min 17s, sys: 5.25 s, total: 1min 22s
Wall time: 1min 22s


In [18]:
counter_nouns = Counter(nouns)

In [19]:
counter_nouns.most_common(100)

[('people', 1990),
 ('yea', 1734),
 ('time', 1537),
 ('things', 1404),
 ('hmm', 1233),
 ('way', 1204),
 ('lot', 1182),
 ('something', 911),
 ('thing', 787),
 ('community', 689),
 ('water', 677),
 ('kind', 596),
 ('stuff', 596),
 ('uh', 496),
 ('years', 483),
 ('today', 481),
 ('well', 478),
 ('area', 451),
 ('dad', 434),
 ('bush', 433),
 ('anything', 432),
 ('island', 425),
 ('day', 411),
 ('everything', 405),
 ('everybody', 378),
 ('land', 372),
 ('kids', 368),
 ('home', 365),
 ('house', 356),
 ('year', 349),
 ('walpole', 341),
 ('road', 338),
 ('family', 335),
 ('fish', 327),
 ('food', 321),
 ('place', 317),
 ('marsh', 311),
 ('yeah', 307),
 ('stories', 303),
 ('did', 301),
 ('part', 288),
 ('money', 285),
 ('environment', 282),
 ('oh', 279),
 ('days', 277),
 ('person', 275),
 ('guys', 274),
 ('right', 272),
 ('laughs', 271),
 ('somebody', 268),
 ('name', 258),
 ('ducks', 255),
 ('school', 250),
 ('hunting', 246),
 ('river', 245),
 ('wood', 238),
 ('areas', 224),
 ('life', 223),
 ('h

##### Verbs

In [20]:
%%time

verbs = [word.lower() for word, pos in tagged_all_text if word not in stopwords.words('english') and len(word) > 1 and pos.startswith('V')]

CPU times: user 1min 18s, sys: 5.5 s, total: 1min 24s
Wall time: 1min 24s


In [21]:
counter_verbs = Counter(verbs)

In [22]:
counter_verbs.most_common(100)

[('know', 2420),
 ('go', 1859),
 ('used', 1823),
 ('remember', 1495),
 ('get', 1446),
 ('think', 1422),
 ('got', 1257),
 ('going', 1250),
 ('see', 881),
 ('say', 847),
 ('said', 766),
 ('come', 747),
 ('went', 662),
 ('take', 625),
 ('use', 621),
 ('put', 568),
 ('want', 510),
 ('recall', 502),
 ('make', 497),
 ('do', 428),
 ('made', 354),
 ('came', 343),
 ('laughs', 340),
 ('told', 299),
 ('need', 299),
 ('guess', 298),
 ('coming', 295),
 ('look', 287),
 ('hunting', 283),
 ('bring', 280),
 ('getting', 271),
 ('tell', 270),
 ('done', 256),
 ('took', 255),
 ('talking', 251),
 ('eat', 239),
 ('knew', 239),
 ('help', 236),
 ('give', 228),
 ('uh', 212),
 ('work', 211),
 ('says', 210),
 ('call', 208),
 ('live', 207),
 ('talk', 203),
 ('seen', 201),
 ('keep', 199),
 ('start', 197),
 ('lived', 191),
 ('wanted', 187),
 ('thought', 187),
 ('mean', 180),
 ('gone', 179),
 ('hunt', 177),
 ('called', 177),
 ('find', 177),
 ('happened', 175),
 ('hear', 171),
 ('saying', 171),
 ('looking', 169),
 ('g

### Graphs

In [23]:
def first_names(names):
    return [x.split()[0] for x in names]

In [24]:
def last_names(names):
    r = []
    for x in names:
        if '(' in x:
            x = x.split('(')[0].strip()
        xs = x.split()
        if len(xs) > 1:
            ys = xs[1:]
            r.extend(ys)
    return r

In [25]:
interviewees = list(set(y for x in transcripts_df.INTERVIEWEES for y in x))

In [26]:
interviewees_first_names = first_names(interviewees)

In [27]:
interviewees_last_names = last_names(interviewees)

In [28]:
aliases = list(set(z for x in transcripts_df.ALIASES for y in x for z in y))

In [29]:
all_names = set(interviewees + interviewees_first_names + interviewees_last_names + aliases)

In [30]:
all_names_dict = {
    'Aimee': 'Aimee Johnson',
    'Aimee Johnson': 'Aimee Johnson',
    'Andrew': 'Andrew Peters',
    'Andrew Peters': 'Andrew Peters',
    'Anita': 'Anita Smith',
    'Anita Smith': 'Anita Smith',
    'Apollo': 'Apollo Blackeagle',
    'Apollo Blackeagle': 'Apollo Blackeagle',
    'Aquash': 'Mickey Aquash',
    'Archie': 'Archie',
    'Baxter': 'Eli Baxter',
    'Becky': 'Becky',
    'Bill': 'Bill Sands',
    'Bill Sands': 'Bill Sands',
    'Blackbird': 'Jennie Blackbird',
    'Blackeagle': 'Apollo Blackeagle',
    'Brenda': 'Brenda Wheat',
    'Brenda Wheat': 'Brenda Wheat',
    'Cal': 'Cal',
    'Cameron': 'Cameron',
    'Carl': 'Carl Smith',
    'Carl Smith (Resource Protection Officer)': 'Carl Smith',
    'Carmen': 'Carmen Wrightman',
    'Carmen Wrightman': 'Carmen Wrightman',
    'Carrie': 'Carrie Isaac',
    'Carrie Isaac': 'Carrie Isaac',
    'Charles': 'Charles Wright',
    'Charles Wright': 'Charles Wright',
    'Cheryl': 'Cheryl',
    'Chief': 'Chief Gilbert',
    'Chief Gilbert': 'Chief Gilbert',
    'Chief Joseph Gilbert': 'Chief Gilbert',
    'Chris': 'Chris Riley',
    'Chris Riley': 'Chris Riley',
    'Daniel': 'Daniel',
    'Darren': 'Darren',
    'Day': '??? Day',
    'Dean': 'Dean Jacobs',
    'Dean Jacobs': 'Dean Jacobs',
    'Dot': 'Dot Peters',
    'Dot Peters': 'Dot Peters',
    'Doug': 'Doug',
    'Doug (Resource Protection Officer)': 'Doug',
    'Elaine': 'Elaine Jacobs',
    'Elaine Jacobs': 'Elaine Jacobs',
    'Eli': 'Eli Baxter',
    'Eli Baxter': 'Eli Baxter',
    'Eliza': 'Eliza John',
    'Eliza John': 'Eliza John',
    'Eric': 'Eric Isaac',
    'Eric Isaac': 'Eric Isaac',
    'Frank': 'Frank',
    'Georgina': 'Georgina',
    'Gilbert': 'Chief Gilbert',
    'Greg': 'Greg Isaac',
    'Greg Isaac': 'Greg Isaac',
    'Gus': 'Gus',
    'Harold': 'Harold Peters',
    'Harold Peters': 'Harold Peters',
    'Hoeksma': 'Mel Hoeksma',
    'Isaac': '??? Isaac',
    'Isabelle': 'Isabelle',
    'Jacobs': '??? Jacobs',
    'Jane': 'Jane Jacobs',
    'Jane Jacobs': 'Jane Jacobs',
    'Jasper': 'Jasper John',
    'Jasper John': 'Jasper John',
    'Jean': 'Jean Wrightman',
    'Jean Wrightman': 'Jean Wrightman',
    'Jen': 'Jennie Blackbird',
    'Jennie': 'Jennie Blackbird',
    'Jennie Blackbird': 'Jennie Blackbird',
    'Jerome': 'Jerome',
    'Jerry': 'Jerry',
    'Jessica': 'Jessica',
    'Joanne': 'Joanne Day',
    'Joanne Day': 'Joanne Day',
    'Joe': 'Joe Isaac',
    'Joe Isaac': 'Joe Isaac',
    'John': 'John',
    'Johnson': '??? Johnson',
    'Jones': '??? Jones',
    'Joseph': 'Chief Gilbert',
    'Julia': 'Julia',
    'Karen': 'Karen Lalleen',
    'Karen Lalleen': 'Karen Lalleen',
    'Kenneth': 'Kenneth',
    'Kennon': 'Kennon Johnson',
    'Kennon Johnson': 'Kennon Johnson',
    'Kevin': 'Kevin Smith',
    'Kevin Smith': 'Kevin Smith',
    'Lalleen': 'Karen Lalleen',
    'Lee': 'Lee White',
    'Lee White': 'Lee White',
    'Linda': 'Linda White',
    'Linda White': 'Linda White',
    'Liz': 'Lizzie Isaac',
    'Lizzie': 'Lizzie Isaac',
    'Lizzie Isaac': 'Lizzie Isaac',
    'Lloyd': 'Lloyd Day',
    'Lloyd Day': 'Lloyd Day',
    'Lois': 'Lois Wrightman',
    'Lois Wrightman': 'Lois Wrightman',
    'Lyndsay': 'Lyndsay Sword',
    'Lyndsay Sword': 'Lyndsay Sword',
    'Mark': 'Mark',
    'Mel': 'Mel Hoeksma',
    'Mel Hoeksma': 'Mel Hoeksma',
    'Mickey': 'Mickey Aquash',
    'Mickey Aquash': 'Mickey Aquash',
    'Morris': 'Morris Wrightman',
    'Morris Wrightman': 'Morris Wrightman',
    'Myrna': 'Myrna',
    'Naomi': 'Naomi Williams',
    'Naomi Williams': 'Naomi Williams',
    'PD': 'Puppydog',
    'Pat': 'Pat Riley',
    'Pat Riley': 'Pat Riley',
    'Patricia': 'Patricia',
    'Patty': 'Patty Isaac',
    'Patty Isaac': 'Patty Isaac',
    'Paul': 'Paul',
    'Peters': '??? Peters',
    'Puppydog': 'Puppydog',
    'Rachel': 'Rachel',
    'Ralph': 'Ralph ???',
    'Ralph Johnson': 'Ralph Johnson',
    'Ralph Jones': 'Ralph Jones',
    'Riley': '??? Riley',
    'Rita': 'Rita Sands',
    'Rita Sands': 'Rita Sands',
    'Ron': 'Ron',
    'Rose': 'Rose',
    'Sands': '??? Sands',
    'Sarah': 'Sarah',
    'Shirley': 'Shirley',
    'Smith': '??? Smith',
    'Stanley': 'Stanley',
    'Stuart': 'Stuart',
    'Suzie': 'Suzie ???',
    'Suzie Isaac': 'Suzie Isaac',
    'Suzie Jones': 'Suzie Jones',
    'Sword': 'Lyndsay Sword',
    'Terry': 'Terry Sands',
    'Terry Sands': 'Terry Sands',
    'Tom': 'Tom',
    'Vernon': 'Vernon Jones',
    'Vernon Jones': 'Vernon Jones',
    'Wheat': 'Brenda Wheat',
    'White': '??? White',
    'Williams': 'Naomi Williams',
    'Wright': 'Charles Wright',
    'Wrightman': '??? Wrightman',
}

In [31]:
grammar = 'PROPER_NOUN: {<NNP>+}'
cp = nltk.RegexpParser(grammar)

#### People

In [32]:
%%time

g = nx.DiGraph()
interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            if name not in interviewers_names:
                name = all_names_dict[name]
                if not g.has_node(name):
                    g.add_node(name)
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if all_names.intersection(names):
                            proper_noun = ' '.join(names)
                            if proper_noun in all_names_dict and '???' not in all_names_dict[proper_noun]:
                                proper_noun = all_names_dict[proper_noun]
                            if proper_noun != name:
                                if not g.has_edge(name, proper_noun):
                                    g.add_edge(name, proper_noun, weight=0)
                                g[name][proper_noun]['weight'] += 1

CPU times: user 29.8 s, sys: 60 ms, total: 29.8 s
Wall time: 29.9 s


#### Other

In [33]:
with open('data/wordsEn.txt') as f:
    english_words = set(word.strip().lower() for word in f)

In [34]:
%%time

h = nx.DiGraph()
interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            if name not in interviewers_names:
                name = all_names_dict[name]
                if not h.has_node(name):
                    h.add_node(name, type='interviewee')
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if names and not all_names.intersection(names) and \
                           (len(names) > 1 or names[0].lower() not in english_words):
                            proper_noun = ' '.join(names)
                            if proper_noun != name:
                                if not h.has_node(proper_noun):
                                    h.add_node(proper_noun, type='other')
                                if not h.has_edge(name, proper_noun):
                                    h.add_edge(name, proper_noun, weight=0)
                                h[name][proper_noun]['weight'] += 1

CPU times: user 29.3 s, sys: 96 ms, total: 29.4 s
Wall time: 29.5 s


## Saving data

In [35]:
nx.write_gexf(g, 'data/out/people.gexf')

In [36]:
nx.write_gexf(h, 'data/out/other.gexf')