# Transcripts

In [1]:
from collections import Counter

import networkx as nx
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

## Loading data

In [2]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Andrew Adult Male – 18 September 2010,[Dave White],[Andrew Peters],[],"[(0, Dave, So we’d like to ask you about thing..."
2,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
3,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
4,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."


In [4]:
transcripts_df.shape

(68, 5)

In [5]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Andrew Adult Male – 18 September 2010,[Dave White],[Andrew Peters],[],"[(0, Dave, So we’d like to ask you about thing..."
2,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
3,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
4,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."


## Processing data

In [6]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

In [7]:
with open('data/out/all_text.txt', 'w') as f:
    f.write(all_text)

### Keywords

#### All

In [8]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stopwords.words('english') and len(token) > 1]
    text = ' '.join(words)
    return text

In [9]:
%%time

cleaned_all_text = clean_text(all_text)

CPU times: user 1min 15s, sys: 4.56 s, total: 1min 20s
Wall time: 1min 20s


In [10]:
all_words = cleaned_all_text.split()

In [11]:
counter_all = Counter(all_words)

In [12]:
counter_all.most_common(20)

[('like', 2817),
 ('would', 2627),
 ('know', 2450),
 ('yea', 2440),
 ('people', 1995),
 ('yeah', 1924),
 ('go', 1883),
 ('used', 1824),
 ('one', 1702),
 ('remember', 1549),
 ('get', 1545),
 ('time', 1537),
 ('uh', 1517),
 ('think', 1465),
 ('things', 1404),
 ('well', 1390),
 ('back', 1364),
 ('got', 1273),
 ('going', 1250),
 ('hmm', 1245)]

#### By POS tag

In [13]:
%%time

tagged_all_text = pos_tag(word_tokenize(all_text))

CPU times: user 33 s, sys: 54 ms, total: 33 s
Wall time: 33 s


##### Adjectives

In [14]:
adjectives = [word for word, pos in tagged_all_text if len(word) > 1 and pos.startswith('J')]

In [15]:
counter_adjs = Counter(adjectives)

In [16]:
counter_adjs.most_common(20)

[('other', 976),
 ('re', 838),
 ('good', 785),
 ('little', 571),
 ('big', 553),
 ('uh', 550),
 ('old', 462),
 ('more', 435),
 ('different', 382),
 ('same', 330),
 ('ll', 327),
 ('much', 322),
 ('many', 307),
 ('whole', 272),
 ('um', 272),
 ('ve', 260),
 ('long', 243),
 ('own', 232),
 ('right', 217),
 ('first', 193)]

##### Nouns

In [17]:
nouns = [word for word, pos in tagged_all_text if len(word) > 1 and pos.startswith('N')]

In [18]:
counter_nouns = Counter(nouns)

In [19]:
counter_nouns.most_common(20)

[('people', 1919),
 ('time', 1535),
 ('things', 1397),
 ('Yea', 1200),
 ('way', 1199),
 ('lot', 1182),
 ('hmm', 1046),
 ('something', 910),
 ('thing', 787),
 ('community', 684),
 ('water', 667),
 ('kind', 591),
 ('stuff', 590),
 ('yea', 534),
 ('today', 463),
 ('Well', 459),
 ('area', 450),
 ('years', 424),
 ('anything', 418),
 ('bush', 406)]

##### Verbs

In [20]:
verbs = [word for word, pos in tagged_all_text if len(word) > 1 and pos.startswith('V')]

In [21]:
counter_verbs = Counter(verbs)

In [22]:
counter_verbs.most_common(20)

[('was', 6131),
 ('have', 3258),
 ('is', 2699),
 ('be', 2587),
 ('had', 2480),
 ('know', 2418),
 ('were', 2391),
 ('do', 2096),
 ('go', 1852),
 ('used', 1813),
 ('don', 1759),
 ('remember', 1489),
 ('get', 1431),
 ('are', 1420),
 ('think', 1420),
 ('got', 1256),
 ('going', 1237),
 ('did', 918),
 ('see', 858),
 ('say', 844)]

### People

In [23]:
def first_names(names):
    return [x.split()[0] for x in names]

In [24]:
interviewees = list(set(y for x in transcripts_df.INTERVIEWEES for y in x))

In [25]:
interviewees_first_names = first_names(interviewees)

In [26]:
aliases = list(set(z for x in transcripts_df.ALIASES for y in x for z in y))

In [27]:
all_names = set(interviewees + interviewees_first_names + aliases)

In [28]:
all_names_dict = {
    'Aimee': 'Aimee Johnson',
    'Aimee Johnson': 'Aimee Johnson',
    'Andrew': 'Andrew Peters',
    'Andrew Peters': 'Andrew Peters',
    'Anita': 'Anita Smith',
    'Anita Smith': 'Anita Smith',
    'Apollo': 'Apollo Blackeagle',
    'Apollo Blackeagle': 'Apollo Blackeagle',
    'Archie': 'Archie',
    'Becky': 'Becky',
    'Bill': 'Bill Sands',
    'Bill Sands': 'Bill Sands',
    'Brenda': 'Brenda Wheat',
    'Brenda Wheat': 'Brenda Wheat',
    'Cal': 'Cal',
    'Cameron': 'Cameron',
    'Carl': 'Carl Smith',
    'Carl Smith (Resource Protection Officer)': 'Carl Smith',
    'Carmen': 'Carmen Wrightman',
    'Carmen Wrightman': 'Carmen Wrightman',
    'Carrie': 'Carrie Isaac',
    'Carrie Isaac': 'Carrie Isaac',
    'Charles': 'Charles Wright',
    'Charles Wright': 'Charles Wright',
    'Cheryl': 'Cheryl',
    'Chief': 'Chief Gilbert',
    'Chief Gilbert': 'Chief Gilbert',
    'Chief Joseph Gilbert': 'Chief Gilbert',
    'Chris': 'Chris Riley',
    'Chris Riley': 'Chris Riley',
    'Daniel': 'Daniel',
    'Darren': 'Darren',
    'Dean': 'Dean Jacobs',
    'Dean Jacobs': 'Dean Jacobs',
    'Dot': 'Dot Peters',
    'Dot Peters': 'Dot Peters',
    'Doug': 'Doug',
    'Doug (Resource Protection Officer)': 'Doug',
    'Elaine': 'Elaine Jacobs',
    'Elaine Jacobs': 'Elaine Jacobs',
    'Eli': 'Eli Baxter',
    'Eli Baxter': 'Eli Baxter',
    'Eliza': 'Eliza John',
    'Eliza John': 'Eliza John',
    'Eric': 'Eric Isaac',
    'Eric Isaac': 'Eric Isaac',
    'Frank': 'Frank',
    'Georgina': 'Georgina',
    'Greg': 'Greg Isaac',
    'Greg Isaac': 'Greg Isaac',
    'Gus': 'Gus',
    'Harold': 'Harold Peters',
    'Harold Peters': 'Harold Peters',
    'Isabelle': 'Isabelle',
    'Jane': 'Jane Jacobs',
    'Jane Jacobs': 'Jane Jacobs',
    'Jasper': 'Jasper John',
    'Jasper John': 'Jasper John',
    'Jean': 'Jean Wrightman',
    'Jean Wrightman': 'Jean Wrightman',
    'Jen': 'Jennie Blackbird',
    'Jennie': 'Jennie Blackbird',
    'Jennie Blackbird': 'Jennie Blackbird',
    'Jerome': 'Jerome',
    'Jerry': 'Jerry',
    'Jessica': 'Jessica',
    'Joanne': 'Joanne Day',
    'Joanne Day': 'Joanne Day',
    'Joe': 'Joe Isaac',
    'Joe Isaac': 'Joe Isaac',
    'John': 'John',
    'Julia': 'Julia',
    'Karen': 'Karen Lalleen',
    'Karen Lalleen': 'Karen Lalleen',
    'Kenneth': 'Kenneth',
    'Kennon': 'Kennon Johnson',
    'Kennon Johnson': 'Kennon Johnson',
    'Kevin': 'Kevin Smith',
    'Kevin Smith': 'Kevin Smith',
    'Lee': 'Lee White',
    'Lee White': 'Lee White',
    'Linda': 'Linda White',
    'Linda White': 'Linda White',
    'Liz': 'Lizzie Isaac',
    'Lizzie': 'Lizzie Isaac',
    'Lizzie Isaac': 'Lizzie Isaac',
    'Lloyd': 'Lloyd Day',
    'Lloyd Day': 'Lloyd Day',
    'Lois': 'Lois Wrightman',
    'Lois Wrightman': 'Lois Wrightman',
    'Lyndsay': 'Lyndsay Sword',
    'Lyndsay Sword': 'Lyndsay Sword',
    'Mark': 'Mark',
    'Mel': 'Mel Hoeksma',
    'Mel Hoeksma': 'Mel Hoeksma',
    'Mickey': 'Mickey Aquash',
    'Mickey Aquash': 'Mickey Aquash',
    'Morris': 'Morris Wrightman',
    'Morris Wrightman': 'Morris Wrightman',
    'Myrna': 'Myrna',
    'Naomi': 'Naomi Williams',
    'Naomi Williams': 'Naomi Williams',
    'PD': 'Puppydog',
    'Pat': 'Pat Riley',
    'Pat Riley': 'Pat Riley',
    'Patricia': 'Patricia',
    'Patty': 'Patty Isaac',
    'Patty Isaac': 'Patty Isaac',
    'Paul': 'Paul',
    'Puppydog': 'Puppydog',
    'Rachel': 'Rachel',
    'Ralph': 'Ralph ???',
    'Ralph Johnson': 'Ralph Johnson',
    'Ralph Jones': 'Ralph Jones',
    'Rita': 'Rita Sands',
    'Rita Sands': 'Rita Sands',
    'Ron': 'Ron',
    'Rose': 'Rose',
    'Sarah': 'Sarah',
    'Shirley': 'Shirley',
    'Stanley': 'Stanley',
    'Stuart': 'Stuart',
    'Suzie': 'Suzie ???',
    'Suzie Isaac': 'Suzie Isaac',
    'Suzie Jones': 'Suzie Jones',
    'Terry': 'Terry Sands',
    'Terry Sands': 'Terry Sands',
    'Tom': 'Tom',
    'Vernon': 'Vernon Jones',
    'Vernon Jones': 'Vernon Jones',
}

In [None]:
grammar = 'PROPER_NOUN: {<NNP>+}'
cp = nltk.RegexpParser(grammar)

In [51]:
%%time

g = nx.DiGraph()

interviewers_names = ['Dave', 'Rick', 'Clint']
for interview in transcripts_df.INTERVIEW:
    for index, name, text in interview:
        if text:
            if name not in interviewers_names:
                name = all_names_dict.get(name, name)
                if not g.has_node(name):
                    g.add_node(name)
                tagged_text = pos_tag(word_tokenize(text))
                parsed_text = cp.parse(tagged_text)
                for e in parsed_text:
                    if isinstance(e, nltk.tree.Tree) and e.label() == 'PROPER_NOUN':
                        names = [word for word, tag in e if len(word) > 1]
                        if len(names) > 1 or names[0] in all_names:
                            proper_noun = ' '.join(names)
                            proper_noun = all_names_dict.get(proper_noun, proper_noun)
                            if not g.has_edge(name, proper_noun):
                                g.add_edge(name, proper_noun, weight=0)
                            g[name][proper_noun]['weight'] += 1

IndexError: list index out of range

In [55]:
e

ImportError: No module named '_tkinter', please install the python3-tk package

Tree('PROPER_NOUN', [('’', 'NNP')])

In [57]:
for a,b in e:
    print(a)

’


In [60]:
len(a)

1

In [48]:
nx.write_gexf(g, 'data/out/graph.gexf')

In [31]:
sentence = [("the", "DT"), ("little", "NNP"), ("yellow", "NNP"), ("dog", "NNP"), ("barked", "VBD"), ("at", "NNP"),  ("the", "DT"), ("cat", "NN")]
grammar = 'PROPER_NOUN: {<NNP>+}'
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)

In [32]:
for x in result:
    print(x)

('the', 'DT')
(PROPER_NOUN little/NNP yellow/NNP dog/NNP)
('barked', 'VBD')
(PROPER_NOUN at/NNP)
('the', 'DT')
('cat', 'NN')


In [33]:
for x in result:
    if isinstance(x, nltk.tree.Tree):
        for y in x:
            print(y)
        break

('little', 'NNP')
('yellow', 'NNP')
('dog', 'NNP')


In [34]:
x.label()

'PROPER_NOUN'

In [35]:
result



ImportError: No module named '_tkinter', please install the python3-tk package

Tree('S', [('the', 'DT'), Tree('PROPER_NOUN', [('little', 'NNP'), ('yellow', 'NNP'), ('dog', 'NNP')]), ('barked', 'VBD'), Tree('PROPER_NOUN', [('at', 'NNP')]), ('the', 'DT'), ('cat', 'NN')])

In [36]:
proper_nouns = [word for word, pos in tagged_all_text if len(word) > 1 and pos == 'NNP']

In [37]:
mentioned_proper_nouns = all_names.intersection(proper_nouns)

In [38]:
mentioned_proper_nouns

{'Aimee',
 'Andrew',
 'Anita',
 'Apollo',
 'Archie',
 'Becky',
 'Bill',
 'Brenda',
 'Cameron',
 'Carl',
 'Carmen',
 'Carrie',
 'Charles',
 'Chief',
 'Chris',
 'Daniel',
 'Dean',
 'Dot',
 'Doug',
 'Elaine',
 'Eli',
 'Eliza',
 'Eric',
 'Frank',
 'Georgina',
 'Greg',
 'Harold',
 'Isabelle',
 'Jane',
 'Jasper',
 'Jean',
 'Jen',
 'Jerry',
 'Jessica',
 'Joe',
 'John',
 'Julia',
 'Karen',
 'Kenneth',
 'Kennon',
 'Kevin',
 'Lee',
 'Linda',
 'Liz',
 'Lloyd',
 'Lois',
 'Lyndsay',
 'Mark',
 'Mickey',
 'Morris',
 'Myrna',
 'Naomi',
 'PD',
 'Pat',
 'Patricia',
 'Patty',
 'Paul',
 'Rachel',
 'Ralph',
 'Rita',
 'Ron',
 'Rose',
 'Sarah',
 'Stanley',
 'Stuart',
 'Suzie',
 'Terry',
 'Tom',
 'Vernon'}

In [39]:
qqq='hola me llamo antonio jimenez mavillard'

In [40]:
qqq.find('antonio')

14

In [41]:
qqq.find('aantonio')

-1

In [42]:
qqq.index('antonio')

14

In [43]:
qqq.index('aantonio')

ValueError: substring not found

## Saving data

In [None]:
# transcripts_df.to_csv('data/out/transcripts_2.csv', index=False)