# Transcripts

In [1]:
import pickle
from collections import Counter
from collections import defaultdict
from string import punctuation

import networkx as nx
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.util import ngrams

## Loading data

In [2]:
# Read the transcripts table, parsing the stringified list columns back into
# Python objects, and keep only the five columns used by this notebook.
# NOTE(review): `eval` runs arbitrary code found in the CSV cells; this is only
# acceptable while the file is a trusted artefact of this project. Consider
# `ast.literal_eval` if the cells are plain literals — verify before switching.
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters=dict.fromkeys(['INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW'], eval),
).loc[:, ['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
# Preview the first rows to check that the list columns were parsed.
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [4]:
# (rows, columns) of the loaded table.
transcripts_df.shape

(50, 5)

In [5]:
# NOTE(review): same preview as the earlier `transcripts_df.head()` cell;
# the frame has not changed in between, so this cell looks redundant.
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [6]:
# Restore `counter_all` from its pickle file (presumably frequencies over all
# tokens — verify against the notebook that wrote it).
# NOTE(review): `pickle.load` can execute arbitrary code; only load files that
# this project produced itself.
with open('data/out/counter_all.pickle', 'rb') as f:
    counter_all = pickle.load(f)

In [7]:
# Restore `counter_adjs` from its pickle file; `.most_common()` is called on it
# later, so it is expected to behave like a `collections.Counter`.
# NOTE(review): `pickle.load` can execute arbitrary code; only load trusted files.
with open('data/out/counter_adjs.pickle', 'rb') as f:
    counter_adjs = pickle.load(f)

In [8]:
# Restore `counter_nouns` from its pickle file; `.most_common()` is called on it
# later, so it is expected to behave like a `collections.Counter`.
# NOTE(review): `pickle.load` can execute arbitrary code; only load trusted files.
with open('data/out/counter_nouns.pickle', 'rb') as f:
    counter_nouns = pickle.load(f)

In [9]:
# Restore `counter_verbs` from its pickle file; `.most_common()` is called on it
# later, so it is expected to behave like a `collections.Counter`.
# NOTE(review): `pickle.load` can execute arbitrary code; only load trusted files.
with open('data/out/counter_verbs.pickle', 'rb') as f:
    counter_verbs = pickle.load(f)

## Processing data

In [10]:
def my_ngrams(tokens, rg):
    """Collect every n-gram of `tokens` whose size lies in an inclusive range.

    Args:
        tokens: Sequence of tokens handed to `ngrams` once per size.
        rg: Indexable pair `(smallest_n, largest_n)`; both ends are included.

    Returns:
        A flat list of n-grams grouped by size in ascending order (all
        `smallest_n`-grams first, then the next size, and so on). The list is
        empty when `smallest_n > largest_n`.
    """
    return [
        gram
        for size in range(rg[0], rg[1] + 1)
        for gram in ngrams(tokens, size)
    ]

In [11]:
# Concatenate the text of every turn of every interview, one turn per line.
# Each turn is unpacked as a 3-tuple; only its last element (the text) is kept.
all_text = '\n'.join(
    text
    for interview in transcripts_df['INTERVIEW']
    for (_index, _name, text) in interview
)

In [12]:
# Read the whole text file into `all_text`.
# NOTE(review): this rebinds `all_text`, discarding the string the previous
# cell built from `transcripts_df` — confirm which of the two sources is intended.
# The encoding is pinned because the transcripts contain non-ASCII characters
# (e.g. '…', '’') and the platform default encoding is not UTF-8 everywhere.
# Assumes the file was written as UTF-8 — TODO confirm.
with open('data/out/all_text.txt', encoding='utf-8') as f:
    all_text = f.read()

### n-grams

In [13]:
def clean_text(text):
    """Lower-case `text`, drop punctuation-only tokens and re-join with spaces.

    Args:
        text: Raw text to normalise.

    Returns:
        A single string of the remaining tokens separated by one space each.
        Tokens made up entirely of ASCII punctuation characters (',', '...',
        '--', '``', "''", ...) are removed; tokens that merely contain
        punctuation (e.g. "don't") are kept.
    """
    punct_chars = set(punctuation)
    text = text.lower()
    # Turn the single-character ellipsis into three dots so that it is made of
    # characters listed in `string.punctuation` and can be filtered below.
    text = text.replace('…', '...')
    tokens = word_tokenize(text)
    # Test character by character: `token not in punctuation` would be a
    # substring check against the punctuation string and would let
    # multi-character tokens such as '...' or '--' through.
    words = [
        token for token in tokens
        if not all(char in punct_chars for char in token)
    ]
    return ' '.join(words)

In [14]:
%%time

# Normalise the full corpus in one call; timed to show the cost of this step.
cleaned_all_text = clean_text(all_text)

CPU times: user 4.32 s, sys: 16 ms, total: 4.34 s
Wall time: 4.34 s


In [15]:
# `clean_text` joined its tokens with single spaces, so a whitespace split
# recovers the flat token list of the whole corpus.
all_words = cleaned_all_text.split()

In [16]:
# All 2-grams over the corpus tokens (range (2, 2) means size 2 only).
bigrams = my_ngrams(all_words, (2, 2))

In [17]:
# All 3-grams over the corpus tokens (range (3, 3) means size 3 only).
trigrams = my_ngrams(all_words, (3, 3))

In [18]:
# How many of the most frequent words to request from each part-of-speech
# counter (passed to `most_common`), before blacklist filtering.
THRESHOLD = 100

#### Adjectives

In [19]:
# The THRESHOLD most frequent adjectives as (word, frequency) pairs.
top_adjs = counter_adjs.most_common(THRESHOLD)

In [20]:
# Entries of the adjective counter to exclude from the top list
# (presumably fillers and trailing-ellipsis transcription artefacts — verify).
adjs_blacklist = [
    'uh…', 'yea', 'and…', 'it…', 'there…', 'the…', 'that…',
]

In [21]:
# Remove blacklisted words while keeping the frequency ordering.
top_adjs = [entry for entry in top_adjs if entry[0] not in adjs_blacklist]

In [22]:
# Show the filtered (adjective, frequency) list.
top_adjs

[('good', 593),
 ('little', 444),
 ('big', 428),
 ('old', 332),
 ('different', 307),
 ('much', 242),
 ('many', 229),
 ('whole', 191),
 ('right', 181),
 ('long', 176),
 ('first', 168),
 ('fish', 152),
 ('last', 139),
 ('able', 129),
 ('next', 126),
 ('certain', 125),
 ('young', 124),
 ('great', 120),
 ('indian', 118),
 ('particular', 111),
 ('wild', 101),
 ('sure', 97),
 ('real', 91),
 ('high', 90),
 ('environmental', 85),
 ('important', 83),
 ('sweet', 82),
 ('inaudible', 80),
 ('hard', 79),
 ('nice', 76),
 ('new', 76),
 ('black', 73),
 ('small', 71),
 ('white', 71),
 ('traditional', 68),
 ('bad', 66),
 ('younger', 66),
 ('open', 57),
 ('customary', 56),
 ('natural', 56),
 ('older', 55),
 ('commercial', 55),
 ('main', 54),
 ('full', 53),
 ('best', 50),
 ('better', 49),
 ('common', 49),
 ('past', 46),
 ('native', 45),
 ('invasive', 44),
 ('red', 43),
 ('public', 42),
 ('social', 41),
 ('fresh', 40),
 ('early', 40),
 ('dry', 39),
 ('late', 38),
 ('rid', 38),
 ('specific', 38),
 ('wrong',

##### 2-grams

In [23]:
# For each top adjective, tally every bigram that contains it and keep only the
# bigrams seen more than once, ordered from most to least frequent.
adjs_bigrams_dict = {}
for adj, _freq in top_adjs:
    tally = Counter(gram for gram in bigrams if adj in gram)
    if tally:  # a word that occurs in no bigram gets no entry at all
        adjs_bigrams_dict[adj] = [
            (gram, count) for (gram, count) in tally.most_common() if count > 1
        ]

##### 3-grams

In [24]:
# For each top adjective, tally every trigram that contains it and keep only the
# trigrams seen more than once, ordered from most to least frequent.
adjs_trigrams_dict = {}
for adj, _freq in top_adjs:
    tally = Counter(gram for gram in trigrams if adj in gram)
    if tally:  # a word that occurs in no trigram gets no entry at all
        adjs_trigrams_dict[adj] = [
            (gram, count) for (gram, count) in tally.most_common() if count > 1
        ]

#### Nouns

In [25]:
# The THRESHOLD most frequent nouns as (word, frequency) pairs.
top_nouns = counter_nouns.most_common(THRESHOLD)

In [26]:
# Entries of the noun counter to exclude from the top list
# (presumably fillers, interjections and generic pronoun-like words — verify).
nouns_blacklist = [
    'yea', 'hmm', 'something', 'thing', 'well', 'anything', 'everything', 'did', 'somebody', 'yeah', 'yea…',
    'bit', 'huh', 'one', 'nothing', 'yep', 'someone',
]

In [27]:
# Remove blacklisted words while keeping the frequency ordering.
top_nouns = [entry for entry in top_nouns if entry[0] not in nouns_blacklist]

In [28]:
# Show the filtered (noun, frequency) list.
top_nouns

[('people', 1358),
 ('time', 1088),
 ('things', 985),
 ('way', 870),
 ('lot', 857),
 ('water', 496),
 ('community', 461),
 ('stuff', 458),
 ('kind', 439),
 ('years', 343),
 ('bush', 320),
 ('area', 317),
 ('island', 309),
 ('dad', 305),
 ('today', 299),
 ('day', 275),
 ('everybody', 259),
 ('kids', 259),
 ('house', 258),
 ('year', 257),
 ('land', 256),
 ('family', 242),
 ('walpole', 242),
 ('home', 242),
 ('road', 241),
 ('place', 239),
 ('fish', 228),
 ('food', 226),
 ('right', 210),
 ('stories', 209),
 ('marsh', 202),
 ('part', 202),
 ('environment', 201),
 ('days', 201),
 ('name', 198),
 ('money', 195),
 ('laughs', 187),
 ('guys', 187),
 ('person', 184),
 ('river', 180),
 ('ducks', 179),
 ('wood', 172),
 ('school', 166),
 ('use', 166),
 ('hunting', 161),
 ('times', 160),
 ('areas', 159),
 ('hall', 158),
 ('deer', 151),
 ('trees', 149),
 ('fishing', 149),
 ('life', 146),
 ('guy', 146),
 ('side', 142),
 ('ways', 138),
 ('ones', 137),
 ('cause', 136),
 ('end', 130),
 ('language', 130),

##### 2-grams

In [29]:
# For each top noun, tally every bigram that contains it and keep only the
# bigrams seen more than once, ordered from most to least frequent.
nouns_bigrams_dict = {}
for noun, _freq in top_nouns:
    tally = Counter(gram for gram in bigrams if noun in gram)
    if tally:  # a word that occurs in no bigram gets no entry at all
        nouns_bigrams_dict[noun] = [
            (gram, count) for (gram, count) in tally.most_common() if count > 1
        ]

##### 3-grams

In [30]:
# For each top noun, tally every trigram that contains it and keep only the
# trigrams seen more than once, ordered from most to least frequent.
nouns_trigrams_dict = {}
for noun, _freq in top_nouns:
    tally = Counter(gram for gram in trigrams if noun in gram)
    if tally:  # a word that occurs in no trigram gets no entry at all
        nouns_trigrams_dict[noun] = [
            (gram, count) for (gram, count) in tally.most_common() if count > 1
        ]

#### Verbs

In [31]:
# The THRESHOLD most frequent verbs as (word, frequency) pairs.
top_verbs = counter_verbs.most_common(THRESHOLD)

In [32]:
# Entries of the verb counter to exclude from the top list
# (presumably forms of "get" judged uninformative, plus the filler 'yea' — verify).
verbs_blacklist = [
    'get', 'got', 'done', 'getting', 'yea',
]

In [33]:
# Remove blacklisted words while keeping the frequency ordering.
top_verbs = [entry for entry in top_verbs if entry[0] not in verbs_blacklist]

In [34]:
# Show the filtered (verb, frequency) list.
top_verbs

[('know', 1867),
 ('used', 1307),
 ('remember', 1089),
 ('think', 992),
 ('going', 868),
 ('see', 655),
 ('say', 616),
 ('come', 552),
 ('use', 516),
 ('said', 512),
 ('went', 492),
 ('take', 468),
 ('put', 426),
 ('make', 382),
 ('want', 371),
 ('recall', 327),
 ('laughs', 289),
 ('came', 259),
 ('made', 251),
 ('told', 212),
 ('coming', 206),
 ('look', 205),
 ('guess', 197),
 ('bring', 195),
 ('need', 192),
 ('took', 190),
 ('hunting', 189),
 ('tell', 186),
 ('eat', 176),
 ('knew', 173),
 ('talking', 172),
 ('help', 169),
 ('give', 168),
 ('live', 163),
 ('call', 154),
 ('work', 152),
 ('keep', 147),
 ('lived', 143),
 ('says', 143),
 ('wanted', 140),
 ('talk', 140),
 ('thought', 138),
 ('start', 137),
 ('seen', 136),
 ('find', 129),
 ('mean', 128),
 ('goes', 127),
 ('looking', 126),
 ('gone', 126),
 ('called', 125),
 ('hunt', 125),
 ('hear', 123),
 ('happened', 122),
 ('cut', 118),
 ('taking', 114),
 ('saying', 114),
 ('comes', 112),
 ('trying', 111),
 ('started', 107),
 ('working', 

##### 2-grams

In [35]:
# For each top verb, tally every bigram that contains it and keep only the
# bigrams seen more than once, ordered from most to least frequent.
verbs_bigrams_dict = {}
for verb, _freq in top_verbs:
    tally = Counter(gram for gram in bigrams if verb in gram)
    if tally:  # a word that occurs in no bigram gets no entry at all
        verbs_bigrams_dict[verb] = [
            (gram, count) for (gram, count) in tally.most_common() if count > 1
        ]

##### 3-grams

In [36]:
# For each top verb, tally every trigram that contains it and keep only the
# trigrams seen more than once, ordered from most to least frequent.
verbs_trigrams_dict = {}
for verb, _freq in top_verbs:
    tally = Counter(gram for gram in trigrams if verb in gram)
    if tally:  # a word that occurs in no trigram gets no entry at all
        verbs_trigrams_dict[verb] = [
            (gram, count) for (gram, count) in tally.most_common() if count > 1
        ]