# Transcripts

In [1]:
import string
from collections import Counter
from collections import defaultdict

import networkx as nx
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from textblob import Word

## Loading data

### Transcripts

In [2]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'ALIASES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [4]:
transcripts_df.shape

(50, 5)

### Concepts

In [5]:
concepts = [
    'water', 'animal', 'medicine', 'clothing', 'plant', 'food', #'activity', 'place', 'tool',
    'treaty', 'ceremony', 'ancestor',
]

### Specific keywords

In [6]:
specific_words = {}
current_category = ''
with open('data/specific_keywords.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            if line.endswith(':'):
                current_category = line.split(':')[0].lower()
                specific_words[current_category] = []
            else:
                specific_words[current_category].append(line.lower())

### Most frequent words

In [7]:
most_freq_words = defaultdict(dict)
current_category = ''
with open('data/most_freq_words.txt') as f:
    for line in f:
        if line.startswith('#'):
            current_category = line.split('-')[0].lower()[2:-1]
        else:
            word = line[3:].split("'")[0].lower()
            freq = line[3:].split("'")[1][1:-3]
            most_freq_words[current_category].update({word: freq})
most_freq_words = dict(most_freq_words)

### Proper names

#### People

In [8]:
g = nx.read_gexf('data/out/people.gexf')
people = g.nodes()

#### Others

In [9]:
h = nx.read_gexf('data/out/other.gexf')
others = h.nodes()

## Processing data

In [10]:
def all_hypernyms(ss):
    return ss.closure(lambda x: x.hypernyms())

In [11]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

In [12]:
wn.synsets('Turtle')

[Synset('turtleneck.n.01'),
 Synset('turtle.n.02'),
 Synset('capsize.v.01'),
 Synset('turtle.v.02')]

In [13]:
stop = stopwords.words('english')
stop.extend(string.punctuation)

In [14]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stop and token and len(token) > 2]
    text = ' '.join(words)
    return text

In [15]:
cleaned_all_text = clean_text(all_text)

In [16]:
all_found_terms = set()

### Concepts

In [17]:
# concepts
wordnet_concepts_dict = {
    'water': ['body_of_water.n.01'],
    'animal': ['animal.n.01'],
    'medicine': ['medicine.n.02'],
    'clothing': ['clothing.n.01'],
#     'tool': ['instrumentality.n.03'],
#     'activity': ['interact.v.01', 'act.v.01'],
#     'place': ['location.n.01', 'building.n.01'],
    'plant': ['plant.n.02'],
    'food': ['food.n.01', 'food.n.02'],
    'treaty': ['treaty.n.01'],
    'ceremony': ['ceremony.n.01', 'ceremony.n.02', 'ceremony.n.03'],
    'ancestor': ['ancestor.n.01'],
}
synsets = [wn.synset(ss) for ls in wordnet_concepts_dict.values() for ss in ls]

In [18]:
def inverse_list_dict(d):
    i = {}
    for k in d:
        for v in d[k]:
            i[v] = k
    return i

In [19]:
inverse_concepts = inverse_list_dict(wordnet_concepts_dict)

In [20]:
inverse_concepts

{'ancestor.n.01': 'ancestor',
 'animal.n.01': 'animal',
 'body_of_water.n.01': 'water',
 'ceremony.n.01': 'ceremony',
 'ceremony.n.02': 'ceremony',
 'ceremony.n.03': 'ceremony',
 'clothing.n.01': 'clothing',
 'food.n.01': 'food',
 'food.n.02': 'food',
 'medicine.n.02': 'medicine',
 'plant.n.02': 'plant',
 'treaty.n.01': 'treaty'}

In [21]:
%%time

found_concepts = defaultdict(list)
for w in cleaned_all_text.split():
    sss = wn.synsets(w)
    for ss in sss:
        intersect = set(all_hypernyms(ss)).intersection(synsets)
        if intersect:
            all_found_terms.add(w)
            for ssi in intersect:
                cateogory = inverse_concepts[ssi.name()]
                if cateogory == 'ancestor':
                    # In interpersonal relationships, it is intersting to see if they talk about
                    # their mother/father (singular, their own) mothers/fathers (plural, everyone's in general).
                    # Also, if they say mother/father (more formal) or mom/dad (more familiar).
                    term = w # original word in the transcriptions
                else:
                    term = ss.name().split('.')[0] # generic concept
                found_concepts[cateogory].append(term)

CPU times: user 50 s, sys: 188 ms, total: 50.2 s
Wall time: 50.3 s


In [22]:
found_concepts_counter = {}
for k in found_concepts:
    found_concepts_counter[k] = Counter(found_concepts[k])

In [23]:
for k in found_concepts_counter:
    print(k, len(found_concepts_counter[k]))

treaty 3
food 286
ceremony 17
water 27
animal 251
plant 159
medicine 13
ancestor 27
clothing 71


In [24]:
found_concepts_counter

{'ancestor': Counter({'dad': 329,
          'dads': 5,
          'father': 110,
          'fathers': 12,
          'forefathers': 4,
          'gran': 2,
          'grandfather': 75,
          'grandfathers': 12,
          'grandma': 16,
          'grandmas': 1,
          'grandmother': 59,
          'grandmothers': 4,
          'grandpa': 20,
          'grandparent': 1,
          'grandparents': 41,
          'jacob': 2,
          'jacobs': 26,
          'mom': 107,
          'moms': 2,
          'mother': 95,
          'mothers': 2,
          'papa': 1,
          'parent': 6,
          'parents': 58,
          'pop': 11,
          'pops': 1,
          'simeon': 3}),
 'animal': Counter({'abalone': 1,
          'adult': 26,
          'alligator': 3,
          'american_bison': 3,
          'amphibian': 2,
          'ant': 13,
          'ass': 3,
          'baby': 80,
          'bass': 38,
          'basset': 7,
          'bat': 7,
          'bay': 75,
          'bear': 15,
          'b

In [25]:
len(all_found_terms)

973

### Specific keywords

In [26]:
specific_words

{'animals': ['quail',
  'bob white',
  'muskrat',
  'mink',
  'turtles',
  'snapping turtles',
  'turtle eggs',
  'fish',
  'frogs'],
 'others': ['fasting ceremony',
  'strawberry moon',
  'blueberry moon',
  'moon',
  'grandmother',
  'grandfather'],
 'plants': ['phragmites',
  'bull rush',
  'cat tail',
  'pin weed',
  'sweet grass',
  'milk weed',
  'tobacco',
  'prairie grass',
  'tall-grass prairie',
  'meadow',
  'chestnuts',
  'lady slipper',
  'wild rice',
  'birch',
  'horsetail'],
 'water': ['bkejwanong', 'lake st. clair', 'marsh']}

In [27]:
def reduce(ls):
    ls1 = ls.copy()
    changed = True
    while changed:
        ls2 = ls1.copy()
        for x in ls1.copy():
            for y in ls1.copy():
                if x != y and x in y:
                    ls1.remove(y)
        changed = len(ls1) != len(ls2)
    return ls2

In [29]:
specific_words_sing_plur = defaultdict(dict)
for category in specific_words:
    for w in specific_words[category]:
        specific_words_sing_plur[category][w] = list(reduce({
            w,
            w.replace('-', ''),
            w.replace('-', ' '),
            Word(w).pluralize().string,
            Word(w).singularize().string,
            Word(w.replace('-', '')).pluralize().string,
            Word(w.replace('-', ' ')).pluralize().string,
            Word(w.replace('-', '')).singularize().string,
            Word(w.replace('-', ' ')).singularize().string,
        }))
specific_words_sing_plur = dict(specific_words_sing_plur)

In [30]:
specific_words_sing_plur

{'animals': {'bob white': ['bob white'],
  'fish': ['fish'],
  'frogs': ['frog'],
  'mink': ['mink'],
  'muskrat': ['muskrat'],
  'quail': ['quail'],
  'snapping turtles': ['snapping turtle'],
  'turtle eggs': ['turtle egg'],
  'turtles': ['turtle']},
 'others': {'blueberry moon': ['blueberry moon'],
  'fasting ceremony': ['fasting ceremony', 'fasting ceremonies'],
  'grandfather': ['grandfather'],
  'grandmother': ['grandmother'],
  'moon': ['moon'],
  'strawberry moon': ['strawberry moon']},
 'plants': {'birch': ['birch'],
  'bull rush': ['bull rush'],
  'cat tail': ['cat tail'],
  'chestnuts': ['chestnut'],
  'horsetail': ['horsetail'],
  'lady slipper': ['lady slipper'],
  'meadow': ['meadow'],
  'milk weed': ['milk weed'],
  'phragmites': ['phragmite'],
  'pin weed': ['pin weed'],
  'prairie grass': ['prairie gras'],
  'sweet grass': ['sweet gras'],
  'tall-grass prairie': ['tallgrass prairie',
   'tall-grass prairie',
   'tall grass prairie'],
  'tobacco': ['tobacco'],
  'wild ri

In [32]:
specific_words_counter = defaultdict(dict)
for category in specific_words_sing_plur:
    group = specific_words_sing_plur[category]
    for k in group:
        if k not in all_found_terms:
            all_found_terms.add(k)
            specific_words_counter[category].setdefault(k, 0)
            for w in group[k]:
                specific_words_counter[category][k] += all_text.count(w)

In [33]:
specific_words_counter = dict(specific_words_counter)
for k in specific_words_counter:
    specific_words_counter[k] = Counter(specific_words_counter[k])

In [34]:
specific_words_counter

{'animals': Counter({'bob white': 4, 'snapping turtles': 3, 'turtle eggs': 3}),
 'others': Counter({'blueberry moon': 0,
          'fasting ceremony': 0,
          'moon': 13,
          'strawberry moon': 0}),
 'plants': Counter({'birch': 0,
          'bull rush': 3,
          'cat tail': 2,
          'chestnuts': 0,
          'horsetail': 0,
          'lady slipper': 1,
          'meadow': 0,
          'milk weed': 0,
          'phragmites': 64,
          'pin weed': 0,
          'prairie grass': 3,
          'sweet grass': 72,
          'tall-grass prairie': 7,
          'wild rice': 2}),
 'water': Counter({'bkejwanong': 0, 'lake st. clair': 0, 'marsh': 297})}

In [35]:
len(all_found_terms)

997

### Most frequent words

In [36]:
most_freq_words

{'adjectives': {'alive': ' 28',
  'big': ' 428',
  'commercial': ' 55',
  'environmental': ' 85',
  'geese': ' 28',
  'good': ' 593',
  'healthy': ' 33',
  'human': ' 25',
  'indian': ' 118',
  'invasive': ' 44',
  'little': ' 444',
  'live': ' 28',
  'local': ' 24',
  'native': ' 45',
  'natural': ' 56',
  'non-native': ' 26',
  'north': ' 23',
  'old': ' 332',
  'older': ' 55',
  'public': ' 42',
  'residential': ' 27',
  'right': ' 181',
  'social': ' 41',
  'sustainable': ' 33',
  'traditional': ' 68',
  'wild': ' 101',
  'young': ' 124',
  'younger': ' 66'},
 'nouns': {'area': ' 317',
  'areas': ' 159',
  'bush': ' 320',
  'community': ' 461',
  'corn': ' 114',
  'dad': ' 305',
  'day': ' 275',
  'days': ' 201',
  'deer': ' 151',
  'duck': ' 115',
  'ducks': ' 179',
  'environment': ' 201',
  'family': ' 242',
  'fish': ' 228',
  'fishing': ' 149',
  'food': ' 226',
  'garden': ' 109',
  'grass': ' 104',
  'ground': ' 110',
  'hall': ' 158',
  'home': ' 242',
  'horses': ' 128',
 

In [53]:
most_freq_words_counter = defaultdict(dict)
for pos in most_freq_words:
    group = most_freq_words[pos]
    for w in group:
#         if w not in all_found_terms:
            sss = wn.synsets(w)
            for ss in sss:
                intersect = set(all_hypernyms(ss)).intersection(synsets)
                if intersect:
                    all_found_terms.add(w)
                    for ssi in intersect:
                        category = inverse_concepts[ssi.name()]
                        if category == 'ancestor':
                            # In interpersonal relationships, it is intersting to see if they talk about
                            # their mother/father (singular, their own) mothers/fathers (plural, everyone's in general).
                            # Also, if they say mother/father (more formal) or mom/dad (more familiar).
                            term = w # original word in the transcriptions
                        else:
                            term = ss.name().split('.')[0] # generic concept
                        most_freq_words_counter[category][term] = most_freq_words[pos][w]

In [54]:
most_freq_words_counter

defaultdict(dict,
            {'ancestor': {'dad': ' 305', 'mom': ' 106'},
             'animal': {'deer': ' 151',
              'duck': ' 179',
              'fish': ' 228',
              'goose': ' 28',
              'homo': ' 25',
              'horse': ' 128',
              'kid': ' 259',
              'muskrat': ' 103',
              'young': ' 124'},
             'food': {'corn': ' 114',
              'corn_whiskey': ' 114',
              'cut': ' 118',
              'duck': ' 179',
              'eatage': ' 104',
              'fish': ' 228',
              'goose': ' 28',
              'ice': ' 120',
              'water': ' 496'},
             'plant': {'corn': ' 114',
              'grass': ' 104',
              'shrub': ' 320',
              'tree': ' 149'},
             'water': {'lake': ' 102', 'river': ' 180', 'rivulet': ' 62'}})

In [None]:
DAD no esta pero MOM esta arriba (107 veces y no 106 tho)

In [52]:
most_freq_words_counter = dict(most_freq_words_counter)
for k in most_freq_words_counter:
    most_freq_words_counter[k] = Counter(specific_words_counter[k])

KeyError: 'animal'

In [50]:
most_freq_words_counter

{}

In [None]:
len(all_found_terms)

### Proper names

#### People

In [8]:
# people
Buscar tal cual en el texto

#### Others

In [9]:
# others
Buscar tal cual en el texto

In [85]:
ss=wn.synsets('hammer')[1]

In [86]:
ss.hypernyms()

[Synset('hand_tool.n.01')]

In [88]:
list(all_hypernyms(ss))

[Synset('hand_tool.n.01'),
 Synset('tool.n.01'),
 Synset('implement.n.01'),
 Synset('instrumentality.n.03'),
 Synset('artifact.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [90]:
wn.synset('instrumentality.n.03').hyponyms()

[Synset('ceramic.n.01'),
 Synset('connection.n.03'),
 Synset('container.n.01'),
 Synset('conveyance.n.03'),
 Synset('device.n.01'),
 Synset('equipment.n.01'),
 Synset('furnishing.n.02'),
 Synset('hardware.n.02'),
 Synset('implement.n.01'),
 Synset('means.n.02'),
 Synset('medium.n.01'),
 Synset('system.n.01'),
 Synset('toiletry.n.01'),
 Synset('weaponry.n.01')]