# Transcripts

In [17]:
import string

import networkx as nx
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

## Loading data

### Transcripts

In [2]:
# Load the transcripts CSV, parse the list-valued columns from their textual
# form, and keep only the columns used in this notebook.
# NOTE(review): `eval` executes whatever expression is stored in a cell, so
# this is only acceptable while the CSV is a trusted, locally generated file.
# Consider `ast.literal_eval` if the cells hold plain Python literals.
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters=dict.fromkeys(['INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW'], eval),
).loc[:, ['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'ALIASES', 'INTERVIEW']]

In [3]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,ALIASES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],[],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Anita Smith -,[Dave White],[Anita Smith],[],"[(0, Dave, How did we use to use the environme..."
2,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],[],"[(0, Rick, Ok, its October 27th I believe, we’..."
3,Bill Sands,[Dave White],[Bill Sands],[],"[(0, Dave, In the past, there’s concern today ..."
4,Brenda Wheat – 24 May 2011,[Rick Fehr],[Brenda Wheat],[],"[(0, Rick, So what we’ll be using is just a li..."


In [4]:
transcripts_df.shape

(50, 5)

### Concepts

In [5]:
# Concept labels of interest. The notes under "Processing data" plan to map
# each of them to its corresponding WordNet synsets.
concepts = [
    'water', 'animal', 'medicine', 'clothe', 'tool', 'activity', 'place', 'plant', 'food',
    'treaty', 'ceremony', 'ancestor',
]

### Specific keywords

In [6]:
# Build {category: [keyword, ...]} from the keyword file.
# A non-blank line ending in ':' opens a new category (the text before the
# first ':' lower-cased); every other non-blank line is a lower-cased keyword
# appended to the most recently opened category.
specific_keywords = {}
current_category = ''
with open('data/specific_keywords.txt') as f:
    for line in map(str.strip, f):
        if not line:
            continue
        if line.endswith(':'):
            current_category = line.partition(':')[0].lower()
            specific_keywords[current_category] = []
            continue
        specific_keywords[current_category].append(line.lower())

### Most frequent words

In [7]:
# Parse the frequency file into {category: [word, ...]}.
#   * A line starting with '#' opens a category; its name is the text between
#     the '#' and the first '-', trimmed and lower-cased.
#   * Any other non-blank line contributes one word: the text after the first
#     three characters, up to the next single quote (presumably lines look
#     like "(u'word', count)" -- TODO confirm against the data file).
most_freq_words = {}
current_category = ''
with open('data/most_freq_words.txt') as f:
    for line in f:
        # Skip blank lines, as the specific-keywords parser does; otherwise
        # they would register an empty word, or raise KeyError when they
        # appear before the first category header.
        if not line.strip():
            continue
        if line.startswith('#'):
            # Trim instead of slicing fixed offsets so the name does not
            # depend on exact spacing or on a trailing newline being present.
            current_category = line[1:].split('-')[0].strip().lower()
            most_freq_words[current_category] = []
        else:
            word = line[3:].split("'")[0].lower()
            most_freq_words[current_category].append(word)
# NOTE(review): sorted() over a dict returns only its keys, so after this line
# `most_freq_words` is a sorted list of category names and the word lists
# collected above are discarded. Kept as-is because the intended shape is not
# visible here; if the words are needed, use
# dict(sorted(most_freq_words.items())) instead.
most_freq_words = sorted(most_freq_words)

### Proper names

#### People

In [8]:
# Read the people graph from its GEXF file and expose its nodes as `people`.
g = nx.read_gexf('data/out/people.gexf')
people = g.nodes()

#### Others

In [9]:
# Read the graph of other proper names from its GEXF file and expose its
# nodes as `others`.
h = nx.read_gexf('data/out/other.gexf')
others = h.nodes()

## Processing data

In [10]:
def all_hypernyms(ss):
    """Return the transitive closure of ``ss`` over its hypernym relation.

    Delegates to ``ss.closure`` with a relation that maps each synset to its
    direct ``hypernyms()``; the result is whatever ``closure`` produces.
    """
    def direct_hypernyms(synset):
        return synset.hypernyms()

    return ss.closure(direct_hypernyms)

In [11]:
# Join the text of every (index, name, text) entry of every interview into a
# single newline-separated string.
all_text = '\n'.join(
    utterance
    for turns in transcripts_df['INTERVIEW']
    for (_, _, utterance) in turns
)

In [12]:
wn.synsets('Turtle')

[Synset('turtleneck.n.01'),
 Synset('turtle.n.02'),
 Synset('capsize.v.01'),
 Synset('turtle.v.02')]

In [None]:
# Tokens to discard during cleaning: NLTK's English stop words plus every
# ASCII punctuation character.
# `list.extend` mutates in place and returns None, so chaining it into the
# assignment would bind `stop` to None and make every `token in stop` test
# raise TypeError. Build the collection explicitly instead; a set also gives
# constant-time membership checks.
stop = set(stopwords.words('english')) | set(string.punctuation)

In [None]:
def clean_text(text):
    """Lower-case ``text``, tokenize it and drop unwanted tokens.

    A token is kept only when it is not contained in the module-level ``stop``
    collection and is longer than two characters. The surviving tokens are
    returned joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop or len(token) <= 2:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Apply the cleaning step to the concatenated interview text.
cleaned_all_text = clean_text(all_text)

### Concepts

In [5]:
# concepts
# TODO: find the corresponding WordNet synsets.
# TODO: see the "behaviours" article (word-window approach) for clothe and tools (instrumentality).

### Specific keywords

In [6]:
# specific_keywords
# TODO: search for them verbatim in the text, in singular, plural, masculine and feminine forms.

### Most frequent words

In [7]:
# most_freq_words
# TODO: process them by POS tag.

### Proper names

#### People

In [8]:
# people
# TODO: search for them verbatim in the text.

#### Others

In [9]:
# others
# TODO: search for them verbatim in the text.

In [85]:
# Take the second synset WordNet returns for 'hammer' as a sample for the
# hypernym exploration in the following cells.
ss=wn.synsets('hammer')[1]

In [86]:
ss.hypernyms()

[Synset('hand_tool.n.01')]

In [88]:
list(all_hypernyms(ss))

[Synset('hand_tool.n.01'),
 Synset('tool.n.01'),
 Synset('implement.n.01'),
 Synset('instrumentality.n.03'),
 Synset('artifact.n.01'),
 Synset('whole.n.02'),
 Synset('object.n.01'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

In [90]:
wn.synset('instrumentality.n.03').hyponyms()

[Synset('ceramic.n.01'),
 Synset('connection.n.03'),
 Synset('container.n.01'),
 Synset('conveyance.n.03'),
 Synset('device.n.01'),
 Synset('equipment.n.01'),
 Synset('furnishing.n.02'),
 Synset('hardware.n.02'),
 Synset('implement.n.01'),
 Synset('means.n.02'),
 Synset('medium.n.01'),
 Synset('system.n.01'),
 Synset('toiletry.n.01'),
 Synset('weaponry.n.01')]