# Transcripts

In [64]:
from collections import Counter

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

## Loading data

In [20]:
transcripts_df = pd.read_csv(
    'data/out/transcripts_1.csv',
    converters={'INTERVIEWERS': eval, 'INTERVIEWEES': eval, 'INTERVIEW': eval},
)
transcripts_df = transcripts_df[['ID', 'INTERVIEWERS', 'INTERVIEWEES', 'INTERVIEW']]

In [21]:
transcripts_df.shape

(68, 4)

In [22]:
transcripts_df.head()

Unnamed: 0,ID,INTERVIEWERS,INTERVIEWEES,INTERVIEW
0,Aimee Johnson – 17 September 2010,[Rick Fehr],[Aimee Johnson],"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Andrew Adult Male – 18 September 2010,[Dave White],[Andrew Peters],"[(0, Dave, So we’d like to ask you about thing..."
2,Anita Smith -,[Dave White],[Anita Smith],"[(0, Dave, How did we use to use the environme..."
3,Apollo Blackeagle – 27 October 2010,"[Rick Fehr, David White]",[Apollo Blackeagle],"[(0, Rick, Ok, its October 27th I believe, we’..."
4,Bill Sands,[Dave White],[Bill Sands],"[(0, Dave, In the past, there’s concern today ..."


## Processing data

In [39]:
all_text = '\n'.join([text for interview in transcripts_df.INTERVIEW for (index, name, text) in interview])

In [41]:
with open('data/out/all_text.txt', 'w') as f:
    f.write(all_text)

In [37]:
interviewers_names = ['Dave', 'Rick', 'Clint']

In [38]:
interviewers_text = '\n'.join(
    [text
     for interview in transcripts_df.INTERVIEW
     for (index, name, text) in interview
     if name not in interviewers_names]
)

In [40]:
with open('data/out/interviewers_text.txt', 'w') as f:
    f.write(interviewers_text)

### POS tags

In [67]:
%%time

tagged_all_text = pos_tag(word_tokenize(all_text))

CPU times: user 38.1 s, sys: 88 ms, total: 38.2 s
Wall time: 38.2 s


In [None]:
Quedarse con los NNP
Buscar las menciones de cada interviewee

### Keywords

#### All

In [68]:
def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    words = [token for token in tokens if token not in stopwords.words('english') and len(token) > 1]
    text = ' '.join(words)
    return text

In [69]:
%%time

cleaned_all_text = clean_text(all_text)

CPU times: user 1min 11s, sys: 4.98 s, total: 1min 16s
Wall time: 1min 16s


In [70]:
words = cleaned_all_text.split()

In [71]:
counter = Counter(words)

In [73]:
counter.most_common(20)

[('like', 2817),
 ('would', 2627),
 ('know', 2450),
 ('yea', 2440),
 ('people', 1995),
 ('yeah', 1924),
 ('go', 1883),
 ('used', 1824),
 ('re', 1735),
 ('one', 1702),
 ('remember', 1549),
 ('get', 1545),
 ('time', 1537),
 ('uh', 1517),
 ('think', 1465),
 ('things', 1404),
 ('well', 1390),
 ('back', 1364),
 ('got', 1273),
 ('going', 1250)]

### Proper nouns

In [55]:
%%time

tagged_all_text = pos_tag(word_tokenize(cleaned_all_text))

CPU times: user 34.6 s, sys: 64 ms, total: 34.6 s
Wall time: 34.6 s


## Saving data

In [23]:
# transcripts_df.to_csv('data/out/transcripts_2.csv', index=False)