# Transcripts

In [1]:
import os
from collections import Counter
from collections import defaultdict
from datetime import datetime

import pandas as pd
from docx import Document

## Loading data

In [2]:
# Directory holding the source .docx transcripts.
SRC_DIR = 'data/src/'

# Hand-curated metadata for every transcript, keyed by filename:
#     (interview date or None when unknown, interviewers, interviewees)
# Interviewee lists may also carry aliases, i.e. alternative speaker labels
# that appear in the transcript text for the same person.
TRANSCRIPT_METADATA = {
    'Aimee Johnson – 17 September 2010.docx': (
        datetime(2010, 9, 17), ['Rick Fehr'], ['Aimee Johnson']),
    'Andrew Adult Male – 18 September 2010.docx': (
        datetime(2010, 9, 18), ['Dave White'], ['Andrew Peters']),
    'Anita Smith - .docx': (
        None, ['Dave White'], ['Anita Smith']),
    'Apollo Blackeagle – 27 October 2010.docx': (
        datetime(2010, 10, 27), ['Rick Fehr', 'David White'], ['Apollo Blackeagle']),
    'Bill Sands.docx': (
        None, ['Dave White'], ['Bill Sands']),
    # Month follows the filename (24 May 2011).
    'Brenda Wheat – 24 May 2011.docx': (
        datetime(2011, 5, 24), ['Rick Fehr'], ['Brenda Wheat']),
    'Cameron 21 October 2010.docx': (
        datetime(2010, 10, 21), ['Dave White'], ['Cameron']),
    'Carl Smith – 15 October 2010.docx': (
        datetime(2010, 10, 15), ['David White'],
        ['Carl Smith (Resource Protection Officer)']),
    'Carmen and Jean Wrightman – 24 September 2010.docx': (
        datetime(2010, 9, 24), ['David White'],
        ['Carmen Wrightman', 'Jean Wrightman']),
    'Cheryl Adult female – 3 September 2010.docx': (
        datetime(2010, 9, 3), ['Rick Fehr'], ['Cheryl']),
    'Chief Joseph Gilbert – 6 October 2010.docx': (
        datetime(2010, 10, 6), ['Dave White'],
        ['Chief Joseph Gilbert', 'Chief Gilbert']),  # alias
    'Chris Riley – 16 September 2010.docx': (
        datetime(2010, 9, 16), ['Dave White', 'Rick Fehr'], ['Chris Riley']),
    'Daniel Adult male – 8 September 2010.docx': (
        datetime(2010, 9, 8), ['Rick Fehr'], ['Daniel']),
    'Darren Adult Male 15 September 2010.docx': (
        datetime(2010, 9, 15), ['Dave White', 'Rick Fehr'], ['Darren']),
    'Dean Jacobs – 3 September 2010.docx': (
        datetime(2010, 9, 3), ['Rick Fehr'], ['Dean Jacobs']),
    'Dot Peters.docx': (
        None, ['David White'], ['Dot Peters']),
    'Doug Adult male – 15 October 2010.docx': (
        datetime(2010, 10, 15), ['David White'],
        ['Doug (Resource Protection Officer)']),
    'Elaine Jacobs – 24 September 2010.docx': (
        datetime(2010, 9, 24), ['Rick Fehr'], ['Elaine Jacobs']),
    'Eli Baxter – 14 December 2010.docx': (
        datetime(2010, 12, 14), ['Rick Fehr'], ['Eli Baxter']),
    'Eric and Patty Isaac – 7 October 2010.docx': (
        datetime(2010, 10, 7), ['David White'], ['Eric Isaac', 'Patty Isaac']),
    'Frank Adult male – 28 September 2010.docx': (
        datetime(2010, 9, 28), ['Rick Fehr', 'David White'], ['Frank']),
    'Geogrina.docx': (
        datetime(2010, 10, 18), ['Dave White'], ['Georgina']),
    'Greg Isaac –.docx': (
        None, ['David White'], ['Greg Isaac']),
    'Growing up on Walpole.docx': (
        None, [], ['Jennie Blackbird']),
    'Gus 5 October 2010.docx': (
        datetime(2010, 10, 5), ['David White'], ['Gus']),
    'Harold Peters – 18 September 2010.docx': (
        datetime(2010, 9, 18), ['Dave White'], ['Harold Peters']),
    'Isabelle 13 October 2010.docx': (
        datetime(2010, 10, 13), ['David White'], ['Isabelle']),
    'J&L.docx': (
        datetime(2010, 11, 4), ['Dave White'],
        ['Jennie Blackbird', 'Lizzie Isaac', 'Jen', 'Liz']),  # alias
    'Jane – 24 September 2010.docx': (
        datetime(2010, 9, 24), ['Rick Fehr'], ['Jane Jacobs']),
    'Jasper and Eliza John –.docx': (
        None, ['David White'], ['Jasper John', 'Eliza John']),
    'Jerome.docx': (
        datetime(2010, 9, 30), ['Dave White'], ['Jerome']),
    'Jerry Adult male – 7 September 2010.docx': (
        datetime(2010, 9, 7), ['Rick Fehr'], ['Jerry']),
    'Joanne Day.docx': (
        None, ['David White'], ['Joanne Day']),
    'Joe and Carrie Elders – 7 October 2010.docx': (
        datetime(2010, 10, 7), ['David White'], ['Joe Isaac', 'Carrie Isaac']),
    'John Adult male – 6 December 2010.docx': (
        datetime(2010, 12, 6), ['David White'], ['John']),
    'Julia.docx': (
        datetime(2010, 10, 4), ['David White'], ['Julia']),
    'Karen Lalleen – 4 October 2010.docx': (
        datetime(2010, 10, 4), ['David White'], ['Karen Lalleen']),
    'Kenneth adult male – 21 September 2010.docx': (
        datetime(2010, 9, 21), ['Rick Fehr', 'Dave White'], ['Kenneth']),
    'Kennon Johnson – 10 November 2010.docx': (
        datetime(2010, 11, 10), ['Dave White'], ['Kennon Johnson']),
    'Kevin Smith – 6 December 2010.docx': (
        datetime(2010, 12, 6), ['Dave White'], ['Kevin Smith']),
    'Lee White 16 September 2010.docx': (
        datetime(2010, 9, 16), ['Dave White', 'Rick Fehr'], ['Lee White']),
    'Linda.docx': (
        datetime(2010, 11, 3), ['Clint Jacobs'], ['Linda White']),
    'Lloyd Day –.docx': (
        None, ['David White'], ['Lloyd Day', 'Woman speaking']),  # woman speaking
    'Lyndsay Sword – 14 October 2010.docx': (
        datetime(2010, 10, 14), ['David White'], ['Lyndsay Sword']),
    'Mark – 16 September 2010.docx': (
        datetime(2010, 9, 16), ['Dave White', 'Rick Fehr'], ['Mark']),
    'Mel Hoeksma – 7 September 2010.docx': (
        datetime(2010, 9, 7), ['Rick Fehr'], ['Mel Hoeksma']),
    'Mickey Aquash 15 September 2010.docx': (
        datetime(2010, 9, 15), ['Dave White', 'Rick Fehr'], ['Mickey Aquash']),
    'Morris and Lois.docx': (
        datetime(2010, 9, 24), ['David White'],
        ['Morris Wrightman', 'Lois Wrightman']),
    'Myrna and Shirley 4 November 2010.docx': (
        datetime(2010, 11, 4), ['Dave White'], ['Myrna', 'Shirley']),
    'Naomi Williams interview 2 September 2010.docx': (
        datetime(2010, 9, 2), ['Rick Fehr'], ['Naomi Williams']),
    'Organic Garden Group (Coding) – 10 November 2010.docx': (
        datetime(2010, 11, 10), ['Dave White', 'Clint Jacobs', 'Rick Fehr'],
        ['Patricia', 'Jessica', 'Becky', 'Joanne', 'Rose']),
    'Pat.docx': (
        datetime(2010, 11, 30), ['Dave White'], ['Pat Riley']),
    'Paul Adult male – 6 October 2010.docx': (
        datetime(2010, 10, 6), ['Dave White'], ['Paul']),
    'Puppydog.docx': (
        datetime(2010, 9, 22), ['Dave White', 'Rick Fehr', 'Clint Jacobs'],
        ['Ralph Johnson', 'PD']),  # alias
    'Rachel 22 November 2010.docx': (
        datetime(2010, 11, 22), ['Dave White'], ['Rachel']),
    'Ralph Jones – 14 September 2010.docx': (
        datetime(2010, 9, 14), ['Dave White'], ['Ralph Jones']),
    'Ron adult male – 10 November 2010.docx': (
        datetime(2010, 11, 10), ['Dave White'], ['Ron']),
    'Sarah Adult female – 17 September 2010.docx': (
        datetime(2010, 9, 17), ['Rick Fehr'], ['Sarah']),
    'Stanley.docx': (
        datetime(2010, 11, 1), ['David White'], ['Stanley']),
    'Stuart 8 October 2010.docx': (
        datetime(2010, 10, 8), ['David White'], ['Stuart']),
    'Suzie Isaac - 4 September 2010.docx': (
        datetime(2010, 9, 4), ['Rick Fehr'], ['Suzie Isaac']),
    'Terry Sands – 28 September 2010.docx': (
        datetime(2010, 9, 28), ['Rick Fehr', 'David White'], ['Terry Sands']),
    'Tom adult male 16 September 2010.docx': (
        datetime(2010, 9, 16), ['Dave White', 'Rick Fehr'], ['Tom']),
    'Vernon and Suzi Jones –.docx': (
        None, ['David White'], ['Vernon Jones', 'Suzie Jones']),
    'archie.docx': (
        None, ['Dave White'], ['Archie']),
    'cal.docx': (
        None, ['Dave White'], ['Cal']),
    'charlie.docx': (
        datetime(2010, 10, 21), ['Dave White'], ['Charles Wright']),
    'rita.docx': (
        datetime(2010, 11, 3), ['Dave White', 'Clint Jacobs'], ['Rita Sands']),
}

records = []
for filename in sorted(os.listdir(SRC_DIR)):
    # Names ending in '#' are skipped (presumably editor lock/temp files).
    if filename.endswith('#'):
        continue
    # metadata
    try:
        date, interviewers, interviewees = TRANSCRIPT_METADATA[filename]
    except KeyError:
        # Fail loudly rather than attach another transcript's metadata to
        # a file that has not been catalogued yet.
        raise ValueError(
            'No metadata registered for transcript file: {!r}'.format(filename)
        ) from None
    # text
    document = Document(os.path.join(SRC_DIR, filename))
    text = '\n'.join(par.text for par in document.paragraphs)
    records.append({
        'ID': os.path.splitext(filename)[0],  # filename without '.docx'
        'DATE': date,
        'INTERVIEWERS': interviewers,
        'INTERVIEWEES': interviewees,
        'TEXT': text,
    })

# dataframe: one row per transcript, built in a single pass
transcripts_df = pd.DataFrame(
    records,
    columns=['ID', 'DATE', 'INTERVIEWERS', 'INTERVIEWEES', 'TEXT'],
)
# Unknown dates (None) become NaT so the column is a proper datetime column.
transcripts_df['DATE'] = pd.to_datetime(transcripts_df['DATE'])

In [3]:
# Sanity check: one row per loaded transcript, five columns (ID, DATE, INTERVIEWERS, INTERVIEWEES, TEXT).
transcripts_df.shape

(68, 5)

In [4]:
# Preview the first rows to eyeball the IDs, dates and speaker lists.
transcripts_df.head()

Unnamed: 0,ID,DATE,INTERVIEWERS,INTERVIEWEES,TEXT
0,Aimee Johnson – 17 September 2010,2010-09-17,[Rick Fehr],[Aimee Johnson],Aimee Johnson – 17 September 2010\n\nInterview...
1,Andrew Adult Male – 18 September 2010,2010-09-18,[Dave White],[Andrew Peters],Andrew Peters – 18 September 2010\n\nInterview...
2,Anita Smith -,NaT,[Dave White],[Anita Smith],Interviewer: Dave White\nInterviewee: Anita Sm...
3,Apollo Blackeagle – 27 October 2010,2010-10-27,"[Rick Fehr, David White]",[Apollo Blackeagle],Apollo Blackeagle – 27 October 2010\n\nIntervi...
4,Bill Sands,NaT,[Dave White],[Bill Sands],Interviewer: Dave White\nInterviewee: Bill San...


## Checking data

### No date

In [5]:
# Transcripts whose interview date is unknown (DATE was set to None when loading).
transcripts_df[transcripts_df.DATE.isnull()]

Unnamed: 0,ID,DATE,INTERVIEWERS,INTERVIEWEES,TEXT
2,Anita Smith -,NaT,[Dave White],[Anita Smith],Interviewer: Dave White\nInterviewee: Anita Sm...
4,Bill Sands,NaT,[Dave White],[Bill Sands],Interviewer: Dave White\nInterviewee: Bill San...
15,Dot Peters,NaT,[David White],[Dot Peters],Dot Peters – \n\nInterviewer: David White\nInt...
22,Greg Isaac –,NaT,[David White],[Greg Isaac],Greg Isaac – \n\nInterviewer: David White\nInt...
23,Growing up on Walpole,NaT,[],[Jennie Blackbird],\t\t Growing up on Walpole\n\t\...
29,Jasper and Eliza John –,NaT,[David White],"[Jasper John, Eliza John]",Jasper and Eliza John – \n\nInterviewer: David...
32,Joanne Day,NaT,[David White],[Joanne Day],Joanne Day – \n\nInterviewer: David White\nInt...
42,Lloyd Day –,NaT,[David White],"[Lloyd Day, Woman speaking]",Lloyd Day – \n\nInterviewer: David White\nInte...
63,Vernon and Suzi Jones –,NaT,[David White],"[Vernon Jones, Suzie Jones]",Vernon and Suzie Jones – \n\nInterviewer: Davi...
64,archie,NaT,[Dave White],[Archie],Dave: So we’re trying to retrieve information ...


### No interviewer

In [6]:
# An empty INTERVIEWERS list is falsy, so `not x` selects transcripts with no interviewer recorded.
transcripts_df[transcripts_df.INTERVIEWERS.apply(lambda x: not x)]

Unnamed: 0,ID,DATE,INTERVIEWERS,INTERVIEWEES,TEXT
23,Growing up on Walpole,NaT,[],[Jennie Blackbird],\t\t Growing up on Walpole\n\t\...


### Interviewed more than once

In [7]:
# Flatten the per-transcript interviewee lists into one list of names
# (aliases included), keeping transcript order.
interviewee_list = []
for names_in_transcript in transcripts_df.INTERVIEWEES:
    interviewee_list.extend(names_in_transcript)

In [8]:
# A count above 1 means the same name is listed as interviewee in several transcripts.
Counter(interviewee_list).most_common(5)

[('Jennie Blackbird', 2),
 ('Dot Peters', 1),
 ('Kennon Johnson', 1),
 ('Myrna', 1),
 ('Carl Smith (Resource Protection Officer)', 1)]

In [9]:
# Show every transcript that lists 'Jennie Blackbird' among its interviewees.
transcripts_df[transcripts_df.INTERVIEWEES.apply(lambda x: 'Jennie Blackbird' in x)]

Unnamed: 0,ID,DATE,INTERVIEWERS,INTERVIEWEES,TEXT
23,Growing up on Walpole,NaT,[],[Jennie Blackbird],\t\t Growing up on Walpole\n\t\...
27,J&L,2010-11-04,[Dave White],"[Jennie Blackbird, Lizzie Isaac, Jen, Liz]","Date: Thursday November 4, 2010\nInterviewer: ..."


### People

In [10]:
def first_names(names):
    """Return the first whitespace-separated token of each name.

    The result has the same length and order as ``names``, so position ``k``
    of the output corresponds to ``names[k]``. A name with no tokens (empty
    or whitespace-only) raises ``IndexError``.
    """
    firsts = []
    for full_name in names:
        tokens = full_name.split()
        firsts.append(tokens[0])
    return firsts

#### Interviewers' names

In [11]:
# Distinct interviewer names across all transcripts (order is not significant).
interviewers = list({
    person
    for people in transcripts_df.INTERVIEWERS
    for person in people
})

In [12]:
# Every label an interviewer may be tagged with in a transcript: full name or first name only.
interviewers_names = interviewers + first_names(interviewers)

In [13]:
# Display the labels to spot spelling variants of the same person.
interviewers_names

['Clint Jacobs',
 'Rick Fehr',
 'Dave White',
 'David White',
 'Clint',
 'Rick',
 'Dave',
 'David']

In [14]:
# Maps each interviewer label (full name or first name) to one canonical
# short label, so that variants such as 'David White' and 'Dave' both
# resolve to 'Dave'.
interviewers_dict = {
    'Clint Jacobs': 'Clint',
    'David White': 'Dave',
    'Rick Fehr': 'Rick',
    'Dave White': 'Dave',
    'Clint': 'Clint',
    'David': 'Dave',
    'Rick': 'Rick',
    'Dave': 'Dave',
}

#### Odd names

In [15]:
# For each transcript, report any "<label>:" prefix that is neither a known
# speaker (interviewer or interviewee, full or first name) nor a header field
# ('Date', 'Interviewer...', 'Interviewee...').
for i, row in transcripts_df.iterrows():
    known_speakers = (
        interviewers_names + row.INTERVIEWEES + first_names(row.INTERVIEWEES)
    )
    odd = set()
    for raw_line in row.TEXT.split('\n'):
        label, colon, _ = raw_line.strip().partition(':')
        if not colon:
            # No colon on this line, so there is no speaker label to check.
            continue
        label = label.strip()
        if label == 'Date' or label.startswith('Interviewe'):
            continue
        if label not in known_speakers:
            odd.add(label)
    if odd:
        print(i, odd)

## Processing data

### Interviews

In [16]:
def get_interview_beginning_aux(text, names):
    """Return the index of the first line of ``text`` opened by a speaker tag.

    A line (after stripping surrounding whitespace) counts as a speaker line
    when it starts with one of ``names`` immediately followed by ``':'`` or
    ``' :'``. Lines are obtained by splitting ``text`` on ``'\\n'``. When no
    line matches, the total number of lines is returned.
    """
    tags = tuple(
        name + suffix
        for name in names
        for suffix in (':', ' :')
    )
    all_lines = text.split('\n')
    for position, raw_line in enumerate(all_lines):
        if raw_line.strip().startswith(tags):
            return position
    return len(all_lines)

def get_interview_beginning(row):
    """Return the line index at which the dialogue of a transcript row starts.

    The candidate speaker tags are all interviewer labels plus the row's
    interviewees, both as listed and reduced to their first names.
    """
    speakers = list(interviewers_names)
    speakers.extend(row.INTERVIEWEES)
    speakers.extend(first_names(row.INTERVIEWEES))
    return get_interview_beginning_aux(row.TEXT, speakers)

In [17]:
# Row-wise: store the index of the first text line that opens with a known speaker tag.
transcripts_df['BEGINNING'] = transcripts_df.apply(get_interview_beginning, axis=1)

In [18]:
def get_interview_aux(interviewees, text, beginning):
    """Split a transcript into a list of ``(index, speaker, utterance)`` tuples.

    Only the lines of ``text`` from position ``beginning`` onwards are used,
    and blank lines are dropped. For each remaining line, the part before the
    first ``':'`` is looked up as a speaker label:

    * an interviewer label is replaced by its entry in ``interviewers_dict``;
    * a label found in ``interviewees`` is kept as is;
    * an interviewee first name is mapped to the first entry of
      ``interviewees`` with that first name.

    Any other line is treated as a continuation: it is attributed to the
    previous speaker (``''`` if there is none yet) and kept in full.
    ``index`` numbers the emitted tuples from 0.
    """
    short_names = first_names(interviewees)
    turns = []
    last_speaker = ''
    for raw_line in text.split('\n')[beginning:]:
        content = raw_line.strip()
        if not content:
            continue
        tag, _, utterance = content.partition(':')
        tag = tag.strip()
        utterance = utterance.strip()
        if tag in interviewers_dict:
            last_speaker = interviewers_dict[tag]
        elif tag in interviewees:
            last_speaker = tag
        elif tag in short_names:
            last_speaker = interviewees[short_names.index(tag)]
        else:
            # Not a speaker tag: the whole line belongs to the current speaker.
            utterance = content
        turns.append((len(turns), last_speaker, utterance))
    return turns

def get_interview(row):
    """Return the parsed interview of a transcript row as a string.

    The string is the ``str()`` form of the list of tuples produced by
    ``get_interview_aux`` for the row's interviewees, text and beginning index.
    """
    parsed = get_interview_aux(row.INTERVIEWEES, row.TEXT, row.BEGINNING)
    return str(parsed)

In [19]:
# Parse every transcript into its list of (index, speaker, utterance) tuples.
# The lists are built directly rather than being serialised with str() and
# rebuilt with eval(): the result is the same, without evaluating generated
# source code and without the extra round trip over every transcript.
parsed_interviews = [
    get_interview_aux(names, text, beginning)
    for names, text, beginning in zip(
        transcripts_df.INTERVIEWEES,
        transcripts_df.TEXT,
        transcripts_df.BEGINNING,
    )
]
# Wrapping in an object Series keeps each list as a single cell value.
transcripts_df['INTERVIEW'] = pd.Series(
    parsed_interviews, index=transcripts_df.index, dtype=object
)

In [20]:
# Preview the table again, now with the BEGINNING and INTERVIEW columns.
transcripts_df.head()

Unnamed: 0,ID,DATE,INTERVIEWERS,INTERVIEWEES,TEXT,BEGINNING,INTERVIEW
0,Aimee Johnson – 17 September 2010,2010-09-17,[Rick Fehr],[Aimee Johnson],Aimee Johnson – 17 September 2010\n\nInterview...,5,"[(0, Rick, Ok. We’re recording now, I’m sitti..."
1,Andrew Adult Male – 18 September 2010,2010-09-18,[Dave White],[Andrew Peters],Andrew Peters – 18 September 2010\n\nInterview...,5,"[(0, Dave, So we’d like to ask you about thing..."
2,Anita Smith -,NaT,[Dave White],[Anita Smith],Interviewer: Dave White\nInterviewee: Anita Sm...,3,"[(0, Dave, How did we use to use the environme..."
3,Apollo Blackeagle – 27 October 2010,2010-10-27,"[Rick Fehr, David White]",[Apollo Blackeagle],Apollo Blackeagle – 27 October 2010\n\nIntervi...,5,"[(0, Rick, Ok, its October 27th I believe, we’..."
4,Bill Sands,NaT,[Dave White],[Bill Sands],Interviewer: Dave White\nInterviewee: Bill San...,3,"[(0, Dave, In the past, there’s concern today ..."


### Checking interviews

In [21]:
# Print, per transcript, how many entries were attributed to each speaker.
for i, row in transcripts_df.iterrows():
    speaker_tally = Counter(speaker for _, speaker, _ in row.INTERVIEW)
    print(i, speaker_tally)

0 Counter({'Aimee Johnson': 106, 'Rick': 92})
1 Counter({'Andrew Peters': 194, 'Dave': 193})
2 Counter({'Dave': 32, 'Anita Smith': 32})
3 Counter({'Apollo Blackeagle': 13, 'Rick': 7, 'Dave': 3})
4 Counter({'Bill Sands': 32, 'Dave': 30})
5 Counter({'Rick': 14, 'Brenda Wheat': 14})
6 Counter({'Dave': 59, 'Cameron': 59})
7 Counter({'Dave': 92, 'Carl Smith (Resource Protection Officer)': 91})
8 Counter({'Dave': 267, 'Carmen Wrightman': 175, 'Jean Wrightman': 159})
9 Counter({'Cheryl': 45, 'Rick': 42})
10 Counter({'Chief Gilbert': 83, 'Dave': 81})
11 Counter({'Rick': 48, 'Chris Riley': 45, 'Dave': 29, 'Clint': 1})
12 Counter({'Rick': 52, 'Daniel': 50})
13 Counter({'Darren': 42, 'Dave': 30, 'Rick': 28})
14 Counter({'Dean Jacobs': 41, 'Rick': 40})
15 Counter({'Dot Peters': 391, 'Dave': 385})
16 Counter({'Dave': 92, 'Doug (Resource Protection Officer)': 91})
17 Counter({'Elaine Jacobs': 77, 'Rick': 74})
18 Counter({'Eli Baxter': 35, 'Rick': 12})
19 Counter({'Eric Isaac': 170, 'Dave': 166, 'Pat

#### Duplicated interviews

In [22]:
# Heuristic duplicate detection: group transcripts by the tuple of their
# per-speaker entry counts (in order of first appearance) and print every
# group of IDs that share exactly the same tuple.
interviews_values_dict = defaultdict(list)
for _, record in transcripts_df.iterrows():
    speaker_counts = Counter(speaker for _, speaker, _ in record.INTERVIEW)
    signature = tuple(speaker_counts.values())
    interviews_values_dict[signature].append(record.ID)
for ids in interviews_values_dict.values():
    if len(ids) < 2:
        continue
    print(*ids, sep='\n')
    print()

Carmen and Jean Wrightman – 24 September 2010
Morris and Lois

Anita Smith - 
Geogrina

Andrew Adult Male – 18 September 2010
Harold Peters – 18 September 2010

Cameron 21 October 2010
charlie

Lee White 16 September 2010
Tom adult male 16 September 2010

Carl Smith – 15 October 2010
Doug Adult male – 15 October 2010

Jerry Adult male – 7 September 2010
Mel Hoeksma – 7 September 2010

Chief Joseph Gilbert – 6 October 2010
Paul Adult male – 6 October 2010

Frank Adult male – 28 September 2010
Terry Sands – 28 September 2010

John Adult male – 6 December 2010
Kevin Smith – 6 December 2010

Kennon Johnson – 10 November 2010
Ron adult male – 10 November 2010

Elaine Jacobs – 24 September 2010
Jane – 24 September 2010

Julia
Karen Lalleen – 4 October 2010

Bill Sands
Jerome

Aimee Johnson – 17 September 2010
Sarah Adult female – 17 September 2010



## Saving data

In [23]:
# Persist the processed table; index=False leaves the row index out of the file.
transcripts_df.to_csv('data/out/transcripts_1.csv', index=False)