Purpose:  Load raw debate transcripts, split them into per-speaker rows, and write one processed CSV per debate.
Author:  Carol Sniegoski
Date:  April 12, 2016
Course:  DSE MAS Capstone, Spring 2016

In [9]:
import pandas as pd
import re
# Progress marker: confirms that the import cell ran to completion.
print('done')

done


In [10]:

def correct_spellings(line, correct_words):
    """Return `line` with simple misspellings of the given words corrected.

    A misspelling is the correct word with exactly one character removed,
    where the removed character is neither the first nor the last character
    of the word. Matching is case-insensitive; every match is replaced by the
    word exactly as spelled in `correct_words`.

    A misspelling is only replaced when it is not preceded by a word
    character, so it is never "corrected" inside a longer word (e.g. the
    "pal:" in "principal:" is not turned into "PAUL:").

    Args:
        line: Text to correct.
        correct_words: Iterable of correctly spelled words, treated as plain
            text (regex metacharacters have no special meaning).

    Returns:
        The corrected line, with the "O'Malley" apostrophe variations
        normalised as well.
    """
    for correct_word in correct_words:
        word_len = len(correct_word)
        for i in range(1, word_len - 1):

            # Create the misspelled version of the word to look for.
            missp = correct_word[:i] + correct_word[i + 1:]

            # Escape the misspelling so it is matched literally, and require
            # that it does not start in the middle of another word.
            pattern = re.compile(r'(?<!\w)' + re.escape(missp), re.IGNORECASE)

            # A function replacement inserts correct_word verbatim; a plain
            # string would have backslashes and group references expanded.
            line = pattern.sub(lambda match, word=correct_word: word, line)

    # Fixing the "O'Malley" spelling variations.
    line = line.replace("â€™", "'")
    line = line.replace("O' ", "O'")

    return line

        
def get_splits_by_speaker(line, candidates, moderators, audience):
    """Split `line` into text chunks, one per speaker turn or audience event.

    Each chunk starts at the beginning of the line, at a new speaker tag, or
    at an audience event.

    Args:
        line: One line of transcript text.
        candidates: List of candidate names.
        moderators: List of moderator names.
        audience: List of audience event names.

    Returns:
        A list of (speaker_name, speaker_type, text) tuples in line order.
        Text with no identifiable speaker has '' for both name and type.
        Chunks containing only whitespace are dropped, except that a line
        with no split points at all is returned whole as a single chunk.
    """
    # Get splits for each speaker type. Each helper returns a sequence of
    # (name, type, start_index) tuples; list() keeps the concatenation valid
    # even if a helper hands back a lazy iterable.
    splits = (list(get_candidate_splits(line, candidates))
              + list(get_moderator_splits(line, moderators))
              + list(get_audience_splits(line, audience)))

    # If there are no split points, just return the line, with speaker unknown.
    if not splits:
        return [('', '', line)]

    # Sort by the 3rd element, which is the start index in the line.
    splits.sort(key=lambda split: split[2])

    chunks = []

    # If the first split does not start at index zero, the text before it has
    # no known speaker.
    first_start = splits[0][2]
    if first_start != 0:
        chunks.append(('', '', line[:first_start]))

    # Each chunk runs from its own start index to the start of the next split,
    # or to the end of the line for the last split.
    last = len(splits) - 1
    for i, (speaker_name, speaker_type, ix_start) in enumerate(splits):
        ix_end = splits[i + 1][2] if i < last else len(line)
        chunks.append((speaker_name, speaker_type, line[ix_start:ix_end]))

    # Discard text chunks with no content but whitespace.
    return [chunk for chunk in chunks if chunk[2].strip()]

def get_candidate_splits(line, candidates):
    """Find the candidate speaker tags ("NAME:") in `line`.

    The split point is placed before the name. Matching is case-insensitive,
    names are matched literally, and a name is only recognised when it is not
    preceded by a word character (so 'BUSH' does not match in "AMBUSH:").

    Args:
        line: One line of transcript text.
        candidates: List of candidate names; empty names are ignored.

    Returns:
        A list of (name_as_written_in_line, 'candidate', start_index) tuples,
        in order of appearance. Empty when `candidates` has no usable names.
    """
    names = [re.escape(name) for name in candidates if name]

    # Without this guard the pattern would degenerate to '():' and report a
    # split with an empty name at every colon in the line.
    if not names:
        return []

    compiled_pattern = re.compile(r'(?<!\w)(' + '|'.join(names) + '):', re.IGNORECASE)

    return [(m.group(1), 'candidate', m.start())
            for m in compiled_pattern.finditer(line)]

def get_moderator_splits(line, moderators):
    """Find the moderator speaker tags in `line`.

    A tag is a moderator name, optionally followed by spaces and a bracketed
    note made of letters and spaces (e.g. "RAMOS [through translator]:"),
    followed by a colon. The split point is placed before the name. Matching
    is case-insensitive, names are matched literally, and a name is only
    recognised when it is not preceded by a word character.

    Args:
        line: One line of transcript text.
        moderators: List of moderator names; empty names are ignored.

    Returns:
        A list of (name_as_written_in_line, 'moderator', start_index) tuples,
        in order of appearance. Empty when `moderators` has no usable names.
    """
    # Dropping empty names up front means no match can have an empty name,
    # which is what the result was previously filtered for.
    names = [re.escape(name) for name in moderators if name]
    if not names:
        return []

    # Raw strings keep the regex escapes (\[ and \]) out of Python's own
    # string-escape processing.
    pattern = r'(?<!\w)(' + '|'.join(names) + r')(?: *\[[a-z ]*\])?:'
    compiled_pattern = re.compile(pattern, re.IGNORECASE)

    return [(m.group(1), 'moderator', m.start())
            for m in compiled_pattern.finditer(line)]

def get_audience_splits(line, audience):
    """Find the split points around bracketed audience events in `line`.

    An event is split off both before and after: one split is placed at an
    opening '(' or '[' that is directly followed by an event name, and one
    directly after a ')' or ']' that directly follows an event name. Matching
    is case-insensitive and event names are matched literally.

    Args:
        line: One line of transcript text.
        audience: List of audience event names; empty names are ignored.

    Returns:
        A list of (name, type, index) tuples: all "before" splits first, as
        (event_as_written_in_line, 'audience', index_of_opening_bracket),
        then all "after" splits as ('', '', index_after_closing_bracket).
        No "after" split is produced when the event ends the line.
        Empty when `audience` has no usable names.
    """
    events = [re.escape(event) for event in audience if event]

    # Without this guard the patterns would match every bracket in the line.
    if not events:
        return []

    alternation = '(' + '|'.join(events) + ')'
    return_list = []

    # First split before.
    opening = re.compile(r'[\(\[]' + alternation, re.IGNORECASE)
    for m in opening.finditer(line):
        return_list.append((m.group(1), 'audience', m.start()))

    # Then split after. The text that follows has no known speaker.
    closing = re.compile(alternation + r'[\)\]]', re.IGNORECASE)
    for m in closing.finditer(line):
        if m.end() != len(line):
            return_list.append(('', '', m.end()))

    return return_list

def add_line(speaker, speaker_type, speech, start_time, duration, rows_list):
    """Append one transcript row, as a dict, to `rows_list`.

    `rows_list` is modified in place; nothing is returned. The dict keys are
    'speaker', 'speaker_type', 'text', 'start_time' and 'duration'.
    """
    row = {
        'speaker': speaker,
        'speaker_type': speaker_type,
        'text': speech,
        'start_time': start_time,
        'duration': duration,
    }
    rows_list.append(row)
    
def get_candidates(cstr):
    """Return the upper-cased last names of the candidates in this debate.

    Args:
        cstr: Comma-separated full names, e.g. "Donald Trump, Jeb Bush".

    Returns:
        A list with one entry per non-blank name: the last
        whitespace-separated word of the name, upper-cased. This is the same
        rule get_moderators() uses, so names of one, two or more words and
        names with repeated spaces are all handled. Blank entries (such as
        the one after a trailing comma) are skipped.
    """
    last_names = []
    for entry in cstr.split(','):
        # split() with no argument ignores leading, trailing and repeated
        # whitespace, so it never yields an empty word.
        words = entry.upper().split()
        if words:
            last_names.append(words[-1])
    return last_names

def get_moderators(mstr):
    """Return the upper-cased last names of the moderators of this debate.

    `mstr` is a comma-separated string of full names. For each entry the
    surrounding whitespace is removed, the name is upper-cased and split on
    single spaces, and the final piece is kept.
    """
    last_names = []
    for entry in mstr.split(','):
        words = entry.strip().upper().split(' ')
        last_names.append(words[-1])
    return last_names

def get_audience():
    """Return a new list of the audience event names looked for in transcripts."""
    return ['APPLAUSE', 'BOOING', 'CHEERING', 'LAUGHTER']

def get_event_info():
    """Return a map of strings describing the event.

    Intended keys are event name, date, party, location, start_time, end_time
    and duration; at present the map is always empty.
    """
    return dict()

def get_infilename(row):
    """Generate the conventional infile name for this debate.

    The name is the row's 'Party' value followed by its 'Ordinal' value as an
    integer and the suffix '.txt', e.g. 'R1.txt'.
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return ''.join([party, ordinal, '.txt'])

def get_outfilename(row):
    """Generate the conventional outfile name for this debate.

    The name is the row's 'Party' value followed by its 'Ordinal' value as an
    integer and the suffix '_proc.csv', e.g. 'R1_proc.csv'.
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return ''.join([party, ordinal, '_proc.csv'])

# Progress marker: confirms that every helper above has been defined.
print('done')

done


In [364]:
# Smoke test for the splitting helpers: one moderator line that ends with an
# audience event.
testline = "BLITZER: We're live here at the University of Houston for the 10th Republican presidential debate. (applause)"

# The value returned by this call is neither assigned nor the last expression
# of the cell, so it is discarded; the call only shows that nothing raises.
get_audience_splits(testline, ['APPLAUSE', 'LAUGHTER'])
result = get_splits_by_speaker(testline, ['TRUMP'],['BLITZER'],['APPLAUSE', 'LAUGHTER'])
print result

[('BLITZER', 'moderator', "BLITZER: We're live here at the University of Houston for the 10th Republican presidential debate. "), ('applause', 'audience', '(applause)')]


In [11]:
# Function def

def process_transcript(infilename, outfilename, candidates, moderators, audience):
    """Load and parse a debate transcript text file and write the result to csv.

    The debate candidates, moderators and audience events are assumed to be
    exactly as provided. The file is read line by line; each non-blank line
    is spell-corrected and split into single-speaker chunks. Consecutive
    chunks from the same (or an unidentified) speaker are joined into one
    speech. One row is written per speech and one per audience event.

    Start times and durations are counted in characters of accumulated
    speech text, not in seconds. Text that appears before the first
    identified speaker advances the clock but is not written out.

    Args:
        infilename: Path of the transcript text file to read.
        outfilename: Path of the csv file to write.
        candidates: List of candidate names.
        moderators: List of moderator names.
        audience: List of audience event names.
    """
    # Nominal duration given to every audience event.
    est_duration = 50

    # Speaker tags to spell-correct; built once, not once per line.
    # NOTE(review): audience names get ':' appended too, so a misspelled
    # "(aplause)" is not corrected -- confirm whether that is intended.
    correct_words = [x + ':' for x in candidates + moderators + audience]

    rows_list = []
    speaker = ""
    speaker_type = ""
    speech = ""
    time = 0

    with open(infilename) as infile:
        for line in infile:

            line = line.strip()
            if len(line) == 0:
                continue
            line = correct_spellings(line, correct_words)

            # Split line into chunks, each from a single speaker, of format (speaker, speaker_type, speech).
            splits = get_splits_by_speaker(line, candidates=candidates, moderators=moderators, audience=audience)

            for new_speaker, new_speaker_type, new_speech in splits:

                # Audience event, e.g. "(laughter, applause)".
                # Do not change the speaker or end the current speech.
                # Add one row per event named in the chunk, timed at the
                # current position inside the speech being accumulated.
                if new_speaker_type == 'audience':
                    for event in audience:
                        if event.lower() in new_speech.lower():
                            add_line(event, "audience", new_speech, time + len(speech), est_duration, rows_list)

                # No change of speaker.
                # Just continue to accumulate the current speech text.
                elif new_speaker == '' or new_speaker == speaker:
                    speech += (' ' + new_speech.strip())

                # Change of speaker, e.g. "TRUMP: Blah."
                # Cut off the current speech and add it to the results list.
                # Change speaker and speaker type. Begin accumulating the next speech.
                else:
                    # `speaker` is '' only before the first identified
                    # speaker. That placeholder speech is skipped here rather
                    # than removed later by position, because an audience row
                    # may already precede it in rows_list.
                    if speaker != '':
                        add_line(speaker, speaker_type, speech, time, len(speech), rows_list)

                    # Begin the next speech.
                    time += len(speech)
                    speech = new_speech
                    speaker = new_speaker
                    speaker_type = new_speaker_type

    # Flush the speech still being accumulated when the file ends.
    if speaker != '':
        add_line(speaker, speaker_type, speech, time, len(speech), rows_list)

    # Now create the dataframe & write it to csv.
    df = pd.DataFrame(rows_list)
    df.to_csv(outfilename)
    # A single pre-formatted string prints identically on Python 2 and 3.
    print('done processing file  %s ; shape= %s ; results written to file  %s' % (infilename, df.shape, outfilename))

# Progress marker: confirms that process_transcript() has been defined.
print('done')

done


In [12]:
# Locate the data.
# `%ls` is an IPython line magic, so this cell only runs inside IPython or
# Jupyter. Both paths are relative to the kernel's current working directory.
%ls ../data/debateTranscripts/
# Bare Python 2 print statement: emits a blank line between the two listings.
print
%ls ../data

D1.txt                    D8_proc.csv               R4_proc.csv
D1_proc.csv               D9.txt                    R5.txt
D2.txt                    R1.txt                    R5_proc.csv
D2_proc.csv               R10.txt                   R6.txt
D3.txt                    R10_proc.csv              R6_proc.csv
D3_proc.csv               R11.txt                   R7.txt
D4.txt                    R11_proc.csv              R7_proc.csv
D4_proc.csv               R12.txt                   R8.txt
D5.txt                    R12_proc.csv              R8_proc.csv
D5_proc.csv               R1_proc.csv               R9.txt
D6.txt                    R2.txt                    R9_proc.csv
D6_proc.csv               R2_proc.csv               debateTranscripts.tar.gz
D7.txt                    R3.txt                    [34mlabeled_transcripts[m[m/
D7_proc.csv               R3_proc.csv               transcript_test.txt
D8.txt                    R4.txt

[34mKaggle_1stGOPDebateTweets[m[m/
[34mKaggle_Prim

In [13]:
# Load the schedule of debates and primaries.
# The path is relative to the kernel's current working directory.
df_dp = pd.read_csv("../data/unifiedCalendar_04-26-16.csv")
#df_dp.head(2)
#print type(df_dp["Candidates"][0])

# Get only entries for EventType 'debate'.
# The same name is rebound to the filtered frame, so the unfiltered schedule
# is no longer available after this line.
df_dp = df_dp[df_dp['EventType']=='debate']
# Not the last expression of the cell, so this head() is not displayed.
df_dp.head(2)
print df_dp.shape


(22, 17)


In [14]:
# Process the transcripts for all debates included in the schedule.
# For each row of df_dp this reads the 'Candidates' and 'Moderators' columns
# directly, and passes the row to get_infilename()/get_outfilename().
# One csv is written per debate, next to its input file.

#for row in df_dp.itertuples():
#    print row.EventType

#limit = 1
#i = 0
for ix, row in df_dp.iterrows():
    #i+=1;
    #if (i>limit):
    #    break
        
    #print ix, row['EventType']
    
    # Directory holding both the raw transcripts and the processed output
    # (the same value on every iteration).
    prefix = '../data/debateTranscripts/'
    # Rows whose 'Candidates' value is the literal "CANCELLED" are skipped.
    if (row['Candidates']=="CANCELLED"):
        continue
    
    # These names are bound at notebook level and stay defined after the loop.
    candidates = get_candidates(row['Candidates'])
    moderators = get_moderators(row['Moderators'])
    audience = get_audience()
    
    infilename = prefix + get_infilename(row)
    outfilename = prefix + get_outfilename(row)
    
    print ix, infilename, outfilename, candidates, moderators
    #print
    process_transcript(infilename, outfilename, candidates, moderators, audience)
    # Blank line between debates in the cell output.
    print
print 'done'

0 ../data/debateTranscripts/R1.txt ../data/debateTranscripts/R1_proc.csv ['TRUMP', 'BUSH', 'WALKER', 'HUCKABEE', 'CARSON', 'CRUZ', 'RUBIO', 'PAUL', 'CHRISTIE', 'KASICH'] ['BAIER', 'KELLY', 'WALLACE']
done processing file  ../data/debateTranscripts/R1.txt ; shape= (417, 5) ; results written to file  ../data/debateTranscripts/R1_proc.csv

1 ../data/debateTranscripts/D1.txt ../data/debateTranscripts/D1_proc.csv ['CLINTON', 'SANDERS', "O'MALLEY", 'WEBB', 'CHAFEE'] ['COOPER', 'BASH', 'LOPEZ']
done processing file  ../data/debateTranscripts/D1.txt ; shape= (530, 5) ; results written to file  ../data/debateTranscripts/D1_proc.csv

2 ../data/debateTranscripts/R2.txt ../data/debateTranscripts/R2_proc.csv ['TRUMP', 'CARSON', 'BUSH', 'CRUZ', 'WALKER', 'RUBIO', 'FIORINA', 'HUCKABEE', 'PAUL', 'KASICH', 'CHRISTIE'] ['TAPPER', 'HEWITT', 'BASH']
done processing file  ../data/debateTranscripts/R2.txt ; shape= (794, 5) ; results written to file  ../data/debateTranscripts/R2_proc.csv

3 ../data/debateTra

In [None]:
### DO NOT USE BELOW THIS POINT ###

In [274]:
# Scratch check of get_splits_by_speaker(): one line with leading text that
# has no speaker, two candidate turns and two audience events. The moderator
# list passed in is empty.
# Rebinds the notebook-level names `audience` and `candidates`.
testline = "blah TRUMP: blah. (applause) CRUZ: blah (applause, laughter) blah"
audience = ['APPLAUSE', 'BOOING', 'CHEERING', 'LAUGHTER']
candidates = ['TRUMP', 'CRUZ', 'CARSON']
#print get_candidate_splits(testline, candidates)
print get_splits_by_speaker(testline, candidates, [], audience)

[('', '', 'blah '), ('TRUMP', 'candidate', 'TRUMP: blah. '), ('applause', 'audience', '(applause)'), ('CRUZ', 'candidate', 'CRUZ: blah '), ('applause', 'audience', '(applause, laughter)'), ('', '', ' blah')]


In [267]:
# Scratch check: strip() reduces a whitespace-only string to the empty string.
teststring = ' '
print '"' + teststring.strip() + '"'

""


In [185]:
# Scratch check: build the alternation pattern that matches a "NAME:" tag.
# Rebinds the notebook-level name `pattern`.
testlist = ['TRUMP', 'CARSON', 'CRUZ']
pattern = '(' + '|'.join(testlist) + '):'
print pattern


(TRUMP|CARSON|CRUZ):


In [333]:
# Scratch exploration of regex-based splitting: find the bracketed audience
# events in one test line, then cut the line at the match positions.
#testline = "Trump: blah about Carson. [applause] Trump: blah. Carson: blah. (applause)"
#testline = "blah (applause) talking. (applause) More text. (applause)"
testline = "Ramos [fsd]: blah [applause] blah"

#pattern = r"(?i)trump"  # Also works (like cypher).
#pattern = r"(trump|carson)"
#matched = re.findall(pattern, testline, re.IGNORECASE)
#print matched

#compiled_pattern = re.compile("(?=(trump|carson):)", re.IGNORECASE)
#compiled_pattern = re.compile("(trump|carson):", re.IGNORECASE)
compiled_pattern = re.compile("[\[\(](applause|booing)", re.IGNORECASE)
#compiled_pattern = re.compile("(ramos|salinas)(?: *\[[a-z ]*\])?:", re.IGNORECASE)

matched = compiled_pattern.findall(testline)
print matched  # findall returns a list of strings

matched = compiled_pattern.finditer(testline)
print matched  # finditer returns an iterator over match objects

# Each split is (matched text, label, start index). The label is hard-coded
# to 'candidate' even though the active pattern matches audience events.
splits = []
for m in matched:
    print 'group="'+m.group(1)+'"', m.start(), m.end()
    splits.append( (m.group(1), 'candidate', m.start()) )
print splits

# Good for candidate, moderator
# Each chunk holds the text BEFORE its split point, paired with that split's
# name and label.
chunks = []
ix_end = 0
for split in splits:
    ix_start = ix_end
    ix_end = split[2]
    speaker_name = split[0]
    speaker_type = split[1]
    chunks.append( ( speaker_name, speaker_type, testline[ix_start:ix_end] )  )
# The remainder of the line is appended as a bare string, not as a tuple.
chunks.append( testline[ix_end:len(testline)] )
    
print chunks



['applause']
<callable-iterator object at 0x1068e1190>
group="applause" 18 27
[('applause', 'candidate', 18)]
[('applause', 'candidate', 'Ramos [fsd]: blah '), '[applause] blah']


In [106]:
# Debate names.
# Rebinds the notebook-level lists `candidates`, `moderators` and `audience`.
# `candidates` also contains the entries 'CROSSTALK' and 'UNKNOWN'
# (presumably transcript labels for unattributed speech -- confirm).
candidates = ['TRUMP', 'BUSH', 'WALKER', 'HUCKABEE', 'CARSON', 'CRUZ', 'RUBIO', 'PAUL', 'CHRISTIE', 'KASICH', 'CROSSTALK', 'UNKNOWN']
moderators = ['KELLY', 'BAIER', 'WALLACE']
audience = ['APPLAUSE', 'BOOING', 'LAUGHTER', 'CHEERING']
print 'done'


done


In [120]:
# Scratch exploration of a str.find-based way to locate split points.
# Uses the notebook-level `candidates`, `moderators` and `audience` lists.

# Not the last expression of the cell, so this result is not displayed.
"xxxxABCDyyyy".find("ABC")

#testline = "blah CARSON: blah. Trump: Blah blah (Laughter, booing, applause) More txt"
testline = "Trump: blah blah. Trump: blah."
#testline = "blah (applause) talking. (applause) More text. (applause)"
splits = []
#print filter(lambda x: x>=0, map(str.lower, candidates)))
# str.find reports only the first occurrence of each name, so a speaker tag
# that repeats within the line contributes a single index.
splits = filter(lambda x: x>=0, map(testline.lower().find, map(lambda y: y.lower()+':', candidates+moderators)))
#splits += filter(lambda x: x>=0, map(testline.lower().find, map(lambda y: '('+y.lower(), audience)))
#splits += filter(lambda x: x>=0, map(testline.lower().find, map(lambda y: y.lower()+')', audience)))
#splits += map(lambda x: len(testline)-len(x[0]), map(testline.lower().split, map(lambda y: y.lower()+')', audience)))
#splits += filter(lambda x: x>0, map(lambda x: len(testline)-len(x[0]), map(testline.lower().split, map(lambda y: y.lower()+')', audience))))
#splits += map(lambda z: len(testline)-len(z[1]), 
#              filter(lambda x: len(x)>1, 
#                     map(testline.lower().split, 
#                         map(lambda y: y.lower()+')', audience))))
# NOTE(review): this adds lists of strings (str.split results) to a list of
# integer indices, and splits the un-lowercased line on lowercased separators.
splits += filter(lambda z: len(z)>1, [testline.split(x) for x in [y.lower()+')' for y in audience]])

#splits += map(testline.lower().split, map(lambda y: y.lower()+')', audience))
#splits += filter(lambda z: len(z)>1, map(testline.lower().split, map(lambda y: y.lower()+')', audience)))

splits.sort()
print 'splits=', splits

#c = [(m.start(), m.end()-1) for m in re.finditer(r'\S+', a)]
# Cut the line at each index: every chunk is the text before its split point,
# followed by one final chunk for the remainder of the line.
chunks = []
ix_end = 0
for ix in splits:
    ix_start = ix_end
    ix_end = ix
    chunks.append( testline[ix_start:ix_end] )
chunks.append( testline[ix_end:len(testline)] )
    
print chunks


splits= [0]
['', 'Trump: blah blah. Trump: blah.']


In [54]:
# Scratch check: which speaker names (followed by ':') and which audience
# event names occur in the line, compared case-insensitively.
# Uses the notebook-level `candidates`, `moderators` and `audience` lists as
# they stood when the cell was run.
testline = "blah CARSON: blah. TRUMP: Blah blah (Laughter, booing)"
print filter(lambda x: x.lower()+':' in testline.lower(), candidates+moderators)
print filter(lambda x: x.lower() in testline.lower(), audience)
print ( filter(lambda x: x.lower()+':' in testline.lower(), candidates+moderators) + 
       filter(lambda x: x.lower() in testline.lower(), audience) )


['TRUMP']
['BOOING', 'LAUGHTER', 'booing', 'laughter']
['TRUMP', 'BOOING', 'LAUGHTER', 'booing', 'laughter']


In [70]:
# Scratch check: True when the line starts with any name in the notebook-level
# `candidates` list (case-sensitive comparison).
# `reduce` is used as a builtin, which exists only in Python 2.
testline = "TRUMP: Blah blah"
reduce(lambda x,y: x or y, map(testline.startswith, candidates))

True

In [65]:
# NOTE(review): is_new_speaker() is not defined in any cell visible in this
# notebook, so this cell depends on earlier kernel state.
#testline = "TRUMP: I will not make the pledge at this time."
testline = "(LAUGHTER, BOOING)"
print is_new_speaker(testline)

False


In [4]:
# Create dataframe for debate info.
#cols = ['text', 'speaker', 'speaker_type', 'time', 'duration', 'topic']
#df = pd.DataFrame(columns=cols)
#df = df.append({'name': 'Zed', 'age': 9, 'height': 2}, ignore_index=True)
#df.head()

Unnamed: 0,text,speaker,speaker_type,time,duration,topic


In [104]:
# NOTE(review): `df` is not assigned at notebook level in any visible cell
# (the cell that would create it is commented out), so this cell depends on
# earlier kernel state.
# The first two expressions are evaluated but not displayed, because neither
# is the last expression of the cell.
df.shape
df.tail(10)
df.to_csv("transcript_v2_proc.csv")

In [75]:
# Scratch check: print every non-blank line of a test transcript with its
# stripped length, and say whether is_new_speaker() flags it.
# NOTE(review): is_new_speaker() is not defined in any cell visible in this
# notebook, so this cell depends on earlier kernel state.
with open('transcript_test.txt') as infile:
    for line in infile:
        line = line.strip()
        if len(line)==0:
            continue
        print line, len(line)
        if is_new_speaker(line):
            print "new_speaker!"
        else:
            print "no new speaker"

TRUMP: I will not make the pledge at this time. 47
new_speaker!
BAIER: OK. Alright. 19
new_speaker!
Enough. 7
no new speaker
KELLY: Gentlemen, our first round of questions is on the subject of electability in the general election. 105
new_speaker!
and we start tonight with you, Dr. Carson. 42
no new speaker


In [40]:
# Scratch check: list.sort() sorts the list in place, in ascending order.
test_list = [4,6,3]
test_list.sort()
print test_list

[3, 4, 6]


In [25]:
# Scratch run of the same loop as the body of correct_spellings(), applied to
# a sample string, printing every generated misspelling along the way.
correct_words = ['TRUMP', 'CARSON', "WALLACE"]
test_string = 'TUMP: blah. CRSON: blah. Trump: blah. WAlace: blah'
for correct_word in correct_words:
    word_len = len(correct_word)
    print correct_word, word_len
    # i skips index 0 and the last index, so the first and last characters
    # are never the ones removed.
    for i in range(1,word_len-1):
        #print i
        missp = correct_word[0:i]+correct_word[i+1:word_len]
        print missp
        #test_string.replace(missp, correct_word)
        pattern = re.compile(missp, re.IGNORECASE)
        test_string = pattern.sub(correct_word, test_string)
print test_string

TRUMP 5
TUMP
TRMP
TRUP
CARSON 6
CRSON
CASON
CARON
CARSN
WALLACE 7
WLLACE
WALACE
WALACE
WALLCE
WALLAE
TRUMP: blah. CARSON: blah. Trump: blah. WALLACE: blah


In [20]:
# Scratch check: replace one misspelling case-insensitively with a compiled
# pattern's sub().
test_line = "TUMP: blah"
import re
pattern = re.compile("TUMP", re.IGNORECASE)
test_line = pattern.sub("TRUMP", test_line)
print test_line

TRUMP: blah


In [15]:
# Scratch check: with re.IGNORECASE, sub() replaces every casing of the word.
import re
pattern = re.compile("hello", re.IGNORECASE)
pattern.sub("bye", "hello HeLLo HELLO")
## 'bye bye bye'

'bye bye bye'

In [62]:
# Scratch check: split(sep, 1)[0] is the text before the first occurrence of sep.
my_string="hello python world , i'm a beginner "
print my_string.split("world",1)[0] 

hello python 


In [86]:
# Scratch check: len() of a two-element list.
a = ['a', 'b']
len(a)

2