In [None]:
# Purpose:  For each debate, load the already-topic-labeled debate transcript, assign estimated time durations
#           per statement, calculate total time durations by topic and candidate, and write results to csv.
#           Or, write results to csv with time durations per statement, to support visualizing it as a Gantt chart.
# Author:  Carol Sniegoski
# Date:  May 27, 2016
# Course:  DSE MAS Capstone, Spring 2016

In [1]:
import re

import pandas as pd
print 'done'

done


In [15]:
# Function definitions.

# Return list of last names of candidates in this debate
def get_candidates(cstr):
    """Return the upper-cased last names of the candidates in a debate.

    cstr is a comma-separated string of candidate names, e.g.
    "Donald Trump, Jeb Bush". The last whitespace-delimited token of each
    name is used as the last name, so names with a middle name or initial and
    single-token names are handled. Empty entries (e.g. produced by a trailing
    comma) are skipped.
    """
    returnlist = []
    for name in cstr.split(','):
        tokens = name.upper().split()
        if tokens:
            returnlist.append(tokens[-1])
    return returnlist

# Generate the conventional infile name for the manually-labeled, processed text of this debate.
def get_labeled_infilename(row):
    """Build the labeled-transcript input file name, e.g. 'R1_proc_labeled.csv'.

    row is a mapping providing 'Party' (string prefix) and 'Ordinal'
    (a number, converted with int()).
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return party + ordinal + '_proc_labeled.csv'

# Generate the event name (eg., D1, R2).
def get_eventname(row):
    """Build the short event name for a debate, e.g. 'D1' or 'R2'.

    row is a mapping providing 'Party' (string prefix) and 'Ordinal'
    (a number, converted with int()).
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return party + ordinal

# Generate the conventional outfile name for manually-labeled, processed text of this debate.
def get_labeled_outfilename(row):
    """Build the time-durations output file name, e.g. 'R1_timeDurations.csv'.

    row is a mapping providing 'Party' (string prefix) and 'Ordinal'
    (a number, converted with int()).
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return party + ordinal + '_timeDurations.csv'

def time_process_transcript_forGantt(infilename, eventname, candidates, debate_minutes=120):
    """Load a labeled transcript and estimate per-statement timing for a Gantt chart.

    Parameters:
        infilename: path of the labeled transcript csv. It must provide the
            columns 'speaker_type', 'duration' and 'start_time'.
        eventname: value stored in the added 'event' column (e.g. 'R1').
        candidates: accepted for signature compatibility; not used here.
        debate_minutes: assumed total length of the debate in minutes
            (default 120). Character counts are scaled so that all
            non-audience speech adds up to this length.

    Returns:
        A DataFrame of the non-audience rows with the added columns
        'timeDurationSec', 'startTimeSec' and 'event', or None when the file
        cannot be read (an error message is printed in that case).
    """
    # Load infile. Distinguish a missing/unreadable file from other read failures
    # so the printed message is not misleading.
    try:
        df_transcript = pd.read_csv(infilename)
    except (IOError, OSError):
        print("ERROR: File {0} does not exist.".format(infilename))
        return None
    except Exception as err:
        print("ERROR: Could not read file {0}: {1}".format(infilename, err))
        return None

    # Remove audience speech acts, as they always overlap with speech acts from candidates or moderators.
    # Work on a copy so the column assignments below do not target a slice of the loaded frame.
    df_transcript = df_transcript[df_transcript['speaker_type'] != 'audience'].copy()

    # Get total debate length in characters.
    totalChars = df_transcript['duration'].sum()
    print(totalChars)

    # Normalize by total character length to get number of seconds.
    df_transcript['timeDurationSec'] = (df_transcript['duration']*debate_minutes*60)/totalChars
    df_transcript['startTimeSec'] = (df_transcript['start_time']*debate_minutes*60)/totalChars

    # Add event field.
    df_transcript['event'] = eventname

    return df_transcript
    

def time_process_transcript(infilename, outfilename, candidates, debate_minutes=120):
    """Sum estimated speaking time by candidate and topic and write it to csv.

    Parameters:
        infilename: path of the labeled transcript csv. It must provide the
            columns 'speaker', 'speaker_type', 'topic' and 'duration'.
        outfilename: path of the csv to write; one row per (speaker, topic)
            with the summed 'duration' (characters) and 'timeDurationSec'.
        candidates: accepted for signature compatibility; not used here.
        debate_minutes: assumed total length of the debate in minutes
            (default 120).

    Returns None. When the input file cannot be read an error message is
    printed and nothing is written.
    """
    # Load infile. Distinguish a missing/unreadable file from other read failures
    # so the printed message is not misleading.
    try:
        df_transcript = pd.read_csv(infilename)
    except (IOError, OSError):
        print("ERROR: File {0} does not exist.".format(infilename))
        return
    except Exception as err:
        print("ERROR: Could not read file {0}: {1}".format(infilename, err))
        return

    # Remove audience speech acts, as they always overlap with speech acts from candidates or moderators.
    df_transcript = df_transcript[df_transcript['speaker_type'] != 'audience']

    # Get total debate length in characters. This is taken BEFORE moderators are
    # dropped, so candidate time is a share of all non-audience speech.
    totalChars = df_transcript['duration'].sum()
    print(totalChars)

    # Do not include moderator speech.
    df_transcript = df_transcript[df_transcript['speaker_type'] == 'candidate']

    # Drop unwanted topics.
    unwanted_topics = ['intro', 'closing']
    df_transcript = df_transcript[~df_transcript['topic'].isin(unwanted_topics)]

    # Sum lengths of speeches (in characters) by candidate and topic.
    df_transcript_summed = df_transcript.groupby(['speaker', 'topic'])['duration'].sum().reset_index()
    df_transcript_summed = df_transcript_summed.fillna(0)

    # Normalize by total character length to get number of seconds.
    df_transcript_summed['timeDurationSec'] = (df_transcript_summed['duration']*debate_minutes*60)/totalChars

    # Report the shape, then per-speaker and overall totals converted to minutes.
    print(df_transcript_summed.shape)
    print(df_transcript_summed.groupby(['speaker'])['timeDurationSec'].sum()/60)
    print(df_transcript_summed['timeDurationSec'].sum()/60)

    # Write results to outfile (the frame index is written as the first column).
    df_transcript_summed.to_csv(outfilename)
    print("Wrote results to {}.".format(outfilename))
    
print 'done'

done


In [3]:
# Locate the labeled transcript data.
# This prefix is prepended to the per-debate input and output file names built in the processing loops.
labeledTranscripts_prefix = "../data/debateTranscripts/labeled_transcripts/"
# List the directory as a sanity check (IPython magic; the path repeats the prefix above).
# NOTE(review): the listing shows 'D2_proc labeled.csv' (space instead of underscore), which does not
# match the '<Party><Ordinal>_proc_labeled.csv' name the loops look for.
%ls ../data/debateTranscripts/labeled_transcripts/

D1_proc.csv            D4_timeDurations.csv   D8_proc.csv            R1_proc.csv            R4_proc_labeled.csv    R8_proc_labeled.csv
D1_proc_labeled.csv    D5_proc.csv            D8_proc_labeled.csv    R1_proc_labeled.csv    R4_timeDurations.csv   R8_timeDurations.csv
D1_timeDurations.csv   D5_proc_labeled.csv    D8_timeDurations.csv   R1_timeDurations.csv   R5_proc.csv            R9_proc.csv
D2_proc labeled.csv    D5_timeDurations.csv   R10_proc.csv           R2_proc.csv            R6_proc.csv            R9_proc_labeled.csv
D2_proc.csv            D6_proc.csv            R10_proc_labeled.csv   R2_proc_labeled.csv    R6_proc_labeled.csv    R9_timeDurations.csv
D3_proc.csv            D6_proc_labeled.csv    R10_timeDurations.csv  R2_timeDurations.csv   R6_timeDurations.csv
D3_proc_labeled.csv    D6_timeDurations.csv   R11_proc.csv           R3_proc.csv            R7_proc.csv
D3_timeDurations.csv   D7_proc.csv            R11_proc_labeled.csv   R3_proc_labeled.csv    R7_proc_labeled

In [4]:
# Locate the debate calendar.
%ls ../data/calendar/

[34munifiedCalendar_03-26-16[m[m/                unifiedCalendar_04-26-16.csv
unifiedCalendar_03-26-16.csv             unifiedCalendar_DebatesAndPrimaries.csv


In [5]:
# Read the unified calendar of debates and primaries, then keep only the
# rows whose EventType is 'debate'.
df_dp = pd.read_csv("../data/calendar/unifiedCalendar_04-26-16.csv")
df_dp = df_dp.loc[df_dp['EventType'] == 'debate']
print(df_dp.shape)

(22, 17)


In [21]:
# Process the manually-labeled transcripts for all debates included in the schedule.
# This version is for a Gantt chart.
# It drops audience events and adds an estimated time duration to each remaining speech segment,
# and returns the df.

#for row in df_dp.itertuples():
#    print row.EventType

df_list = []
limit = 100
i = 0
for ix, row in df_dp.iterrows():
    i+=1;
    if (i>limit):
        break
        
    #print ix, row['EventType']
    
    if (row['Candidates']=="CANCELLED"):
        continue
    
    candidates = get_candidates(row['Candidates'])
    #moderators = get_moderators(row['Moderators'])
    #audience = get_audience()
    event = get_eventname(row)
    
    infilename = labeledTranscripts_prefix + get_labeled_infilename(row)
    outfilename = labeledTranscripts_prefix + get_labeled_outfilename(row)
    
    print ix, event, infilename, candidates
    #print
    df_list.append( time_process_transcript_forGantt(infilename, event, candidates) )
    print
print 'done'

0 R1 ../data/debateTranscripts/labeled_transcripts/R1_proc_labeled.csv ['TRUMP', 'BUSH', 'WALKER', 'HUCKABEE', 'CARSON', 'CRUZ', 'RUBIO', 'PAUL', 'CHRISTIE', 'KASICH']
106812.0

1 R2 ../data/debateTranscripts/labeled_transcripts/R2_proc_labeled.csv ['TRUMP', 'CARSON', 'BUSH', 'CRUZ', 'WALKER', 'RUBIO', 'FIORINA', 'HUCKABEE', 'PAUL', 'KASICH', 'CHRISTIE']
187100.0

2 D1 ../data/debateTranscripts/labeled_transcripts/D1_proc_labeled.csv ['CLINTON', 'SANDERS', "O'MALLEY", 'WEBB', 'CHAFEE']
129174

3 R3 ../data/debateTranscripts/labeled_transcripts/R3_proc_labeled.csv ['TRUMP', 'CARSON', 'BUSH', 'RUBIO', 'CRUZ', 'HUCKABEE', 'PAUL', 'FIORINA', 'CHRISTIE', 'KASICH']
121236.0

4 R4 ../data/debateTranscripts/labeled_transcripts/R4_proc_labeled.csv ['TRUMP', 'CARSON', 'RUBIO', 'CRUZ', 'BUSH', 'FIORINA', 'KASICH', 'PAUL']
114573.0

5 D2 ../data/debateTranscripts/labeled_transcripts/D2_proc_labeled.csv ['CLINTON', 'SANDERS', "O'MALLEY"]
ERROR: File ../data/debateTranscripts/labeled_transcripts/D2_

In [23]:
df_list[0].head(1)

Unnamed: 0.1,Unnamed: 0,duration,speaker,speaker_type,start_time,text,topic,timeDurationSec,startTimeSec,event
2,2,427,KELLY,moderator,0,KELLY: Welcome to the first debate night of th...,intro,28.783283,0,R1


In [24]:
# Stack the per-debate frames into one frame. No ignore_index is passed, so each debate's
# original row index is kept and index values repeat across debates.
df_all = pd.concat(df_list)
print df_all.shape

(5751, 10)


In [25]:
# Write to csv.
# The combined Gantt frame goes to a dated file under ../data/topic/; to_csv is called with
# its defaults, so the frame index is written as the first column.
out_prefix = "../data/topic/"
out_fname = "All_durations_Gantt_06-09-16.csv"
df_all.to_csv(out_prefix + out_fname)
print "Written to", out_prefix + out_fname

Written to ../data/topic/All_durations_Gantt_06-09-16.csv


In [6]:
# Process the manually-labeled transcripts for all debates included in the schedule.
# This version rolls up each transcript by topic and speaker, sums the time in minutes, and writes
# each result to csv file.

#for row in df_dp.itertuples():
#    print row.EventType

limit = 400
i = 0
for ix, row in df_dp.iterrows():
    i+=1;
    if (i>limit):
        break
        
    #print ix, row['EventType']
    
    if (row['Candidates']=="CANCELLED"):
        continue
    
    candidates = get_candidates(row['Candidates'])
    #moderators = get_moderators(row['Moderators'])
    #audience = get_audience()
    
    infilename = labeledTranscripts_prefix + get_labeled_infilename(row)
    outfilename = labeledTranscripts_prefix + get_labeled_outfilename(row)
    
    print ix, infilename, candidates
    #print
    time_process_transcript(infilename, outfilename, candidates)
    print
print 'done'

0 ../data/debateTranscripts/labeled_transcripts/R1_proc_labeled.csv ['TRUMP', 'BUSH', 'WALKER', 'HUCKABEE', 'CARSON', 'CRUZ', 'RUBIO', 'PAUL', 'CHRISTIE', 'KASICH']
106812.0
(65, 4)
speaker
BUSH         8.347377
CARSON       6.027413
CHRISTIE     7.492417
CRUZ         6.028536
HUCKABEE     6.884620
KASICH       7.302550
PAUL         5.656668
RUBIO        7.929446
TRUMP       12.017751
WALKER       6.950904
Name: timeDurationSec, dtype: float64
74.6376811594
Wrote results to ../data/debateTranscripts/labeled_transcripts/R1_timeDurations.csv.

1 ../data/debateTranscripts/labeled_transcripts/D1_proc_labeled.csv ['CLINTON', 'SANDERS', "O'MALLEY", 'WEBB', 'CHAFEE']
129174
(83, 4)
speaker
CHAFEE       5.858145
CLINTON     24.365275
O'MALLEY    13.576107
SANDERS     21.410191
WEBB        11.362348
Name: timeDurationSec, dtype: float64
76.572065586
Wrote results to ../data/debateTranscripts/labeled_transcripts/D1_timeDurations.csv.

2 ../data/debateTranscripts/labeled_transcripts/R2_proc_label

In [None]:
### DO NOT USE BELOW THIS POINT ###

In [10]:
# Define functions.

# Return line with all misspelled instances of the correct_words replaced with the correct_word.
# Here we define a misspelling as missing one letter other than the first or last letter in a word.
# Case insensitive.
def correct_spellings(line, correct_words):
    """Return line with one-letter-dropped misspellings of correct_words repaired.

    A misspelling is a correct word with exactly one letter removed, where the
    removed letter is neither the first nor the last character. Matching is
    case-insensitive and the replacement is the correct word exactly as given.
    A misspelling is only recognized at the start of a word, so a variant such
    as "PAL:" (for "PAUL:") does not rewrite the tail of "principal:".

    After the spelling pass, typographic apostrophes are converted to "'" and
    "O' " is collapsed to "O'" (for the "O'Malley" spelling variations).
    """
    for correct_word in correct_words:
        word_len = len(correct_word)
        seen = set()
        for i in range(1, word_len-1):

            # Create the misspelled version of the word to look for.
            missp = correct_word[0:i]+correct_word[i+1:word_len]

            # Doubled letters yield the same variant twice; apply each variant once.
            if missp in seen:
                continue
            seen.add(missp)

            # Case-insensitive, literal (escaped) match that must not start mid-word.
            pattern = re.compile(r'(?<!\w)' + re.escape(missp), re.IGNORECASE)
            # Use a function so the replacement text is never parsed as a regex template.
            line = pattern.sub(lambda m: correct_word, line)

    # Fixing the "O'Malley" spelling variations.
    line = line.replace("’", "'")
    line = line.replace("O' ", "O'")

    return line

        
# Return list of text chunks, each starting with either the beginning of the line
# or a new speaker or audience event.
def get_splits_by_speaker(line, candidates, moderators, audience):
    """Split line into (speaker_name, speaker_type, text) chunks.

    Split points come from get_candidate_splits(), get_moderator_splits() and
    get_audience_splits(), ordered by their start index in line. Text before
    the first split point, or a line with no split points at all, is returned
    with an empty speaker name and type. Chunks containing only whitespace are
    discarded. Returns a list of tuples.
    """
    # Get splits for each speaker type. Unfortunately they are all handled slightly differently.
    splits = list(get_candidate_splits(line, candidates))
    splits += list(get_moderator_splits(line, moderators))
    splits += list(get_audience_splits(line, audience))

    # Sort list of tuples by 3rd element, which is the start index in the line.
    splits.sort(key=lambda x: x[2])

    # If there are no indices, just return the line, with speaker unknown.
    if len(splits) == 0:
        return [('', '', line)]

    return_list = []

    # If the first entry does not start at index zero, add an initial text chunk with speaker unknown.
    if splits[0][2] != 0:
        return_list.append(('', '', line[0:splits[0][2]]))

    # Add the rest of the text chunks; each one runs up to the next split point,
    # or to the end of the line for the last split.
    for i, (speaker_name, speaker_type, ix_start) in enumerate(splits):
        if i + 1 < len(splits):
            ix_end = splits[i+1][2]
        else:
            ix_end = len(line)
        return_list.append((speaker_name, speaker_type, line[ix_start:ix_end]))

    # Discard text chunks with no content but whitespace.
    return [chunk for chunk in return_list if len(chunk[2].strip()) > 0]

# Split before.
def get_candidate_splits(line, candidates):
    """Find every "<candidate name>:" speaker tag in line (case-insensitive).

    Returns a list of (matched_name, 'candidate', start_index) tuples, where
    matched_name is the name as it appears in line and start_index is the
    position of the name, so the text is split BEFORE the tag. Names are
    matched literally. An empty candidates list yields no splits.
    """
    # With no names the alternation would be empty and match every ':' in the line.
    if not candidates:
        return []

    pattern = '(' + '|'.join(re.escape(name) for name in candidates) + '):'
    compiled_pattern = re.compile(pattern, re.IGNORECASE)

    return [(m.group(1), 'candidate', m.start()) for m in compiled_pattern.finditer(line)]

# Split before.
def get_moderator_splits(line, moderators):
    """Find every "<moderator name>:" speaker tag in line (case-insensitive).

    A bracketed annotation of lowercase letters/spaces may sit between the name
    and the colon, e.g. "Ramos [fsd]:". Returns a list of
    (matched_name, 'moderator', start_index) tuples, so the text is split
    BEFORE the tag. Names are matched literally. An empty moderators list
    yields no splits.
    """
    # With no names the alternation would be empty and match every ':' in the line.
    if not moderators:
        return []

    names = '|'.join(re.escape(name) for name in moderators)
    pattern = '(' + names + ')' + r'(?: *\[[a-z ]*\])?:'
    compiled_pattern = re.compile(pattern, re.IGNORECASE)

    return_list = [(m.group(1), 'moderator', m.start()) for m in compiled_pattern.finditer(line)]

    # Discard splits with no name.
    return [split for split in return_list if len(split[0]) > 0]

# Split both before and after.
def get_audience_splits(line, audience):
    """Find split points around bracketed audience events such as "(applause)".

    Two kinds of tuples are returned, all "before" splits first, then all
    "after" splits:
      * (matched_event, 'audience', start_index) at each "(" or "[" that is
        immediately followed by an audience event word;
      * ('', '', end_index) just after each event word that is immediately
        followed by ")" or "]", unless that is the end of the line.
    Matching is case-insensitive and literal. An empty audience list yields no
    splits.
    """
    # With no event words the alternation would be empty and match every bracket.
    if not audience:
        return []

    return_list = []
    events = '|'.join(re.escape(event) for event in audience)

    # First split before.
    opening_pattern = re.compile(r'[\(\[](' + events + ')', re.IGNORECASE)
    for m in opening_pattern.finditer(line):
        return_list.append((m.group(1), 'audience', m.start()))

    # Then split after.
    closing_pattern = re.compile('(' + events + r')[\)\]]', re.IGNORECASE)
    for m in closing_pattern.finditer(line):
        if m.end() != len(line):
            return_list.append(('', '', m.end()))

    return return_list

# Add line to dataframe.
def add_line(speaker, speaker_type, speech, start_time, duration, rows_list):
    """Append one transcript record to rows_list (the list is mutated in place).

    The record is a dict with the keys 'speaker', 'speaker_type', 'text',
    'start_time' and 'duration'.
    """
    record = dict(
        speaker=speaker,
        speaker_type=speaker_type,
        text=speech,
        start_time=start_time,
        duration=duration,
    )
    rows_list.append(record)
    
# Return list of last names of candidates in this debate
def get_candidates(cstr):
    """Return the upper-cased last names of the candidates in a debate.

    cstr is a comma-separated string of candidate names, e.g.
    "Donald Trump, Jeb Bush". The last whitespace-delimited token of each
    name is used as the last name, so names with a middle name or initial and
    single-token names are handled. Empty entries (e.g. produced by a trailing
    comma) are skipped.
    """
    returnlist = []
    for name in cstr.split(','):
        tokens = name.upper().split()
        if tokens:
            returnlist.append(tokens[-1])
    return returnlist

# Return list of last names of moderators of this debate
def get_moderators(mstr):
    """Return the upper-cased last token of each moderator name.

    mstr is a comma-separated string of moderator names; each name is
    stripped, upper-cased and split on single spaces, and its final piece is
    kept.
    """
    surnames = []
    for raw_name in mstr.split(','):
        pieces = raw_name.strip().upper().split(' ')
        surnames.append(pieces[-1])
    return surnames

# Return list of audience events
def get_audience():
    """Return a new list of the audience event keywords recognized in transcripts."""
    return list(('APPLAUSE', 'BOOING', 'CHEERING', 'LAUGHTER'))

# Return map of strings for event name, date, party, location, start_time, end_time, duration
def get_event_info():
    """Placeholder for event metadata; currently always returns a new empty dict."""
    return dict()

# Generate the conventional infile name for this debate.
def get_infilename(row):
    """Build the raw transcript file name for a debate, e.g. 'R1.txt'.

    row is a mapping providing 'Party' (string prefix) and 'Ordinal'
    (a number, converted with int()).
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return party + ordinal + '.txt'

# Generate the conventional outfile name for this debate.
def get_outfilename(row):
    """Build the processed transcript file name for a debate, e.g. 'R1_proc.csv'.

    row is a mapping providing 'Party' (string prefix) and 'Ordinal'
    (a number, converted with int()).
    """
    party = row['Party']
    ordinal = str(int(row['Ordinal']))
    return party + ordinal + '_proc.csv'

print 'done'

done


In [364]:
# Smoke test: a moderator line that ends with an audience event.
testline = "BLITZER: We're live here at the University of Houston for the 10th Republican presidential debate. (applause)"

# The return value of this first call is discarded; only the combined splitter result is printed.
get_audience_splits(testline, ['APPLAUSE', 'LAUGHTER'])
result = get_splits_by_speaker(testline, ['TRUMP'],['BLITZER'],['APPLAUSE', 'LAUGHTER'])
print result

[('BLITZER', 'moderator', "BLITZER: We're live here at the University of Houston for the 10th Republican presidential debate. "), ('applause', 'audience', '(applause)')]


In [11]:
# Function def

# Load and parse a text file representing a debate transcript. 
# Assume the debate candidates, moderators, and audience events are as provided.
# Write results to csv.

def process_transcript(infilename, outfilename, candidates, moderators, audience):
    """Parse a raw debate transcript into one row per speech and write it to csv.

    Each row has the keys written by add_line(): 'speaker', 'speaker_type',
    'text', 'start_time' and 'duration'. Time is measured in characters:
    'start_time' is the running total of the lengths of the speeches completed
    so far and 'duration' is the length of the speech text. Audience events do
    not advance the clock; each gets a fixed duration of 50 and starts at the
    current position inside the speech being accumulated.

    Parameters:
        infilename: path of the plain-text transcript, read line by line.
        outfilename: path of the csv to write.
        candidates, moderators, audience: lists of names / event keywords
            handed to correct_spellings() and get_splits_by_speaker().
    """
    rows_list = []
    #with open('transcript_v2.txt') as infile:
    with open(infilename) as infile:
        # State of the speech currently being accumulated.
        speaker=""
        speaker_type=""
        speech=""
        time=0   # running character offset at which the current speech starts
        #overlap_time=0
        
        #threshold = 20
        #i = 0
        for line in infile:
            
            #i+=1;
            #if (i>threshold):
            #    break
            
            line = line.strip()
            # Repair misspelled speaker tags; every name is passed with a trailing ':'.
            line = correct_spellings(line, [x+':' for x in candidates+moderators+audience])
            if len(line)==0:
                continue
            
            # Split line into chunks, each from a single speaker, of format (speaker, speaker_type, speech).
            splits = get_splits_by_speaker(line, candidates=candidates, moderators=moderators, audience=audience) 
            #print 'splits=', splits
            
            for split in splits:
                #print 'split=', split
                
                new_speaker = split[0]
                new_speaker_type = split[1]
                new_speech = split[2]
                
                # Audience event, e.g. "(laughter, applause)".
                # Do not change the speaker or end the current speech.
                # Add one or more audience events with appropriate timing estimates.
                # One row is added per audience keyword found in the chunk text.
                if (new_speaker_type=='audience'):
                    for event in audience:
                        est_duration=50
                        if event.lower() in new_speech.lower():
                            add_line(event, "audience", new_speech, time+len(speech), est_duration, rows_list)  
                
                # No change of speaker.
                # Just continue to accumulate the current speech text.
                # NOTE(review): the comparison is case-sensitive, and the split helpers return the
                # name as it appears in the text, so "Trump:" after "TRUMP:" counts as a new speaker.
                elif ( (new_speaker=='') | (new_speaker==speaker) ):
                    speech+= (' ' + new_speech.strip())
                
                # Change of speaker, e.g. "TRUMP: Blah."
                # Cut off the current speech and add it to the results list.
                # Change speaker and speaker type. Begin accumulating the next speech.
                elif (new_speaker!=speaker):
                   
                    # Cut off the previous speech and add it to the df.
                    add_line(speaker, speaker_type, speech, time, len(speech), rows_list)
                   
                    # Begin the next speech.
                    time += len(speech)
                    overlap_time = 0   # assigned but never read in this function
                    speech = new_speech
                    speaker = new_speaker
                    speaker_type = new_speaker_type
                    
                # NOTE(review): appears unreachable for string names, since the previous branch
                # already covers new_speaker==speaker and this one covers new_speaker!=speaker.
                else:
                    print "ERROR! No line type recognized for split", split
                    
        # Flush the speech that was still being accumulated when the file ended.
        add_line(speaker, speaker_type, speech, time, len(speech), rows_list)
       
    # Now create the dataframe & write it to csv.
    # The first recorded row is dropped. Presumably this is the empty placeholder emitted at the
    # first speaker change; if an audience event is recorded before that, the dropped row would
    # be that event instead. TODO confirm.
    df = pd.DataFrame(rows_list[1:]) 
    df.to_csv(outfilename)
    print 'done processing file ', infilename, '; shape=', df.shape, '; results written to file ', outfilename

print 'done'

done


In [13]:
# Read the unified calendar of debates and primaries, then keep only the
# rows whose EventType is 'debate'.
df_dp = pd.read_csv("../data/unifiedCalendar_04-26-16.csv")
df_dp = df_dp.loc[df_dp['EventType'] == 'debate']
print(df_dp.shape)


(22, 17)


In [14]:
# Process the transcripts for all debates included in the schedule.

#for row in df_dp.itertuples():
#    print row.EventType

#limit = 1
#i = 0
for ix, row in df_dp.iterrows():
    #i+=1;
    #if (i>limit):
    #    break
        
    #print ix, row['EventType']
    
    prefix = '../data/debateTranscripts/'
    if (row['Candidates']=="CANCELLED"):
        continue
    
    candidates = get_candidates(row['Candidates'])
    moderators = get_moderators(row['Moderators'])
    audience = get_audience()
    
    infilename = prefix + get_infilename(row)
    outfilename = prefix + get_outfilename(row)
    
    print ix, infilename, outfilename, candidates, moderators
    #print
    process_transcript(infilename, outfilename, candidates, moderators, audience)
    print
print 'done'

0 ../data/debateTranscripts/R1.txt ../data/debateTranscripts/R1_proc.csv ['TRUMP', 'BUSH', 'WALKER', 'HUCKABEE', 'CARSON', 'CRUZ', 'RUBIO', 'PAUL', 'CHRISTIE', 'KASICH'] ['BAIER', 'KELLY', 'WALLACE']
done processing file  ../data/debateTranscripts/R1.txt ; shape= (417, 5) ; results written to file  ../data/debateTranscripts/R1_proc.csv

1 ../data/debateTranscripts/D1.txt ../data/debateTranscripts/D1_proc.csv ['CLINTON', 'SANDERS', "O'MALLEY", 'WEBB', 'CHAFEE'] ['COOPER', 'BASH', 'LOPEZ']
done processing file  ../data/debateTranscripts/D1.txt ; shape= (530, 5) ; results written to file  ../data/debateTranscripts/D1_proc.csv

2 ../data/debateTranscripts/R2.txt ../data/debateTranscripts/R2_proc.csv ['TRUMP', 'CARSON', 'BUSH', 'CRUZ', 'WALKER', 'RUBIO', 'FIORINA', 'HUCKABEE', 'PAUL', 'KASICH', 'CHRISTIE'] ['TAPPER', 'HEWITT', 'BASH']
done processing file  ../data/debateTranscripts/R2.txt ; shape= (794, 5) ; results written to file  ../data/debateTranscripts/R2_proc.csv

3 ../data/debateTra

In [None]:
### DO NOT USE BELOW THIS POINT ###

In [274]:
# Scratch check of get_splits_by_speaker() on a line mixing untagged text, two candidate tags
# and audience events. Moderators are passed as an empty list.
testline = "blah TRUMP: blah. (applause) CRUZ: blah (applause, laughter) blah"
audience = ['APPLAUSE', 'BOOING', 'CHEERING', 'LAUGHTER']
candidates = ['TRUMP', 'CRUZ', 'CARSON']
#print get_candidate_splits(testline, candidates)
print get_splits_by_speaker(testline, candidates, [], audience)

[('', '', 'blah '), ('TRUMP', 'candidate', 'TRUMP: blah. '), ('applause', 'audience', '(applause)'), ('CRUZ', 'candidate', 'CRUZ: blah '), ('applause', 'audience', '(applause, laughter)'), ('', '', ' blah')]


In [267]:
# Scratch check: strip() on a whitespace-only string leaves an empty string (printed between quotes).
teststring = ' '
print '"' + teststring.strip() + '"'

""


In [185]:
# Scratch check: build the "(NAME1|NAME2|...):" alternation used to find speaker tags.
testlist = ['TRUMP', 'CARSON', 'CRUZ']
pattern = '(' + '|'.join(testlist) + '):'
print pattern


(TRUMP|CARSON|CRUZ):


In [333]:
# Scratch experiment: locate audience events with a regex and cut the line at the match positions.
#testline = "Trump: blah about Carson. [applause] Trump: blah. Carson: blah. (applause)"
#testline = "blah (applause) talking. (applause) More text. (applause)"
testline = "Ramos [fsd]: blah [applause] blah"

#pattern = r"(?i)trump"  # Also works (like cypher).
#pattern = r"(trump|carson)"
#matched = re.findall(pattern, testline, re.IGNORECASE)
#print matched

#compiled_pattern = re.compile("(?=(trump|carson):)", re.IGNORECASE)
#compiled_pattern = re.compile("(trump|carson):", re.IGNORECASE)
compiled_pattern = re.compile("[\[\(](applause|booing)", re.IGNORECASE)
#compiled_pattern = re.compile("(ramos|salinas)(?: *\[[a-z ]*\])?:", re.IGNORECASE)

matched = compiled_pattern.findall(testline)
print matched  # findall returns a list of strings

matched = compiled_pattern.finditer(testline)
print matched  # finditer returns an iterator over match objects

# Record (name, type, start index) for each match. The type is hard-coded to 'candidate'
# even though the active pattern matches audience events.
splits = []
for m in matched:
    print 'group="'+m.group(1)+'"', m.start(), m.end()
    splits.append( (m.group(1), 'candidate', m.start()) )
print splits

# Good for candidate, moderator
# Each chunk pairs a split's name with the text that PRECEDES that split's start index.
chunks = []
ix_end = 0
for split in splits:
    ix_start = ix_end
    ix_end = split[2]
    speaker_name = split[0]
    speaker_type = split[1]
    chunks.append( ( speaker_name, speaker_type, testline[ix_start:ix_end] )  )
# The trailing remainder is appended as a bare string, not as a 3-tuple like the other chunks.
chunks.append( testline[ix_end:len(testline)] )
    
print chunks



['applause']
<callable-iterator object at 0x1068e1190>
group="applause" 18 27
[('applause', 'candidate', 18)]
[('applause', 'candidate', 'Ramos [fsd]: blah '), '[applause] blah']


In [106]:
# Debate names.
# Hard-coded name lists used by the scratch cells below; the candidate list also includes the
# pseudo-speakers 'CROSSTALK' and 'UNKNOWN'.
candidates = ['TRUMP', 'BUSH', 'WALKER', 'HUCKABEE', 'CARSON', 'CRUZ', 'RUBIO', 'PAUL', 'CHRISTIE', 'KASICH', 'CROSSTALK', 'UNKNOWN']
moderators = ['KELLY', 'BAIER', 'WALLACE']
audience = ['APPLAUSE', 'BOOING', 'LAUGHTER', 'CHEERING']
print 'done'


done


In [120]:
# Scratch experiment: find split points with str.find() instead of regular expressions.
# The bare find() expression below has no effect because it is not the last line of the cell.
"xxxxABCDyyyy".find("ABC")

#testline = "blah CARSON: blah. Trump: Blah blah (Laughter, booing, applause) More txt"
testline = "Trump: blah blah. Trump: blah."
#testline = "blah (applause) talking. (applause) More text. (applause)"
splits = []
#print filter(lambda x: x>=0, map(str.lower, candidates)))
# find() returns only the first occurrence of each "name:" tag, so a speaker who appears twice
# in the line produces a single split index.
splits = filter(lambda x: x>=0, map(testline.lower().find, map(lambda y: y.lower()+':', candidates+moderators)))
#splits += filter(lambda x: x>=0, map(testline.lower().find, map(lambda y: '('+y.lower(), audience)))
#splits += filter(lambda x: x>=0, map(testline.lower().find, map(lambda y: y.lower()+')', audience)))
#splits += map(lambda x: len(testline)-len(x[0]), map(testline.lower().split, map(lambda y: y.lower()+')', audience)))
#splits += filter(lambda x: x>0, map(lambda x: len(testline)-len(x[0]), map(testline.lower().split, map(lambda y: y.lower()+')', audience))))
#splits += map(lambda z: len(testline)-len(z[1]), 
#              filter(lambda x: len(x)>1, 
#                     map(testline.lower().split, 
#                         map(lambda y: y.lower()+')', audience))))
# NOTE(review): this adds the lists returned by str.split(), not integer indices, whenever an
# audience keyword followed by ')' is present; the slicing loop below expects integers.
splits += filter(lambda z: len(z)>1, [testline.split(x) for x in [y.lower()+')' for y in audience]])

#splits += map(testline.lower().split, map(lambda y: y.lower()+')', audience))
#splits += filter(lambda z: len(z)>1, map(testline.lower().split, map(lambda y: y.lower()+')', audience)))

splits.sort()
print 'splits=', splits

#c = [(m.start(), m.end()-1) for m in re.finditer(r'\S+', a)]
# Cut the line at each split index; the remainder after the last index is the final chunk.
chunks = []
ix_end = 0
for ix in splits:
    ix_start = ix_end
    ix_end = ix
    chunks.append( testline[ix_start:ix_end] )
chunks.append( testline[ix_end:len(testline)] )
    
print chunks


splits= [0]
['', 'Trump: blah blah. Trump: blah.']


In [54]:
# Scratch check: which known names / audience keywords occur in a line (case-insensitive substring test).
# Speaker names must be followed by ':'; audience keywords match anywhere in the line.
# Relies on the candidates, moderators and audience lists currently in the kernel.
testline = "blah CARSON: blah. TRUMP: Blah blah (Laughter, booing)"
print filter(lambda x: x.lower()+':' in testline.lower(), candidates+moderators)
print filter(lambda x: x.lower() in testline.lower(), audience)
print ( filter(lambda x: x.lower()+':' in testline.lower(), candidates+moderators) + 
       filter(lambda x: x.lower() in testline.lower(), audience) )


['TRUMP']
['BOOING', 'LAUGHTER', 'booing', 'laughter']
['TRUMP', 'BOOING', 'LAUGHTER', 'booing', 'laughter']


In [70]:
# Scratch check: True when the line starts with any name in the kernel's candidates list
# (case-sensitive startswith, OR-ed together with the Python 2 builtin reduce()).
testline = "TRUMP: Blah blah"
reduce(lambda x,y: x or y, map(testline.startswith, candidates))

True

In [65]:
# Scratch check of is_new_speaker() on a pure audience-event line.
# NOTE(review): is_new_speaker is not defined in any visible cell of this notebook, so this cell
# presumably depends on a deleted cell and would fail on a fresh kernel. TODO confirm.
#testline = "TRUMP: I will not make the pledge at this time."
testline = "(LAUGHTER, BOOING)"
print is_new_speaker(testline)

False


In [4]:
# Create dataframe for debate info.
#cols = ['text', 'speaker', 'speaker_type', 'time', 'duration', 'topic']
#df = pd.DataFrame(columns=cols)
#df = df.append({'name': 'Zed', 'age': 9, 'height': 2}, ignore_index=True)
#df.head()

Unnamed: 0,text,speaker,speaker_type,time,duration,topic


In [104]:
# Scratch: inspect a frame named df and save it to csv.
# NOTE(review): no visible cell assigns a notebook-level df (it is only a local inside
# process_transcript), so this presumably depends on earlier kernel state. TODO confirm.
# Only the last expression of a cell is displayed, so shape and tail(10) show nothing here.
df.shape
df.tail(10)
df.to_csv("transcript_v2_proc.csv")

In [75]:
# Scratch: walk a small test transcript, print each non-empty line with its length, and report
# whether is_new_speaker() flags it.
# NOTE(review): is_new_speaker is not defined in any visible cell of this notebook. TODO confirm.
with open('transcript_test.txt') as infile:
    for line in infile:
        line = line.strip()
        if len(line)==0:
            continue
        print line, len(line)
        if is_new_speaker(line):
            print "new_speaker!"
        else:
            print "no new speaker"

TRUMP: I will not make the pledge at this time. 47
new_speaker!
BAIER: OK. Alright. 19
new_speaker!
Enough. 7
no new speaker
KELLY: Gentlemen, our first round of questions is on the subject of electability in the general election. 105
new_speaker!
and we start tonight with you, Dr. Carson. 42
no new speaker


In [40]:
# Scratch check: list.sort() sorts in place, in ascending order.
test_list = [4,6,3]
test_list.sort()
print test_list

[3, 4, 6]


In [25]:
# Scratch prototype of correct_spellings(): for each correct word, build every variant with one
# interior letter removed and replace it (case-insensitively) with the correct word.
# Uses re, which is imported only in cells further down this notebook.
correct_words = ['TRUMP', 'CARSON', "WALLACE"]
test_string = 'TUMP: blah. CRSON: blah. Trump: blah. WAlace: blah'
for correct_word in correct_words:
    word_len = len(correct_word)
    print correct_word, word_len
    # Index 0 and the last index are skipped, so the first and last letters are never removed.
    for i in range(1,word_len-1):
        #print i
        missp = correct_word[0:i]+correct_word[i+1:word_len]
        print missp
        #test_string.replace(missp, correct_word)
        pattern = re.compile(missp, re.IGNORECASE)
        test_string = pattern.sub(correct_word, test_string)
print test_string

TRUMP 5
TUMP
TRMP
TRUP
CARSON 6
CRSON
CASON
CARON
CARSN
WALLACE 7
WLLACE
WALACE
WALACE
WALLCE
WALLAE
TRUMP: blah. CARSON: blah. Trump: blah. WALLACE: blah


In [20]:
# Scratch check: case-insensitive replacement of a single misspelling with re.sub.
test_line = "TUMP: blah"
import re
pattern = re.compile("TUMP", re.IGNORECASE)
test_line = pattern.sub("TRUMP", test_line)
print test_line

TRUMP: blah


In [15]:
# Scratch check: a pattern compiled with re.IGNORECASE replaces every case variant of the word.
import re
pattern = re.compile("hello", re.IGNORECASE)
pattern.sub("bye", "hello HeLLo HELLO")
## 'bye bye bye'

'bye bye bye'

In [62]:
# Scratch check: split(sep, 1)[0] returns the text before the first occurrence of sep.
my_string="hello python world , i'm a beginner "
print my_string.split("world",1)[0] 

hello python 


In [86]:
# Scratch check: len() of a two-element list.
a = ['a', 'b']
len(a)

2