In [17]:
import csv
import re

# NOTE(review): this handle is never read or closed in the cells visible here, and the
# next cell reopens the same path itself — confirm whether this line is still needed.
file = open('switchboard2.utt')

In [27]:
# Strip the speaker names and other non-dialogue information so that only the
# dialogue text is stored, one utterance per output line.

input_file = 'switchboard2.utt'

output_file = 'dialogues.txt'

# The `with` statement closes both files on exit (including on error), so no
# explicit close() calls are needed afterwards.
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        parts = line.split(':')
        # Only lines containing exactly one ':' are kept; all others are skipped.
        if len(parts) == 2:
            # Keep the text after the ':' up to (not including) the first '/'.
            dialogue = parts[1].split('/')[0].strip()
            out_file.write(dialogue + '\n')


In [28]:
# Remove curly braces together with their contents. Per the author's note these
# spans hold "uh"/"um"-type disfluencies, which are dropped because this
# pipeline concentrates on repair-type disfluencies.

input_file = 'dialogues.txt'

output_file = 'modified_dialogues.txt'

def remove_curly_braces(text):
    """Return `text` with every `{...}` span (braces and contents) removed.

    Nesting is tracked with a depth counter, so for `{a {b} c}` the whole
    outer span is dropped instead of output resuming after the first `}`.
    A `}` with no matching `{` is discarded and otherwise ignored. A `{` that
    is never closed swallows the remainder of `text`.

    Args:
        text: The string to clean.

    Returns:
        The characters of `text` that lie outside all brace pairs.
    """
    depth = 0  # number of '{' currently open
    kept = []
    for char in text:
        if char == '{':
            depth += 1
        elif char == '}':
            # Clamp at zero so a stray '}' cannot corrupt later matching.
            if depth:
                depth -= 1
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)

with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        modified_line = remove_curly_braces(line)
        # An unclosed '{' makes remove_curly_braces drop the rest of the line,
        # newline included. Put the newline back so that two utterances are
        # never merged onto one output line.
        if line.endswith('\n') and not modified_line.endswith('\n'):
            modified_line += '\n'
        out_file.write(modified_line)
# Both files are closed by the `with` statement; no explicit close() is needed.


In [29]:
# Remove punctuation marks: commas, full stops and '--' dashes.
input_file = 'modified_dialogues.txt'

output_file = 'cleaned_dialogues.txt'

# Substrings deleted from every line, applied in this order.
chars_to_remove = [',', '.', '--']

# The `with` statement closes both files on exit, so no explicit close() calls
# are needed afterwards.
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        modified_line = line
        for char in chars_to_remove:
            modified_line = modified_line.replace(char, '')  # Remove the specified characters

        out_file.write(modified_line)


In [30]:
# Lowercase all dialogue.
input_file = 'cleaned_dialogues.txt'

output_file = 'lowercase_dialogues.txt'

# The `with` statement closes both files on exit, so no explicit close() calls
# are needed afterwards.
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        # Lowercase all characters in the line (the trailing newline is unaffected).
        modified_line = line.lower()
        out_file.write(modified_line)


In [34]:
# Remove unnecessary whitespace: collapse every run of whitespace into a single
# space and trim both ends of each line.
input_file = 'lowercase_dialogues.txt'

output_file = 'someBracketsLeft.txt'

# The `with` statement closes both files on exit, so no explicit close() calls
# are needed afterwards.
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        # split() with no argument splits on any whitespace run and drops empty
        # pieces, which also removes the original newline; it is re-added below.
        cleaned_line = ' '.join(line.split())
        out_file.write(cleaned_line + '\n')



In [1]:
# Remove the remaining annotations that are irrelevant to this task:
# text enclosed in <...> and text enclosed between a pair of '#' symbols.

input_file = 'someBracketsLeft.txt'

output_file = 'onlyDisLeft.txt'

def remove_angle_brackets(text):
    """Return `text` with every `<...>` span (brackets and contents) removed.

    Nested brackets are handled with a depth counter. The counter is clamped
    at zero: a `>` with no matching `<` is simply discarded. Without the clamp
    a stray `>` would push the depth negative, after which ordinary text would
    be dropped and the contents of the next `<...>` span would be kept.
    A `<` that is never closed swallows the remainder of `text`.

    Args:
        text: The string to clean.

    Returns:
        The characters of `text` that lie outside all angle-bracket pairs.
    """
    depth = 0  # number of '<' currently open
    kept = []
    for char in text:
        if char == '<':
            depth += 1
        elif char == '>':
            if depth > 0:
                depth -= 1
        elif depth == 0:
            kept.append(char)
    return ''.join(kept)

def remove_double_hashtags(text):
    """Return `text` without the spans delimited by pairs of '#' symbols.

    The '#' characters alternate between opening and closing a span. Text
    inside a span is dropped, as are the '#' symbols themselves. Whitespace
    immediately before an opening '#' is trimmed. An opening '#' with no
    closing partner removes everything up to the end of `text`.
    """
    pieces = text.split('#')
    final_index = len(pieces) - 1
    kept = []
    for index, piece in enumerate(pieces):
        if index % 2:
            # Odd-numbered pieces sit between an opening and a closing '#'.
            continue
        # A kept piece that is followed by another piece ends at an opening
        # '#', so its trailing whitespace is trimmed; the last piece is kept as is.
        kept.append(piece.rstrip() if index < final_index else piece)
    return ''.join(kept)

with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        had_newline = line.endswith('\n')
        line = remove_double_hashtags(line)  # Remove content enclosed by two '#' symbols
        cleaned_line = remove_angle_brackets(line)  # Remove content enclosed by '<' and '>'
        # An unmatched '#' or an unclosed '<' makes the helpers drop the rest of
        # the line, newline included. Put the newline back so that two
        # utterances are never merged onto one output line.
        if had_newline and not cleaned_line.endswith('\n'):
            cleaned_line += '\n'
        out_file.write(cleaned_line)
# Both files are closed by the `with` statement; no explicit close() is needed.


In [3]:
# Add an end-of-sentence marker (' $') to every line.
input_file = 'onlyDisLeft.txt'

output_file = 'sentence-marked.txt'

# The `with` statement closes both files on exit, so no explicit close() calls
# are needed afterwards.
with open(input_file, 'r') as in_file, open(output_file, 'w') as out_file:
    for line in in_file:
        # Trim trailing whitespace (including the newline), then add ' $'.
        line_with_dollar = line.rstrip() + ' $'
        out_file.write(line_with_dollar + '\n')



In [2]:
# Generate (token, tag) tuples annotated for disfluency detection.
#   'O'   - token outside any [ ... ] bracket
#   'EOS' - the '$' end-of-sentence marker, outside any bracket
#   'RM'  - reparandum: word inside a bracket, before the '+'
#   'RP'  - repair: word inside a bracket, after the '+'

# Input file
input_file = 'sentence-marked.txt'

# List of (token, tag) pairs, in the order the tokens appear in the input
word_classification = []


def _is_word(token):
    """Return True when `token` contains at least one alphabetic character.

    Used instead of str.isalpha() so that words carrying an apostrophe or other
    punctuation (e.g. "it's", "home?") are tagged inside brackets rather than
    silently discarded, while pure markers such as '[', '+', '-' and '$' are
    still skipped.
    """
    return any(ch.isalpha() for ch in token)


with open(input_file, 'r') as in_file:
    # Both flags are initialised once, before the loop over lines, so a bracket
    # left open at the end of one line is still open on the next line.
    inbracket = False
    beforePlus = True

    for line in in_file:
        for token in line.split():  # whitespace-separated tokens
            if token == '[':
                inbracket = True
            if token == ']':
                inbracket = False
                beforePlus = True
                continue

            if not inbracket:
                if token == '$':
                    word_classification.append((token, 'EOS'))  # end-of-sentence marker
                else:
                    # every other token outside a bracket gets the marker O
                    word_classification.append((token, 'O'))
            elif beforePlus:
                if _is_word(token):
                    word_classification.append((token, 'RM'))  # RM is the reparandum
                elif token == '+':
                    beforePlus = False
            elif _is_word(token):
                word_classification.append((token, 'RP'))  # RP is the repair


print(word_classification)

                
                


[('okay', 'O'), ('$', 'EOS'), ('first', 'O'), ('i', 'O'), ('need', 'O'), ('to', 'O'), ('know', 'O'), ('how', 'O'), ('do', 'O'), ('you', 'O'), ('feel', 'O'), ('about', 'RM'), ('about', 'RP'), ('sending', 'O'), ('an', 'O'), ('elderly', 'O'), ('family', 'O'), ('member', 'O'), ('to', 'O'), ('a', 'O'), ('nursing', 'O'), ('home?', 'O'), ('$', 'EOS'), ('of', 'O'), ('course', 'O'), ('one', 'O'), ('of', 'O'), ('the', 'O'), ('last', 'O'), ('few', 'O'), ('things', 'O'), ('in', 'O'), ('the', 'O'), ('world', 'O'), ("you'd", 'O'), ('ever', 'O'), ('want', 'O'), ('to', 'O'), ('do', 'O'), ('unless', 'O'), ("it's", 'O'), ('just', 'O'), ('really', 'O'), ('and', 'O'), ('for', 'RM'), ('their', 'RM'), ('for', 'RP'), ('their', 'RP'), ('own', 'O'), ('good', 'O'), ('$', 'EOS'), ('yes', 'O'), ('$', 'EOS'), ('yeah', 'O'), ('$', 'EOS'), ("i'd", 'O'), ('be', 'O'), ('very', 'O'), ('very', 'O'), ('careful', 'O'), ('and', 'RM'), ('checking', 'O'), ('them', 'O'), ('out', 'O'), ('$', 'EOS'), ('our', 'O'), ('-', 'O'), (

In [3]:
# Write the annotated tuples to a file, one tuple repr per line.

output_file = 'preprocesslist.txt'
# Open via `output_file` so the path is defined in exactly one place.
with open(output_file, 'w') as file:
    for i in word_classification:
        file.write(str(i) + '\n')
