### Section 1: Data extraction from ConvoKit's Switchboard corpus






In [1]:
### Imports 
import os
import re

import pandas as pd
from convokit import Corpus, download

In [2]:
# for pretty printing of cells within the Colab version of this notebook
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that makes <pre> blocks wrap long lines."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))


# Hook the styling into IPython's 'pre_run_cell' event so it is re-applied
# for every cell that gets executed.
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
### Globals

# Name of the ConvoKit dataset; it is passed to download() when the corpus is loaded.
corpus_name = "switchboard-corpus"

In [4]:
# Fetch the dataset named by corpus_name and load it as a Corpus. The output
# below shows download() reusing a copy already present under ~/.convokit/downloads.
swbc = Corpus(filename=download(corpus_name))

Dataset already exists at /Users/muhammadumair/.convokit/downloads/switchboard-corpus


In [5]:
# The corpus has three components: Conversations, Utterances, and Speakers.
# Each component has primary data fields and metadata.
# print_summary_stats() writes the counts itself and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
swbc.print_summary_stats()

Number of Speakers: 440
Number of Utterances: 122646
Number of Conversations: 1155
None


In [7]:
### Extract all the utterances and their tags from every conversation.
### (The resulting frames are cleaned and written to CSV in later cells.)

# One utterance-level dataframe per conversation, in the order the corpus
# reports its conversation ids.
conversation_ids = swbc.get_conversation_ids()
conv_utt_dataframes = [
    swbc.get_conversation(id_).get_utterances_dataframe()
    for id_ in conversation_ids
]

# Flatten every conversation into one row per tagged segment. Each entry of the
# "meta.tag" column is iterated as a sequence of items whose element 0 is stored
# as the utterance text and whose element 1 is stored as its tag.
conversation_dfs = list()
for conv_utt_df in conv_utt_dataframes:
    # Key order fixes the column order of the resulting frame: tags, utterances.
    data = {"tags": list(), "utterances": list()}
    # Only the "meta.tag" column is needed, so iterate it directly instead of
    # walking whole rows with iterrows().
    for tagged_utts in conv_utt_df["meta.tag"]:
        for tagged_utt in tagged_utts:
            data["utterances"].append(tagged_utt[0])
            data["tags"].append(tagged_utt[1])
    conversation_dfs.append(pd.DataFrame(data))



In [8]:
# Cleaning up the utterances: every non-alphabetic character is removed from
# each whitespace-separated token, and tokens left empty are discarded.
def _clean_utterance(text):
    """Return `text` with only alphabetic characters kept in each token.

    The text is split on whitespace, each token keeps only the characters for
    which str.isalpha() is true, tokens that end up empty are dropped, and the
    remaining tokens are joined with single spaces.
    """
    stripped_tokens = (
        "".join(ch for ch in token if ch.isalpha()) for token in text.split()
    )
    return " ".join(token for token in stripped_tokens if token)


for df in conversation_dfs:
    # Series.map visits every row whatever the index labels are, instead of
    # assuming the labels are exactly 0..len-1 as a df.at[j, ...] counter does.
    df["utterances"] = df["utterances"].map(_clean_utterance)

    

In [9]:
# Replacing every tag that is not in supported_tags with the noise tag.
supported_tags = ("sd","b","sv", "aa", "%", "ba","qy","x","ny","fc","qw","nn","bk","h","qy^d","fo_o_fw_by_bc",
                 "bh", "^q","bf","na","ad","^2","b^m","qo","qh","^h","ar","ng","bt","no","fp","qrr","arp_nd",
                 "t3","oo_co_cc","t1","bd","aap_am","^g","qw^d","fa","ft")
noise_tag = "noise"
for df in conversation_dfs:
    # Series.map rewrites the whole column without relying on the index labels
    # being 0..len-1, which a df.at[j, ...] counter loop silently assumes.
    df["tags"] = df["tags"].map(
        lambda tag: tag if tag in supported_tags else noise_tag
    )


In [10]:
# Saving one CSV file per conversation to the output directory.

# NOTE: Path changes per user
output_dir_path = "/Users/muhammadumair/Documents/Repositories/mumair01-Repos/repair-tagging/data"

# Create the directory up front so the first to_csv() call cannot fail on a
# missing folder.
os.makedirs(output_dir_path, exist_ok=True)

# conversation_dfs was built by iterating conv_utt_dataframes, so zipping the
# two lists pairs each processed frame with the raw frame it came from.
for df, conv_utt_df in zip(conversation_dfs, conv_utt_dataframes):
    # .iloc[0] reads the first row by position. Indexing with [0] is a label
    # lookup that only works through a deprecated positional fallback when the
    # index has no label 0.
    conversation_id = conv_utt_df["conversation_id"].iloc[0]
    file_path = "{}/{}.csv".format(output_dir_path, conversation_id)
    df.to_csv(file_path)
    
