# IQ2 Dataset to Convokit format conversion script
Marianne Aubin Le Quere and Lucas Van Bramer

In [1]:
# if needed, change directory to find convokit location
#
# The checkout location can be overridden without editing this cell by setting
# the CONVOKIT_DIR environment variable before starting the kernel; the default
# below is the original author's local path.

# import required modules and set up environment
import os

# replace file path below with your own local convokit (or set CONVOKIT_DIR)
CONVOKIT_DIR = os.environ.get(
    "CONVOKIT_DIR",
    '/Users/marianneaubin/Documents/Classes/CS6742/Cornell-Conversational-Analysis-Toolkit',
)
# convokit is imported from the working directory, so the chdir must come first
os.chdir(CONVOKIT_DIR)
import convokit

In [2]:
from convokit import Corpus, User, Utterance
from tqdm import tqdm
import json

In [3]:
# generates all of the users who are listed in the metadata of a specific debate's "speakers" field
# args: debate_id is a key used by the iq2 dataset, e.g. "PerformanceEnhancingDrugs-011508"
# returns: a dictionary in which keys are speakers' full names and values are dictionaries containing metadata
def generate_users(debate_id, dataset=None):
    """Collect metadata for every speaker listed for one debate.

    Args:
        debate_id: key used by the iq2 dataset,
            e.g. "PerformanceEnhancingDrugs-011508".
        dataset: optional parsed iq2 dataset (dict keyed by debate id). When
            omitted, the notebook-level ``iq2`` variable is used, which keeps
            existing one-argument calls working.

    Returns:
        dict mapping each speaker's name to a metadata dict with the keys
        "stance", "bio" and "bio_short". Debaters get stance "for" or
        "against"; the moderator and the catch-all "audience" user get
        stance None, and "audience" has no bio.
    """
    # Only fall back to the global when no dataset was passed in, so the
    # function no longer breaks if `iq2` is rebound to something else later.
    source = iq2 if dataset is None else dataset
    speakers = source[debate_id]["speakers"]

    users = {}
    for stance in ["for", "against"]:
        for speaker in speakers[stance]:
            users[speaker["name"]] = {
                "stance": stance,
                "bio": speaker["bio"],
                "bio_short": speaker["bio_short"],
            }

    mod = speakers["moderator"]
    users[mod["name"]] = {
        "bio": mod["bio"],
        "bio_short": mod["bio_short"],
        "stance": None,
    }

    # placeholder user shared by all turns whose speaker is not identified
    users["audience"] = {"bio": None, "bio_short": None, "stance": None}
    return users

In [4]:
# generates all of the users in the iq2 dataset 
# args: dataset is the python object containing the iq2 dataset parsed from json
# returns: a dictionary in which keys are speakers' full names and values are convokit User objects carrying that speaker's metadata
def generate_all_users_convokit(dataset):
    """Build one convokit User per distinct speaker name in the dataset.

    Args:
        dataset: parsed iq2 dataset; only its keys (debate ids) are used here,
            each one being handed to generate_users.

    Returns:
        dict mapping speaker name to a User created with that name and the
        speaker's metadata dict. When a name occurs in several debates, the
        metadata from the debate visited last is the one that is kept.
    """
    merged_meta = {}
    for debate_id in dataset.keys():
        # later debates overwrite earlier entries that share a name
        merged_meta.update(generate_users(debate_id))

    print("{} users generated.".format(len(merged_meta)))

    return {name: User(name=name, meta=meta) for name, meta in merged_meta.items()}

In [5]:
# generates all of the convokit utterances from the iq2 dataset 
# args: dataset is the python object containing the iq2 dataset parsed from json
# precondition: corpus_users must be populated in the jupyter environment because it is called from and modified here
# returns: convokit corpus representation of iq2 dataset
def generate_utterance_corpus_from_dataset(dataset, v):
    """Convert the parsed iq2 dataset into a convokit Corpus.

    Every transcript turn becomes one Utterance. Utterance ids are consecutive
    integers (stored as strings) assigned across the whole dataset, and each
    utterance's root is the id of the first turn of its debate.

    Args:
        dataset: parsed iq2 dataset; a dict keyed by debate id whose values
            hold a "transcript" list of turns.
        v: value forwarded to Corpus as ``version``.

    Precondition:
        ``corpus_users`` (speaker name -> User) must exist in the notebook
        namespace. It is read here and extended in place whenever an utterance
        needs a user that is not registered yet.

    Returns:
        Corpus built from all generated utterances, in transcript order.
    """
    utt_id = 0
    utterances = []
    for conversation_id, conversation in dataset.items():
        # root of the conversation is the first utterance id in the conversation
        convo_root = utt_id
        for turn in conversation["transcript"]:
            meta = {
                "nontext": turn["nontext"],
                "segment": turn["segment"],
                "speakertype": turn["speakertype"],
                "debateid": conversation_id,
            }

            # Each turn replies to the previous turn of the SAME debate. The
            # first turn of a debate replies to nothing (None) rather than to
            # "-1" or to the last turn of the preceding debate, and the
            # reference is a string so that it matches the string ids above.
            reply_to = None if utt_id == convo_root else str(utt_id - 1)

            # text is originally stored as a list of strings; this concatenates
            # them into one string
            # NOTE(review): paragraphs are joined with no separator, so
            # adjacent paragraphs run together — confirm this is intended.
            fulltext = "".join(turn["paragraphs"])

            # "unknown" speakers are generally the audience
            speaker = turn["speaker"] if turn["speakertype"] != "unknown" else "audience"

            # Register the user that is actually attached to the utterance.
            # Checking the resolved name (not the raw turn speaker) avoids
            # creating unused users for "unknown" turns and guarantees that
            # the lookup below cannot fail.
            if speaker not in corpus_users:
                corpus_users[speaker] = User(
                    name=speaker,
                    meta={"stance": None, "bio": None, "bio_short": None},
                )

            utterances.append(
                Utterance(str(utt_id),
                          corpus_users[speaker],
                          str(convo_root),
                          reply_to,
                          None,  # timestamp: not set by this conversion
                          fulltext,
                          meta=meta
                         )
            )
            utt_id += 1

    return Corpus(utterances=utterances, version=v)

In [6]:
# replace open location with where the dataset is

!pwd
file = open('../IQ2/iq2_data_release/iq2_data_release.json')
iq2 = json.load(file)
print(str(len(iq2.keys())) + " debates loaded.")

/Users/marianneaubin/Documents/Classes/CS6742/Cornell-Conversational-Analysis-Toolkit
108 debates loaded.


In [7]:
# Build the name -> User lookup first: generate_utterance_corpus_from_dataset
# reads (and extends) the notebook-level corpus_users while it runs.
corpus_users = generate_all_users_convokit(iq2)
# The second argument is forwarded to Corpus as its version.
iq2_corpus = generate_utterance_corpus_from_dataset(iq2, 1)
# Human-readable corpus name stored in the corpus-level metadata.
iq2_corpus.meta["name"] = "IQ2 Debate Corpus"

470 users generated.


In [8]:
# this function determines, given a conversation, which is the winner

def determine_winner(conversation):
    """Name the side whose vote share grew the most over the debate.

    Args:
        conversation: object whose ``meta["results"]`` holds "pre" and "post"
            entries, each with a "for" and an "against" value.

    Returns:
        "for" or "against" for the side with the strictly larger post-minus-pre
        change, or "tie" when neither change is strictly larger.
    """
    poll = conversation.meta["results"]
    for_gain = poll["post"]["for"] - poll["pre"]["for"]
    against_gain = poll["post"]["against"] - poll["pre"]["against"]

    if for_gain > against_gain:
        return "for"
    if against_gain > for_gain:
        return "against"
    return "tie"

In [9]:
# this generates conversation metadata based on each debate and updates the corpus conversation instances
for conv_id in iq2_corpus.get_conversation_ids():
    conv = iq2_corpus.get_conversation(conv_id)

    # every utterance records its debate id in its metadata, so the first
    # utterance is enough to find the source debate in the raw dataset
    first_utt = iq2_corpus.get_utterance(conv.get_utterance_ids()[0])
    debate = iq2[first_utt.meta["debateid"]]

    # copy the debate-level fields over unchanged, in this order
    debate_meta = {
        field: debate[field]
        for field in ("summary", "title", "date", "url", "results", "speakers")
    }

    # determine_winner reads conv.meta["results"], so the metadata has to be
    # attached before the winner can be computed
    conv.meta = debate_meta
    debate_meta["winner"] = determine_winner(conv)
    # assigned again after adding the winner, as in the original
    # (presumably in case the assignment stores a copy — TODO confirm)
    conv.meta = debate_meta

In [14]:
# prints summary stats and dumps the corpus to file
# (user / utterance / conversation counts are printed below the cell)
iq2_corpus.print_summary_stats()
# written under ./datasets/iq2_corpus/ with the corpus name "iq2_corpus";
# the reload cell further down reads from this same location
iq2_corpus.dump("iq2_corpus", base_path='./datasets/iq2_corpus/' )

Number of Users: 471
Number of Utterances: 26562
Number of Conversations: 108


In [19]:
# Reload the corpus that was dumped above.
# NOTE(review): this rebinds `iq2`, which the earlier cells use for the raw
# JSON dict (generate_users and the conversation-metadata loop index into it).
# Re-running those cells after this one would hit the Corpus object instead;
# consider a distinct name here and in the cell below.
iq2 = convokit.Corpus(filename='datasets/iq2_corpus/iq2_corpus')

In [21]:
# Tally debate outcomes from the reloaded corpus: collect each conversation's
# recorded winner once, then count per outcome.
winners = [iq2.get_conversation(conv_id).meta['winner'] for conv_id in iq2.conversations]

num_for = sum(1 for outcome in winners if outcome == 'for')
num_against = sum(1 for outcome in winners if outcome == 'against')
# anything that is neither 'for' nor 'against' is counted as a tie
num_tie = len(winners) - num_for - num_against

print("num of for winners is {}".format(num_for))
print("num of against winners is {}".format(num_against))
print("num of ties is {}".format(num_tie))

num of for winners is 52
num of against winners is 53
num of ties is 3
