# Explore Dataset
## IQ2 Debates
*Marianne Aubin Le Quere and Lucas Van Bramer*

We are seeking to explore the basic tenets of our dataset. We will later be exploring complexity of language, so we will focus on something that comes quite close to this: word and sentence length.

## 1. Get averages
First, we want to gather some basic average facts:
  * What is the average word length in a debate?
  * What is the average sentence length in a debate?
  * What is the average utterance length in a debate?
  * What is the variability of word, sentence, or utterance length depending on the segment of a debate?
  * What is the variability of word, sentence, or utterance length depending on the speaker type of an utterance?

In [150]:
# import required modules and set up environment
import json
import os
import re

# replace file path below with your own local convokit
os.chdir('/Users/marianneaubin/Documents/Classes/CS6742/Cornell-Conversational-Analysis-Toolkit')
import convokit

In [151]:
# open created IQ2 corpus
corpus = convokit.Corpus(filename='datasets/iq2_corpus/iq2_corpus')

In [152]:
# print basic info about the corpus
corpus.print_summary_stats()

Number of Users: 471
Number of Utterances: 26562
Number of Conversations: 108


In [153]:
import re

# for each utterance, calculate how many words and sentences are in the utterance.
utter_ids = corpus.get_utterance_ids()
word_counts = []
sentence_counts = []
num_sentences = 0
for utt_id in utter_ids:
    utt = corpus.get_utterance(utt_id)
    
    #we simply use spaces to delineate words
    words = utt.text.split()
    word_count = len(words)
    word_counts.append(word_count)
    
    #we use regex to separate sentences
    sentences = re.split(r'[.!?]+', utt.text)
    sentences = list(filter(None, sentences))
    for sentence in sentences:
        words_in_sentence = sentence.split()
        words_in_sentence_count = len(words_in_sentence)
        sentence_counts.append(words_in_sentence_count)
        num_sentences = num_sentences + 1;
        
# get average word count per utterance
word_len_sum = sum(word_counts)
utt_num = len(list(corpus.iter_utterances()))
avg_word_len = word_len_sum/utt_num
print("average number of words per utterance is " + str(round(avg_word_len,2)))

# get average word count per sentence
word_sent_len_sum = sum(sentence_counts)
avg_sentence_len = word_sent_len_sum/num_sentences
print("average number of words per sentence is " + str(round(avg_sentence_len, 2)))

average number of words per utterance is 71.18
average number of words per sentence is 14.61


In [339]:
# function get_averages
# this function will count the averages of a number of words for a specific scenario
#    inputs: list of utterances, instance of corpus
#    outputs: average number of words per utterance, average number of words per sentence

def get_averages(utt_list, corp): 
    word_counts = []
    sentence_counts = []
    letter_counts = []
    num_sentences = 0
    for utt_id in utt_list:
        utt = corp.get_utterance(utt_id)
    
        #we simply use spaces to delineate words
        words = utt.text.split()
        word_count = len(words)
        word_counts.append(word_count)
        
        for word in words:
            word_length = len(word)
            letter_counts.append(word_length)
        
        #we use regex to separate sentences
        sentences = re.split(r'[.!?]+', utt.text)
        sentences = list(filter(None, sentences))
        for sentence in sentences:
            words_in_sentence = sentence.split()
            words_in_sentence_count = len(words_in_sentence)
            sentence_counts.append(words_in_sentence_count)
            num_sentences = num_sentences + 1;
        
    # get average word count per utterance
    word_sum = sum(word_counts)
    utt_num = len(utt_list)
    avg_word_len = word_sum/utt_num
    print("average number of words per utterance is " + str(round(avg_word_len,2)))

    # get average word count per sentence
    word_sent_len_sum = sum(sentence_counts)
    avg_sentence_len = word_sent_len_sum/num_sentences
    print("average number of words per sentence is " + str(round(avg_sentence_len, 2)))
    
    # get average letter count per word
    word_len_sum = sum(letter_counts)
    avg_word_len = word_len_sum/word_sum
    print("average number of letters per word is " + str(round(avg_word_len,2)))
    
    return avg_word_len, avg_sentence_len

In [336]:
# function get_segment_utterances
# this function takes in a whole corpus and will return a list of utterances
# only for the segment specified
#    inputs: corpus, desired segment
#    outputs: list of utterances in a segment

def get_segment_utterances(corp, seg):
    seg_utter_ids = []
    utter_ids = corp.get_utterance_ids()
    for utt_id in utter_ids:
        segment = corp.get_utterance(utt_id).meta['segment']
        if segment == seg:
            seg_utter_ids.append(utt_id)
    return seg_utter_ids;

In [340]:
print("overall segment stats:")
avg_word_len, avg_sentence_len = get_averages((corpus.get_utterance_ids()), corpus)

print("\nintro segment stats:")
#get intro segment id
seg_utt_1 = get_segment_utterances(corpus, 0)
avg_word_len_1, avg_sentence_len_1 = get_averages(seg_utt_1, corpus)

print("\ndiscussion segment stats:")
#get discussion segment id
seg_utt_2 = get_segment_utterances(corpus, 1)
avg_word_len_2, avg_sentence_len_2 = get_averages(seg_utt_2, corpus)

print("\nconclusion segment stats:")
#get conclusion segment id
seg_utt_3 = get_segment_utterances(corpus, 2)
avg_word_len_3, avg_sentence_len_3 = get_averages(seg_utt_3, corpus)

overall segment stats:
average number of words per utterance is 71.18
average number of words per sentence is 14.61
average number of letters per word is 4.67

intro segment stats:
average number of words per utterance is 219.26
average number of words per sentence is 15.63
average number of letters per word is 4.75

discussion segment stats:
average number of words per utterance is 40.0
average number of words per sentence is 13.73
average number of letters per word is 4.61

conclusion segment stats:
average number of words per utterance is 161.29
average number of words per sentence is 14.9
average number of letters per word is 4.68


In [272]:
# function: get_stance
# function that will yield whether a given utterance was from a 'for' 'against' or neutral side
#    inputs: utterance
#    outputs: 0 for neutral, 1 for 'for', -1 for 'against'

def get_stance(utterance):
        
    stance = utterance.user.meta['stance']
    
    if stance == 'for':
        return 1
    elif stance == 'against':
        return -1
    else:
        return 0

In [271]:
# function: get_winner
# function that will yield whether a given utterance was from a winner or from a loser
#    inputs: utterance
#    outputs: 0 for tie, 1 for winner, -1 for loser, -2 for neutral

def get_winner(corp, utterance):
    
    root = utterance.root
    results = corp.conversations[root].meta['results']
    
    #if delta1 > delta2, the winning side was against
    #if delta2 > delta1, the winning side was for
    delta1 = int(results['post']['against']) - int(results['pre']['against'])
    delta2 = int(results['post']['for']) - int(results['pre']['for'])
    
    #if the stance of speaker is against
    #and against won, return 1
    
    stance = get_stance(utterance)
    
    #if stance is neutral, return -2
    if stance == 0:
        return -2
    
    #if stance is against, determine if the against side won
    if stance == -1:
        if delta1 > delta2:
            return 1
        elif delta2 > delta1:
            return -1
        else:
            return 0
    else:
        if delta1>delta2:
            return -1
        elif delta2>delta1:
            return 1
        else:
            return 0
        
    return None

In [343]:
def get_utt_averages(utt_list):
    word_counts = []
    sentence_counts = []
    num_sentences = 0
    letter_counts = []
    
    for utt in utt_list:
        
        #we simply use spaces to delineate words
        words = utt.text.split()
        word_count = len(words)
        word_counts.append(word_count)
        
        for word in words:
            word_length = len(word)
            letter_counts.append(word_length)
        
        #we use regex to separate sentences
        sentences = re.split(r'[.!?]+', utt.text)
        sentences = list(filter(None, sentences))
        for sentence in sentences:
            words_in_sentence = sentence.split()
            words_in_sentence_count = len(words_in_sentence)
            sentence_counts.append(words_in_sentence_count)
            num_sentences = num_sentences + 1;
            
    # get average word count per utterance
    word_sum = sum(word_counts)
    utt_num = len(utt_list)
    avg_word_len = word_sum/utt_num
    print("average number of words per utterance is " + str(round(avg_word_len,2)))

    # get average word count per sentence
    word_sent_len_sum = sum(sentence_counts)
    avg_sentence_len = word_sent_len_sum/num_sentences
    print("average number of words per sentence is " + str(round(avg_sentence_len, 2)))
    
    # get average letter count per word
    word_len_sum = sum(letter_counts)
    avg_word_len = word_len_sum/word_sum
    print("average number of letters per word is " + str(round(avg_word_len,2)))
    
    return avg_word_len, avg_sentence_len

In [344]:
# now we try and see if there is a correlation between winning utterances and losing utterances
# we also include neutral stances for completion

winning_utts = []
losing_utts = []
tied_utts = []
neutral_utts = []

#iterate through each utterance
for utt in corpus.utterances:
    
    utterance = corpus.get_utterance(utt)
    if (get_winner(corpus, utterance) == 1):
        winning_utts.append(utterance)
    elif get_winner(corpus, utterance) == 0:
        tied_utts.append(utterance)
    elif get_winner(corpus, utterance) == -1:
        losing_utts.append(utterance)
    else:
        neutral_utts.append(utterance)
        
print(len(winning_utts))
print(len(losing_utts))
print(len(tied_utts))
print(len(neutral_utts))

print("winning stats:")
get_utt_averages(winning_utts)

print("\nlosing stats")
get_utt_averages(losing_utts)

print("\ntied stats")
get_utt_averages(tied_utts)

print("\nneutral stats")
get_utt_averages(neutral_utts)

6845
6641
329
12747
winning stats:
average number of words per utterance is 98.72
average number of words per sentence is 15.83
average number of letters per word is 4.68

losing stats
average number of words per utterance is 98.28
average number of words per sentence is 15.81
average number of letters per word is 4.68

tied stats
average number of words per utterance is 129.98
average number of words per sentence is 18.19
average number of letters per word is 4.66

neutral stats
average number of words per utterance is 40.76
average number of words per sentence is 12.08
average number of letters per word is 4.66


(4.655070466452887, 12.078504974207812)