# Explore Dataset
## IQ2 Debates
*Marianne Aubin Le Quere and Lucas Van Bramer*

We are seeking to explore the basic tenets of our dataset. We will later be exploring complexity of language, so we will focus on something that comes quite close to this: word and sentence length.

## 1. Get averages
First, we want to gather some basic average facts:
  * What is the average word length in a debate?
  * What is the average sentence length in a debate?
  * What is the average utterance length in a debate?
  * What is the variability of word, sentence, or utterance length depending on the segment of a debate?
  * What is the variability of word, sentence, or utterance length depending on the speaker type of an utterance?

In [150]:
# import required modules and set up environment
import json
import os
import re

# replace file path below with your own local convokit
os.chdir('/Users/marianneaubin/Documents/Classes/CS6742/Cornell-Conversational-Analysis-Toolkit')
import convokit

In [151]:
# open created IQ2 corpus
corpus = convokit.Corpus(filename='datasets/iq2_corpus/iq2_corpus')

In [152]:
# print basic info about the corpus
corpus.print_summary_stats()

Number of Users: 471
Number of Utterances: 26562
Number of Conversations: 108


In [153]:
import re

# for each utterance, calculate how many words and sentences are in the utterance.
utter_ids = corpus.get_utterance_ids()
word_counts = []
sentence_counts = []
num_sentences = 0
for utt_id in utter_ids:
    utt = corpus.get_utterance(utt_id)
    
    #we simply use spaces to delineate words
    words = utt.text.split()
    word_count = len(words)
    word_counts.append(word_count)
    
    #we use regex to separate sentences
    sentences = re.split(r'[.!?]+', utt.text)
    sentences = list(filter(None, sentences))
    for sentence in sentences:
        words_in_sentence = sentence.split()
        words_in_sentence_count = len(words_in_sentence)
        sentence_counts.append(words_in_sentence_count)
        num_sentences = num_sentences + 1;
        
# get average word count per utterance
word_len_sum = sum(word_counts)
utt_num = len(list(corpus.iter_utterances()))
avg_word_len = word_len_sum/utt_num
print("average number of words per utterance is " + str(round(avg_word_len,2)))

# get average word count per sentence
word_sent_len_sum = sum(sentence_counts)
avg_sentence_len = word_sent_len_sum/num_sentences
print("average number of words per sentence is " + str(round(avg_sentence_len, 2)))

average number of words per utterance is 71.18
average number of words per sentence is 14.61


In [154]:
# function get_averages
# this function will count the averages of a number of words for a specific scenario
#    inputs: list of utterances, instance of corpus
#    outputs: average number of words per utterance, average number of words per sentence

def get_averages(utt_list, corp): 
    word_counts = []
    sentence_counts = []
    num_sentences = 0
    for utt_id in utt_list:
        utt = corp.get_utterance(utt_id)
    
        #we simply use spaces to delineate words
        words = utt.text.split()
        word_count = len(words)
        word_counts.append(word_count)
        
        #we use regex to separate sentences
        sentences = re.split(r'[.!?]+', utt.text)
        sentences = list(filter(None, sentences))
        for sentence in sentences:
            words_in_sentence = sentence.split()
            words_in_sentence_count = len(words_in_sentence)
            sentence_counts.append(words_in_sentence_count)
            num_sentences = num_sentences + 1;
        
    # get average word count per utterance
    word_len_sum = sum(word_counts)
    utt_num = len(list(corp.iter_utterances()))
    avg_word_len = word_len_sum/utt_num
    print("average number of words per utterance is " + str(round(avg_word_len,2)))

    # get average word count per sentence
    word_sent_len_sum = sum(sentence_counts)
    avg_sentence_len = word_sent_len_sum/num_sentences
    print("average number of words per sentence is " + str(round(avg_sentence_len, 2)))
    
    return avg_word_len, avg_sentence_len

In [160]:
# function get_segment_utterances
# this function takes in a whole corpus and will return a list of utterances
# only for the segment specified
#    inputs: corpus, desired segment
#    outputs: list of utterances in a segment

def get_segment_utterances(corp, seg):
    seg_utter_ids = []
    utter_ids = corp.get_utterance_ids()
    for utt_id in utter_ids:
        segment = corp.get_utterance(utt_id).meta['segment']
        if segment == seg:
            seg_utter_ids.append(utt_id)
    return seg_utter_ids;

In [172]:
print("overall segment stats:")
avg_word_len, avg_sentence_len = get_averages(corpus.get_utterance_ids(), corpus)

print("\nintro segment stats:")
#get intro segment id
seg_utt_1 = get_segment_utterances(corpus, 0)
avg_word_len_1, avg_sentence_len_1 = get_averages(seg_utt_1, corpus)

print("\ndiscussion segment stats:")
#get discussion segment id
seg_utt_2 = get_segment_utterances(corpus, 1)
avg_word_len_2, avg_sentence_len_2 = get_averages(seg_utt_2, corpus)

print("\nconclusion segment stats:")
#get conclusion segment id
seg_utt_3 = get_segment_utterances(corpus, 2)
avg_word_len_3, avg_sentence_len_3 = get_averages(seg_utt_3, corpus)

overall segment stats:
average number of words per utterance is 71.18
average number of words per sentence is 14.61

intro segment stats:
average number of words per utterance is 28.72
average number of words per sentence is 15.63

discussion segment stats:
average number of words per utterance is 32.23
average number of words per sentence is 13.73

conclusion segment stats:
average number of words per utterance is 10.24
average number of words per sentence is 14.9


In [201]:
# function: get_stance
# function that will yield whether a given utterance was from a 'for' 'against' or neutral side
#    inputs: utterance
#    outputs: 0 for neutral, 1 for 'for', -1 for 'against'

def get_stance(utterance):
        
    stance = utterance.user.meta['stance']
    
    if stance == 'for':
        return 1
    elif stance == 'against':
        return -1
    else:
        return 0

In [212]:
# function: get_winner
# function that will yield whether a given utterance was from a winner or from a loser
#    inputs: utterance
#    outputs: 0 for neutral, 1 for winner, -1 for loser

def get_winner(corp, utterance):
    
    print(utterance)
    print(utterance.meta)
    
    return

In [216]:
# now we try and see if there is a correlation between winning utterances and losing utterances
# we also include neutral stances for completion

#start with the for side
print("number of for iterances")

print(corpus.get_utterance('789'))
print(get_stance(corpus.get_utterance('789')))
print(get_winner(corpus, corpus.get_utterance('789')))
print(corpus.conversations.

#print(len(get_stance(corpus, 'for')))
#avg_num_words_utt_win, avg_num_words_sent_win = get_stance(corpus, 'for')

number of for iterances
Utterance({'id': '789', 'user': User([('name', 'Christopher Hitchens')]), 'root': '671', 'reply_to': 788, 'timestamp': None, 'text': 'What kind of foolishness is this— “Freedom of expression must include the license to offend”', 'meta': {'nontext': {}, 'segment': 1, 'speakertype': 'for', 'debateid': 'FreedomOfExpression-101806'}})
User([('name', 'Christopher Hitchens')])
1
Utterance({'id': '789', 'user': User([('name', 'Christopher Hitchens')]), 'root': '671', 'reply_to': 788, 'timestamp': None, 'text': 'What kind of foolishness is this— “Freedom of expression must include the license to offend”', 'meta': {'nontext': {}, 'segment': 1, 'speakertype': 'for', 'debateid': 'FreedomOfExpression-101806'}})
{'nontext': {}, 'segment': 1, 'speakertype': 'for', 'debateid': 'FreedomOfExpression-101806'}
None


AttributeError: 'dict' object has no attribute 'get_utterance'