## Imports

In [1]:
# general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk
import re

# syntactic specific imports
from nltk import pos_tag
from nltk.corpus import treebank
from nltk.tree import ParentedTree
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('treebank')

import spacy
nlp = spacy.load("en_core_web_sm") # pre-trained English model

import stanza
stanza.download("en")
stanza_parser = stanza.Pipeline("en", processors="tokenize,pos,constituency")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 344MB/s]                     
2026-02-05 10:54:04 INFO: Downloaded file to /Users/nickvick/stanza_resources/resources.json
2026-02-05 10:54:04 INFO: Downloading default packages for language: en (English) ...
2026-02-05 10:54:05 INFO: File exists: /Users/nickvick/stanza_resources/en/default.zip
2026-02-05 10:54:06 INFO: Finished downloading models and saved to /Users/nickvick/stanza_resources
2026-02-05 10:54:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/

In [2]:
# set up for src imports
import sys
import os

# add project root to sys.path (so src/ can be imported)
project_root = os.path.abspath("..")  # adjust if notebooks are nested deeper
if project_root not in sys.path:
    sys.path.append(project_root)

# import required functions
from src.data_preprocessing import corpus_to_df, syntactic_preprocessing_df, is_complete_sentence, clean_tokens_lexical, clean_tokens_syntactic, split_sentences

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


## Global Variables and Helper Functions

In [4]:
def create_parented_tree(sentence):
    '''Helper function to create a tree for a valid sentence'''

    '''if not is_complete_sentence(sentence):
        raise ValueError("Sentence is not complete")'''
    
    doc = stanza_parser(sentence)
    stanza_tree = doc.sentences[0].constituency
    parented_tree = ParentedTree.fromstring(str(stanza_tree))
    
    return parented_tree

## Syntactic Analysis Functions

In [5]:
def count_t_units(sentence):

    t_unit_count = 0
    is_question = False
    counted_s_label = False
    has_nested_sq_label = False
    parent_label = None
    to_decremented = False
    is_sentence = is_complete_sentence(sentence)

    # if a fragment, there are no t-units
    if not is_sentence:
        return 0
    
    # create a dependency tree
    ptree = create_parented_tree(sentence)

    # iterated through parented subtrees
    for subtree in ptree.subtrees():

        # extract relevant labels
        label = subtree.label()
        if subtree.parent():
            parent_label = subtree.parent().label()

        # flag if the sentence is a question and thus has different rules
        if label in {"SQ", "SBARQ"}:
             is_question = True
             # if we've counted a preceding S label decrement
             if counted_s_label:
                t_unit_count -= 1
                counted_s_label = False

        # logic if sentence is a question
        if is_question:
            if label == "SQ":
                t_unit_count += 1
                # if nested SQ label, flag
                if parent_label == "SQ":
                    has_nested_sq_label = True

        # logic when sentence is not a question
        else:
            # subtract occurences when "to" is considered a new subject
            if label == "TO" and not to_decremented:
                t_unit_count -= 1
                to_decremented = True
                    
            # check for subjects in regular sentences
            if label == "S":
                # if subject belongs to subordinate clause, ignore
                if parent_label == "SBAR":
                    continue
                # otherwise increment
                counted_s_label = True
                t_unit_count += 1
    
    # ignore duplicated subject labels
    if t_unit_count > 1 and not is_question:
        t_unit_count -= 1
    if has_nested_sq_label:
        t_unit_count -= 1

    # adjust for inappropriate decrements
    if t_unit_count == 0:
        if to_decremented:
            t_unit_count += 1

    # heuristic for special constructions
    if t_unit_count == 0 and is_sentence:
        return 1


    return t_unit_count

In [6]:
def count_clauses(sentence):

    clause_count = 0

    # only consider complete sentences
    if not is_complete_sentence(sentence):
        return 0
    
    t_unit_count = count_t_units(sentence)

    # create a dependency tree
    ptree = create_parented_tree(sentence)
    # print(TreePrettyPrinter(ptree))

    # iterated through parented subtrees
    for subtree in ptree.subtrees():
        # if subject belongs to subordinate clause, increment 
        if subtree.label() == "SBAR":
            clause_count += 1

    return clause_count + t_unit_count

In [7]:
def t_unit_lengths(sentence):
    '''Computes the average t-unit length (i.e., the number of words divided by number of t-units)'''

    # only consider complete sentences
    if not is_complete_sentence(sentence):
        return 0

    tokens = clean_tokens_lexical(sentence)
    num_words = len(tokens)

    num_t_units = count_t_units(sentence)

    return num_words / num_t_units

In [8]:
def fragment_ratio(text):
    '''Function to determine the ratio of fragments to lines in a given text'''

    sentences = split_sentences(text)
    total = len(sentences)
    if total == 0:
        return None

    # add complete sentences to a list
    is_complete = []
    for sent in sentences:
        if is_complete_sentence(sent):
            is_complete.append(sent)

    num_fragment = total - len(is_complete)

    fragment_ratio = num_fragment/total

    return fragment_ratio

In [19]:
def compute_syntactic_vals(df):
    '''Function to compute the syntactic metrics for each utterance in a dataframe.'''

    avg_t_units_list = []
    clause_t_unit_ratio_list = []
    avg_t_unit_length_list = []

    for utterance_sentences in tqdm(df["final"]):

        # utterance level initializations
        total_t_units = 0
        total_t_unit_length = 0

        # list of values for each sentence in a given utterance
        t_units_per_sent = [count_t_units(s) for s in utterance_sentences]
        clauses_per_sent = [count_clauses(s) for s in utterance_sentences]

        # sum of values for the entire utterance
        t_units_per_utterance = sum(t_units_per_sent)
        clause_count_per_utterance = sum(clauses_per_sent)


        for s in utterance_sentences:
            lengths = t_unit_lengths(s)
            total_t_unit_length += sum(lengths)
            total_t_units += len(lengths)

        if total_t_units == 0:
            avg_t_unit_len = 0
        else:
            avg_t_unit_len = total_t_unit_length / total_t_units

            

        if t_units_per_utterance == 0 or sum(t_unit_lengths_per_sent) == 0:
            avg_t_units_list.append(0)
            clause_t_unit_ratio_list.append(0)
            avg_t_unit_length_list.append(0)
            continue

        # average per utterance
        avg_t_units = t_units_per_utterance / len(t_units_per_sent)
        avg_clause_t_unit_ratio = clause_count_per_utterance / t_units_per_utterance
        avg_t_unit_len = sum(t_unit_lengths_per_sent) / len(t_unit_lengths_per_sent)

        # store values for the current utterance
        avg_t_units_list.append(avg_t_units)
        clause_t_unit_ratio_list.append(avg_clause_t_unit_ratio)
        avg_t_unit_length_list.append(avg_t_unit_len)

    # store all values in dataframe
    df["avg_t_units"] = avg_t_units_list
    df["clause_to_t_unit_ratio"] = clause_t_unit_ratio_list
    df["avg_t_unit_length"] = avg_t_unit_length_list

    return df

## Data Analysis

In [15]:
df = corpus_to_df(corpus)
df = syntactic_preprocessing_df(df)

  df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]


In [18]:
compute_syntactic_vals(df)

  0%|          | 2/65796 [00:03<36:29:29,  2.00s/it]


ZeroDivisionError: division by zero

## Visualizations