## Imports

In [1]:
# general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk
import re

# syntactic specific imports
from nltk import pos_tag
from nltk.corpus import treebank
from nltk.tree import ParentedTree
# download NLTK resources: the English perceptron POS-tagger model (quietly)
# and the treebank corpus (this call logs its progress to the cell output)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('treebank')

# spaCy pipeline loaded from the small English model
import spacy
nlp = spacy.load("en_core_web_sm") # pre-trained English model

# stanza: fetch the English resources, then build a pipeline limited to
# tokenization, POS tagging and constituency parsing; create_parented_tree()
# reads the constituency parse produced by this pipeline
import stanza
stanza.download("en")
stanza_parser = stanza.Pipeline("en", processors="tokenize,pos,constituency")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 339MB/s]                     
2026-02-06 15:14:45 INFO: Downloaded file to /Users/nickvick/stanza_resources/resources.json
2026-02-06 15:14:45 INFO: Downloading default packages for language: en (English) ...
2026-02-06 15:14:46 INFO: File exists: /Users/nickvick/stanza_resources/en/default.zip
2026-02-06 15:14:48 INFO: Finished downloading models and saved to /Users/nickvick/stanza_resources
2026-02-06 15:14:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/

In [2]:
# set up for src imports
import sys
import os

# add project root to sys.path (so src/ can be imported)
# NOTE: ".." is resolved against the current working directory, so this assumes
# the kernel was started from a folder directly below the project root
project_root = os.path.abspath("..")  # adjust if notebooks are nested deeper
# guard keeps re-running this cell from appending the same path twice
if project_root not in sys.path:
    sys.path.append(project_root)

# import required functions
from src.data_preprocessing import corpus_to_df, syntactic_preprocessing_df, is_complete_sentence, clean_tokens_lexical, clean_tokens_syntactic, split_sentences

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# load the subreddit-Cornell corpus; download() supplies the value passed to Corpus as `filename`
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


## Global Variables and Helper Functions

In [4]:
def create_parented_tree(sentence):
    '''Parse a sentence with stanza and return its constituency tree as an NLTK ParentedTree.

    Only the first sentence found by the stanza pipeline is used; if the input
    is split into several sentences, the remaining ones are ignored.

    Parameters:
        sentence: text to parse (passed unchanged to ``stanza_parser``).

    Returns:
        nltk.tree.ParentedTree built from the string form of the stanza
        constituency tree, so every node can report its parent.

    Raises:
        IndexError: if the parser returns no sentences for the input
        (presumably for empty/whitespace-only text -- confirm against stanza).
    '''
    # NOTE: a completeness check used to live here but is disabled; callers are
    # expected to filter fragments with is_complete_sentence() themselves.
    #     if not is_complete_sentence(sentence):
    #         raise ValueError("Sentence is not complete")

    doc = stanza_parser(sentence)
    stanza_tree = doc.sentences[0].constituency

    # round-trip through the bracketed string form to get an NLTK tree with parent links
    return ParentedTree.fromstring(str(stanza_tree))

In [5]:
def count_t_units(sentence):
    '''Heuristically count the T-units of a sentence from its constituency parse.

    The constituency tree is walked top-down:
      * once an ``SQ``/``SBARQ`` node is seen the sentence is treated as a
        question for the rest of the walk, and every ``SQ`` node counts as one
        T-unit (an ``SQ`` nested directly under another ``SQ`` is compensated
        for after the walk);
      * otherwise every ``S`` node that is not the direct child of an ``SBAR``
        counts as one T-unit, and the first ``TO`` node cancels one count
        (the ``S`` introduced by an infinitive is not an independent clause).

    Parameters:
        sentence: the sentence text.

    Returns:
        int: 0 if ``is_complete_sentence(sentence)`` is false, otherwise a
        count that is always at least 1.
    '''
    # if a fragment, there are no t-units
    if not is_complete_sentence(sentence):
        return 0

    t_unit_count = 0
    is_question = False
    counted_s_label = False
    has_nested_sq_label = False
    parent_label = None
    to_decremented = False

    # build the constituency tree (with parent links)
    ptree = create_parented_tree(sentence)

    # iterate through the subtrees in the order ParentedTree.subtrees() yields them
    for subtree in ptree.subtrees():

        # extract relevant labels; the root has no parent, so parent_label
        # keeps its previous value (None) for it
        label = subtree.label()
        if subtree.parent():
            parent_label = subtree.parent().label()

        # flag if the sentence is a question and thus has different rules
        if label in {"SQ", "SBARQ"}:
            is_question = True
            # an S counted before the question node was found is not a
            # separate t-unit: undo that count once
            if counted_s_label:
                t_unit_count -= 1
                counted_s_label = False

        # logic if sentence is a question
        if is_question:
            if label == "SQ":
                t_unit_count += 1
                # if nested SQ label, flag so one count is removed afterwards
                if parent_label == "SQ":
                    has_nested_sq_label = True

        # logic when sentence is not a question
        else:
            # subtract one occurrence (only once) when "to" introduces a clause
            if label == "TO" and not to_decremented:
                t_unit_count -= 1
                to_decremented = True

            # check for clause (S) nodes in regular sentences
            if label == "S":
                # if the clause belongs to a subordinate clause, ignore
                if parent_label == "SBAR":
                    continue
                # otherwise increment
                counted_s_label = True
                t_unit_count += 1

    # ignore duplicated clause labels (e.g. the top-level S wrapping coordinated S nodes)
    if t_unit_count > 1 and not is_question:
        t_unit_count -= 1
    if has_nested_sq_label:
        t_unit_count -= 1

    # A complete sentence always holds at least one t-unit. Clamping covers
    # both the "no S/SQ found" case (count == 0) and over-decrementing
    # (e.g. a TO node with no counted S), which previously leaked a
    # negative count to the callers.
    return max(t_unit_count, 1)

In [6]:
def extract_t_units(sentence):
    '''Extract the text of each T-unit of a sentence from its constituency parse.

    Questions (trees containing an ``SQ`` or ``SBARQ`` node) contribute their
    ``SQ`` constituents: an ``SQ`` nested directly under another ``SQ`` is
    skipped, and an ``SQ`` with ``SQ`` children contributes those children
    instead of itself. Other sentences contribute every ``S`` constituent that
    is not the direct child of an ``SBAR``; when the tree has more than one
    ``S``, the top-most ``S`` (parent ``ROOT`` or no parent) is skipped.

    Duplicates are removed (first occurrence kept) and, when more than one
    T-unit remains, units starting with "to" are dropped as bare infinitive
    clauses.

    Parameters:
        sentence: the sentence text.

    Returns:
        list[str]: ``[]`` if ``is_complete_sentence(sentence)`` is false;
        otherwise the T-unit texts (leaves joined by single spaces), falling
        back to ``[sentence]`` when nothing could be extracted.
    '''
    # if a fragment, there are no t-units
    if not is_complete_sentence(sentence):
        return []

    # build the constituency tree and walk it once; the list keeps the
    # traversal order of ParentedTree.subtrees()
    ptree = create_parented_tree(sentence)
    subtrees = list(ptree.subtrees())
    labels = [subtree.label() for subtree in subtrees]

    is_question = any(label in {"SQ", "SBARQ"} for label in labels)

    t_units = []
    if is_question:
        for subtree in subtrees:
            if subtree.label() != "SQ":
                continue
            parent_label = subtree.parent().label() if subtree.parent() else None
            # skip nested SQ labels
            if parent_label == "SQ":
                continue
            # coordinated SQ children replace the enclosing SQ
            child_sqs = [
                child for child in subtree
                if hasattr(child, 'label') and child.label() == "SQ"
            ]
            for unit_tree in (child_sqs or [subtree]):
                t_units.append(" ".join(unit_tree.leaves()))

        # if no SQ was usable, fall back to the whole question
        if not t_units:
            t_units.append(sentence)
    else:
        # computed once instead of once per S node
        has_multiple_s = labels.count("S") > 1
        for subtree in subtrees:
            if subtree.label() != "S":
                continue
            parent_label = subtree.parent().label() if subtree.parent() else None
            # skip clauses that belong to a subordinate clause
            if parent_label == "SBAR":
                continue
            # skip the top-most S that contains everything
            if has_multiple_s and parent_label in {None, "ROOT"}:
                continue
            t_units.append(" ".join(subtree.leaves()))

    # remove duplicates while preserving order
    unique_t_units = list(dict.fromkeys(t_units))

    # drop units that are only infinitive clauses (start with "to"),
    # unless that would be judged on a single remaining t-unit
    if len(unique_t_units) > 1:
        filtered_t_units = []
        for t_unit in unique_t_units:
            words = t_unit.strip().split()
            if words and words[0].lower() == "to":
                continue
            filtered_t_units.append(t_unit)
    else:
        filtered_t_units = unique_t_units

    # heuristic: a complete sentence with nothing extracted is its own t-unit
    return filtered_t_units or [sentence]

In [7]:
def count_clauses(sentence):
    '''Count the clauses of a sentence: its T-units plus its SBAR nodes.

    Returns 0 when ``is_complete_sentence(sentence)`` is false.
    '''
    # fragments are not analysed
    if not is_complete_sentence(sentence):
        return 0

    independent = count_t_units(sentence)

    # every SBAR node in the constituency tree counts as one subordinate clause
    tree = create_parented_tree(sentence)
    subordinate = sum(1 for node in tree.subtrees() if node.label() == "SBAR")

    return subordinate + independent

In [8]:
def t_unit_length(t_unit):
    '''Return the length of a t-unit, measured as the number of tokens
    produced by clean_tokens_lexical for it.'''
    return len(clean_tokens_lexical(t_unit))

## Syntactic Analysis Functions

In [23]:
def mltu(complete_sentences):
    '''Computes the Mean Length of a T-Unit (MLTU) in a particular utterance.

    Parameters:
        complete_sentences: iterable of sentence strings; each is passed to
            extract_t_units.

    Returns:
        The mean of t_unit_length over every extracted t-unit, or np.nan when
        no t-unit is found (e.g. an empty sentence list).
    '''
    # length of every t-unit of every sentence, flattened into one list
    lengths = [
        t_unit_length(unit)
        for sent in complete_sentences
        for unit in extract_t_units(sent)
    ]

    # np.mean([]) would also give nan, but only after emitting a RuntimeWarning
    if not lengths:
        return np.nan

    return np.mean(lengths)

In [26]:
def fragment_ratio(candidate_sentences, complete_sentences):
    '''Share of the candidate sentences that are fragments, i.e. candidates
    that are not among the complete sentences. Returns np.nan when there are
    no candidates.'''
    n_candidates = len(candidate_sentences)

    # nothing to compare against
    if not n_candidates:
        return np.nan

    # fragments = candidates that did not qualify as complete sentences
    n_complete = len(complete_sentences)
    return (n_candidates - n_complete) / n_candidates

In [21]:
def avg_t_units_per_sentence(complete_sentences):
    '''Function that, given the complete sentences of an utterance, computes the
    number of t_units per sentence and returns the average across all sentences.

    Parameters:
        complete_sentences: sized collection of sentence strings.

    Returns:
        Total count_t_units over the sentences divided by the number of
        sentences, or np.nan when there are no sentences (instead of raising
        ZeroDivisionError).
    '''
    num_sentences = len(complete_sentences)

    # an utterance without complete sentences has no defined average
    if num_sentences == 0:
        return np.nan

    num_t_units = sum(count_t_units(sent) for sent in complete_sentences)

    return num_t_units / num_sentences

In [22]:
def clause_t_unit_ratio(complete_sentences):
    '''Computes the ratio of clauses to t-units in a particular utterance.

    Parameters:
        complete_sentences: iterable of sentence strings (iterated twice, so it
            must not be a one-shot iterator).

    Returns:
        Sum of count_clauses over the sentences divided by the sum of
        count_t_units, or np.nan when the t-unit total is 0 (e.g. an empty
        sentence list) instead of raising ZeroDivisionError.
    '''
    # find the total number of clauses in the utterance
    total_clauses = sum(count_clauses(sent) for sent in complete_sentences)

    # find the total number of t_units in the utterance
    total_t_units = sum(count_t_units(sent) for sent in complete_sentences)

    # the ratio is undefined without any t-unit
    if total_t_units == 0:
        return np.nan

    return total_clauses / total_t_units

## Data Analysis

In [13]:
# convert the loaded corpus with the project helper corpus_to_df (imported from src.data_preprocessing)
df = corpus_to_df(corpus)

In [28]:
def compute_syntactic_vals(df):
    '''Compute the syntactic metrics of every utterance in a dataframe.

    The frame is first passed through syntactic_preprocessing_df; the columns
    "candidate_sentences" and "complete_sentences" of its result are then used
    to fill four new columns: "fragment_ratio", "avg_t_units",
    "clause_to_t_unit_ratio" and "mltu". Utterances without candidate /
    complete sentences get np.nan for the corresponding metrics.

    Returns the preprocessed dataframe with the metric columns added.
    '''
    # derive the sentence-list columns the metrics are computed from
    df = syntactic_preprocessing_df(df)
    num_utterances = len(df)

    # first pass: fragment ratio per utterance
    sentence_pairs = zip(df["candidate_sentences"], df["complete_sentences"])
    fragment_ratio_list = [
        fragment_ratio(candidate, complete) if candidate else np.nan
        for candidate, complete in tqdm(sentence_pairs, total=num_utterances)
    ]

    # second pass: one (avg t-units, clause/t-unit ratio, mltu) triple per utterance
    metric_rows = []
    for sentences in tqdm(df["complete_sentences"], total=num_utterances):
        if sentences:
            metric_rows.append((
                avg_t_units_per_sentence(sentences),
                clause_t_unit_ratio(sentences),
                mltu(sentences),
            ))
        else:
            metric_rows.append((np.nan, np.nan, np.nan))

    # store all values in dataframe
    df["fragment_ratio"] = fragment_ratio_list
    df["avg_t_units"] = [row[0] for row in metric_rows]
    df["clause_to_t_unit_ratio"] = [row[1] for row in metric_rows]
    df["mltu"] = [row[2] for row in metric_rows]

    return df

In [None]:
# keep the (expensive) result in a named variable instead of discarding it,
# and preview only the first rows rather than dumping the whole frame
syntactic_df = compute_syntactic_vals(df)
syntactic_df.head()

  df = df[~df["raw_text"].str.contains(BOT_TEXT_RE, regex=True)]


## Visualizations