## Imports

In [1]:
# general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk
import re

# syntactic specific imports
from nltk import pos_tag
from nltk.corpus import treebank
from nltk.tree import ParentedTree
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('treebank')

import spacy
nlp = spacy.load("en_core_web_sm") # pre-trained English model

import stanza
stanza.download("en")
stanza_parser = stanza.Pipeline("en", processors="tokenize,pos,constituency")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 256MB/s]                     
2026-02-03 15:25:21 INFO: Downloaded file to /Users/nickvick/stanza_resources/resources.json
2026-02-03 15:25:21 INFO: Downloading default packages for language: en (English) ...
2026-02-03 15:25:22 INFO: File exists: /Users/nickvick/stanza_resources/en/default.zip
2026-02-03 15:25:23 INFO: Finished downloading models and saved to /Users/nickvick/stanza_resources
2026-02-03 15:25:23 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/

In [2]:
# set up for src imports
import sys
import os

# add project root to sys.path (so src/ can be imported)
project_root = os.path.abspath("..")  # adjust if notebooks are nested deeper
if project_root not in sys.path:
    sys.path.append(project_root)

# import required functions
from src.data_preprocessing import corpus_to_df, syntactic_preprocessing_df, is_complete_sentence, clean_tokens_lexical

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
corpus = Corpus(filename=download("subreddit-Cornell"))

Dataset already exists at /Users/nickvick/.convokit/saved-corpora/subreddit-Cornell


## Global Variables and Helper Functions

In [4]:
def create_parented_tree(sentence):
    '''Helper function to create a tree for a valid sentence'''

    '''if not is_complete_sentence(sentence):
        raise ValueError("Sentence is not complete")'''
    
    doc = stanza_parser(sentence)
    stanza_tree = doc.sentences[0].constituency
    parented_tree = ParentedTree.fromstring(str(stanza_tree))
    
    return parented_tree

## Syntactic Analysis Functions

In [18]:
def count_t_units(sentence):

    t_unit_count = 0
    is_question = False
    counted_s_label = False
    has_nested_sq_label = False

    # if a fragment, there are no t-units
    if not is_complete_sentence(sentence):
        return 0
    
    # create a dependency tree
    ptree = create_parented_tree(sentence)

    # iterated through parented subtrees
    for subtree in ptree.subtrees():

        # extract relevant labels
        label = subtree.label()
        if subtree.parent():
            parent_label = subtree.parent().label()

        # flag if the sentence is a question and thus has different rules
        if label in {"SQ", "SBARQ"}:
             is_question = True
             # if we've counted a preceding S label decrement
             if counted_s_label:
                 t_unit_count -= 1
                 counted_s_label = False

        # logic if sentence is a question
        if is_question:
            if label == "SQ":
                t_unit_count += 1
                # if nested SQ label, flag
                if parent_label == "SQ":
                    has_nested_sq_label = True

        # logic when sentence is not a question
        else:
            # subtract occurences when "to" is considered a new subject
            if label == "TO":
                t_unit_count -= 1
                    
            # check for subjects in regular sentences
            if label == "S":
                # if subject belongs to subordinate clause, ignore
                if parent_label == "SBAR":
                    continue
                # otherwise increment
                counted_s_label = True
                t_unit_count += 1
    
    # ignore duplicated subject labels
    if t_unit_count > 1 and not is_question:
        t_unit_count -= 1
    if has_nested_sq_label:
        t_unit_count -= 1

    return t_unit_count

In [6]:
def count_clauses(sentence):

    clause_count = 0

    # only consider complete sentences
    if not is_complete_sentence(sentence):
        return 0
    
    t_unit_count = count_t_units(sentence)

    # create a dependency tree
    ptree = create_parented_tree(sentence)
    # print(TreePrettyPrinter(ptree))

    # iterated through parented subtrees
    for subtree in ptree.subtrees():
        # if subject belongs to subordinate clause, increment 
        if subtree.label() == "SBAR":
            clause_count += 1

    return clause_count + t_unit_count

In [7]:
def t_unit_length(sentence):
    '''Computes the t-unit length (i.e., the number of words divided by number of t-units)'''

    # only consider complete sentences
    if not is_complete_sentence(sentence):
        return 0

    tokens = clean_tokens_lexical(sentence)
    num_words = len(tokens)

    num_t_units = count_t_units(sentence)

    return num_words / num_t_units

In [8]:
def fragment_ratio(text):
    '''Function to determine the ratio of fragments to lines in a given text'''

    sentences = split_sentences(text)
    total = len(sentences)
    if total == 0:
        return None

    # add complete sentences to a list
    is_complete = []
    for sent in sentences:
        if is_complete_sentence(sent):
            is_complete.append(sent)

    num_fragment = total - len(is_complete)

    fragment_ratio = num_fragment/total

    return fragment_ratio

In [14]:
def compute_syntactic_vals(df):
    '''Function to compute the syntactic metrics for each utterance in a dataframe.'''

    avg_t_units_list = []
    clause_t_unit_ratio_list = []
    avg_t_unit_length_list = []

    for utterance_sentences in tqdm(df["final"]):

        if len(utterance_sentences) == 0:
            continue

        print(utterance_sentences)

        # list of values for each sentence
        t_units_per_sent = [count_t_units(s) for s in utterance_sentences]
        clauses_per_sent = [count_clauses(s) for s in utterance_sentences]
        t_unit_lengths_per_sent = [t_unit_length(s) for s in utterance_sentences]

        if sum(t_units_per_sent) == 0:
            continue
        if sum(t_unit_lengths_per_sent) == 0:
            continue

        # average per utterance
        avg_t_units = sum(t_units_per_sent) / len(t_units_per_sent)
        avg_clause_t_unit_ratio = sum(clauses_per_sent) / sum(t_units_per_sent)
        avg_t_unit_len = sum(t_unit_lengths_per_sent) / len(t_unit_lengths_per_sent)

        # store values for the current utterance
        avg_t_units_list.append(avg_t_units)
        clause_t_unit_ratio_list.append(avg_clause_t_unit_ratio)
        avg_t_unit_length_list.append(avg_t_unit_len)

    # store all values in dataframe
    df["avg_t_units"] = avg_t_units_list
    df["clause_to_t_unit_ratio"] = clause_t_unit_ratio_list
    df["avg_t_unit_length"] = avg_t_unit_length_list

    return df

## Data Analysis

In [10]:
df = corpus_to_df(corpus)
df = syntactic_preprocessing_df(df)

  df = df[~df["text"].str.contains(BOT_TEXT_RE, regex=True)]


In [15]:
compute_syntactic_vals(df)

  0%|          | 0/65796 [00:00<?, ?it/s]

["I was just reading about the Princeton Mic-Check and it's getting [national press](URL).", 'I want to get a sense of what people felt like around campus.', 'Anything interesting happen?', 'Anything interesting coming up?']


  0%|          | 1/65796 [00:00<16:51:09,  1.08it/s]

['I have added support for Cornell to courseoff.com (URL).', 'Courseoff is a free web app to help you plan your semester schedules.', 'It is very popular with students at some of the other schools I support.', 'No signup is required to use it so feel free to try it out!', 'You can create an account which allows multiple schedules, saving schedules, and sharing schedules.', 'Let me know what you guys think!', 'Any feedback is always appreciated.', 'If you like it, tell your friends :) If you find a problem, let me know as well.']


  0%|          | 2/65796 [00:03<37:56:11,  2.08s/it]

["i don't have a facebook, so we'd need a volunteer.. just someone to let cornell on facebook know that we have a presence on reddit.. perhaps a small explanation of what reddit is?", 'now that we are almost beautiful and such.. we need more redditors!']
["so, i'm starting to mess with some of the css on our lovely subreddit.. anyone have any fun suggestions about our little envelope?", 'or up/downvote things?', 'GO NUTS.']
['Ever since SOPA put fear into the hearts of everyone that loves the internet, it looks like [The DarkNet Plan](URL) has grown by the thousands and even got [national media attention](URL).', 'What is the feasibility of doing that for our big red campus?', 'Relevant: I miss DC++']


  0%|          | 5/65796 [00:04<15:52:19,  1.15it/s]

["i'm seriously considering cornell for law school, and this intrigues me.", 'it\'s not at all a part of my decision making process, but the fact that cornell was at some point by some people known as "godless cornell" (i think i read that on wikipedia) makes me smile.', 'it seems like most of the older schools in this country are religious and cornell is non-sectarian (i believe).', 'how does this play out at the school?', "edit: to be more clear, i guess i'm wondering if the student body is especially agnostic/atheist leaning or if there's some sort of unstated distaste for religion.", "i'm agnostic atheist FWIW."]
["So I signed with Cornell as a swimmer a couple of months ago, and as a freshman-to-be I'm beginning put some thought into my future living situations.", 'As a swimmer I can choose between sharing a townhouse with three others, or taking a dorm.', 'I went on a recruit trip back in October and got to see the townhouse, but only for a little bit.', 'I was wondering if anyon

  0%|          | 7/65796 [00:06<16:23:51,  1.11it/s]

['Hey this forum!', 'I was wondering what the bars were like in Ithaca during the winter session when the undergrads are gone.', 'Is collegetown empty but the commons still alive?']


  0%|          | 8/65796 [00:07<15:40:38,  1.17it/s]

["What do you think of Cornell's big crackdown on drinking?"]


  0%|          | 8/65796 [00:07<17:59:07,  1.02it/s]


ZeroDivisionError: division by zero

## Visualizations