## Imports

In [16]:
# general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# syntactic specific imports
from nltk import pos_tag
from nltk.corpus import treebank
from nltk.tree import ParentedTree
# fetch the NLTK resources for the perceptron POS tagger and the treebank corpus
# (the first call is silenced with quiet=True; the second prints its download log)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('treebank')

import spacy
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook -- confirm it is still needed
nlp = spacy.load("en_core_web_sm") # pre-trained English model

import stanza
# download the English Stanza resources, then build the pipeline that
# create_parented_tree() uses to obtain constituency parses
stanza.download("en")
stanza_parser = stanza.Pipeline("en", processors="tokenize,pos,constituency")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 42.4MB/s]                    
2026-01-29 14:45:37 INFO: Downloaded file to /Users/nickvick/stanza_resources/resources.json
2026-01-29 14:45:37 INFO: Downloading default packages for language: en (English) ...
2026-01-29 14:45:38 INFO: File exists: /Users/nickvick/stanza_resources/en/default.zip
2026-01-29 14:45:39 INFO: Finished downloading models and saved to /Users/nickvick/stanza_resources
2026-01-29 14:45:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 38.6MB/s]    

In [17]:
# set up for src imports
import sys
import os

# add project root to sys.path (so src/ can be imported)
# ".." is resolved against the current working directory, so this assumes the
# kernel was started from the notebook's own folder
project_root = os.path.abspath("..")  # adjust if notebooks are nested deeper
if project_root not in sys.path:
    sys.path.append(project_root)

# import required functions
# (this import relies on the sys.path update above having run first)
from src.data_preprocessing import corpus_to_df, syntactic_preprocessing_df, is_complete_sentence

## Global Variables and Helper Functions

In [18]:
def create_parented_tree(sentence):
    '''Parse a sentence and return its constituency parse as an NLTK ParentedTree.

    The text is run through the module-level `stanza_parser` pipeline. Only the
    first sentence of the parsed document is used: its constituency tree is
    converted to a string and re-read with `ParentedTree.fromstring`.

    No completeness check is performed here; callers that need one must run
    `is_complete_sentence` themselves.
    '''
    # string form of the first parsed sentence's constituency tree
    bracketed = str(stanza_parser(sentence).sentences[0].constituency)
    return ParentedTree.fromstring(bracketed)

## Syntactic Analysis Functions

In [19]:
def count_t_units(sentence):
    '''Estimate the number of t-units in a sentence from its constituency parse.

    A sentence that fails `is_complete_sentence` is treated as a fragment and
    yields 0 without being parsed. Otherwise every subtree of the parse from
    `create_parented_tree` is visited:

    * each "S" node whose parent is not labelled "SBAR" adds one
      (an "S" directly under "SBAR" is treated as subordinate and ignored);
    * each "TO" node subtracts one, to cancel the occurrences where "to"
      would otherwise be counted as a new subject.

    If the resulting total is greater than one, it is reduced by one to ignore
    the duplicated "S" below the root.
    '''
    # fragments contain no t-units
    if not is_complete_sentence(sentence):
        return 0

    # constituency parse with parent links
    tree = create_parented_tree(sentence)

    independent_clauses = 0
    to_markers = 0
    for node in tree.subtrees():
        tag = node.label()
        if tag == "TO":
            to_markers += 1
        elif tag == "S" and node.parent().label() != "SBAR":
            independent_clauses += 1

    total = independent_clauses - to_markers

    # with more than one t-unit, drop the extra "S" found below the root
    return total - 1 if total > 1 else total

In [20]:
def count_clauses(sentence):
    '''Count the clauses in a sentence (not implemented yet).

    NOTE(review): this is an unfinished stub. It initialises a counter but
    never updates or returns it, so every call currently returns None.
    '''

    # TODO: implement the clause count and return it
    clause_count = 0




_IncompleteInputError: incomplete input (3022068186.py, line 1)

In [None]:
def t_unit_length(sentence):
    '''Computes the t-unit length (i.e., the number of words divided by number of t-units)

    Words are counted as whitespace-separated tokens of `sentence`; the number
    of t-units comes from `count_t_units`.

    Returns:
        float: words per t-unit, or 0.0 when the sentence has no t-units
        (e.g. a fragment), so the value can still be summed and averaged.
    '''
    t_units = count_t_units(sentence)

    # no t-units (or a non-positive count from the "TO" correction): avoid
    # dividing by zero and report a length of zero instead
    if t_units <= 0:
        return 0.0

    return len(sentence.split()) / t_units

In [5]:
def fragment_ratio(text):
    '''Return the share of sentences in `text` that are fragments.

    The text is split with `split_sentences`; a sentence counts as a fragment
    when `is_complete_sentence` rejects it. Returns None when the split yields
    no sentences, otherwise fragments / total as a float.
    '''
    # NOTE(review): split_sentences is not defined or imported in the cells
    # visible in this notebook -- confirm where it comes from
    pieces = split_sentences(text)
    if len(pieces) == 0:
        return None

    # number of sentences accepted as complete
    complete_count = sum(1 for piece in pieces if is_complete_sentence(piece))

    return (len(pieces) - complete_count) / len(pieces)

In [6]:
def _mean_or_nan(values):
    '''Return the mean of the non-None entries of `values`, or NaN if there are none.'''
    present = [v for v in values if v is not None]
    if not present:
        return float("nan")
    return sum(present) / len(present)


def _ratio_or_nan(numerators, denominators):
    '''Return sum(numerators) / sum(denominators) over pairs where both are set, or NaN.'''
    pairs = [
        (num, den)
        for num, den in zip(numerators, denominators)
        if num is not None and den is not None
    ]
    denominator_total = sum(den for _, den in pairs)
    if denominator_total == 0:
        return float("nan")
    return sum(num for num, _ in pairs) / denominator_total


def compute_syntactic_vals(df):
    '''Function to compute the syntactic metrics for each utterance in a dataframe.

    Each entry of df["final"] is iterated as a collection of sentences. For
    every utterance three values are computed and stored in new columns:

    * "avg_t_units": mean of `count_t_units` over the sentences.
    * "clause_to_t_unit_ratio": total `count_clauses` divided by total
      `count_t_units` over the sentences.
    * "avg_t_unit_length": mean of `t_unit_length` over the sentences.

    A value is NaN when it cannot be computed: the utterance has no sentences,
    the t-unit total is zero, or the underlying metric returned None (e.g. a
    helper that is not implemented yet), in which case those entries are skipped.

    The columns are added to `df` in place and the same dataframe is returned.
    '''

    avg_t_units_list = []
    clause_t_unit_ratio_list = []
    avg_t_unit_length_list = []

    for utterance_sentences in tqdm(df["final"]):
        # list of values for each sentence
        t_units_per_sent = [count_t_units(s) for s in utterance_sentences]
        clauses_per_sent = [count_clauses(s) for s in utterance_sentences]
        t_unit_lengths_per_sent = [t_unit_length(s) for s in utterance_sentences]

        # store values for the current utterance; the helpers return NaN
        # instead of raising on empty utterances or missing metric values
        avg_t_units_list.append(_mean_or_nan(t_units_per_sent))
        clause_t_unit_ratio_list.append(_ratio_or_nan(clauses_per_sent, t_units_per_sent))
        avg_t_unit_length_list.append(_mean_or_nan(t_unit_lengths_per_sent))

    # store all values in dataframe
    df["avg_t_units"] = avg_t_units_list
    df["clause_to_t_unit_ratio"] = clause_t_unit_ratio_list
    df["avg_t_unit_length"] = avg_t_unit_length_list

    return df

## Function Testing

## Data Analysis

## Visualizations