## Quantifying Syntactic Quality

To quantify syntactic quality, I will rely on the following metrics, the rationale for which can be found in my thesis report:
<br>

1. Average number of T-units per sentence
2. Ratio of clauses to T-units
3. Average T-unit length
4. Fragment ratio

Together, these metrics capture run-on frequency, clausal complexity, and sentence-fragment frequency.

## Package Imports

In [4]:
# general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download
from tqdm import tqdm
import nltk

# syntactic specific imports
from nltk import pos_tag
from nltk.corpus import treebank
# NOTE(review): wildcard import - brings every public name of nltk.tree into
# the notebook namespace; consider importing only the names actually used
from nltk.tree import *
# fetch NLTK resources (presumably the tagger model is what pos_tag relies on - confirm);
# only the first call is silenced with quiet=True, so the second one logs to the cell output
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('treebank')

import spacy
nlp = spacy.load("en_core_web_sm") # pre-trained English model

import stanza
# download the default English stanza package, then build a pipeline that runs
# tokenization, POS tagging and constituency parsing
stanza.download("en")
stanza_parser = stanza.Pipeline("en", processors="tokenize,pos,constituency")

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/nickvick/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 436kB [00:00, 176MB/s]                     
2026-01-26 16:09:59 INFO: Downloaded file to /Users/nickvick/stanza_resources/resources.json
2026-01-26 16:09:59 INFO: Downloading default packages for language: en (English) ...
2026-01-26 16:10:00 INFO: File exists: /Users/nickvick/stanza_resources/en/default.zip
2026-01-26 16:10:01 INFO: Finished downloading models and saved to /Users/nickvick/stanza_resources
2026-01-26 16:10:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/

In [35]:
# sanity check: show the working directory, since the relative paths used
# elsewhere in this notebook ("..", "../subreddit-teenagers") are resolved against it
import os
print(os.getcwd())

/Users/nickvick/Library/CloudStorage/OneDrive-PrincetonUniversity/ORFE/Thesis/ORFE-Thesis/Notebooks


In [39]:
# set up for src imports
import sys
import os

# add project root to sys.path (so src/ can be imported)
# NOTE: ".." is resolved against the current working directory, so this only
# points at the project root when the notebook runs from its own folder
project_root = os.path.abspath("..")  # adjust if notebooks are nested deeper
if project_root not in sys.path:
    sys.path.append(project_root)

# import required functions
# NOTE(review): is_complete_sentence is also defined further down in this
# notebook; once that cell runs, the notebook definition shadows this import
from src.data_preprocessing import corpus_to_df, syntactic_preprocessing_df, is_complete_sentence

## Syntactic Helper Functions

In [5]:
# Penn Treebank POS tags, grouped by the role they play in is_complete_sentence
# NOTE(review): "VB" (base form) and "VBN" (past participle) are not strictly
# finite forms in the Penn Treebank tagset - confirm they are meant to count
FINITE_VERB_TAGS = {
    "VB",
    "VBD",
    "VBN",
    "VBP",
    "VBZ",
}
# nouns (common/proper, singular/plural) and personal pronouns
SUBJECT_TAGS = {
    "NN",
    "NNS",
    "NNP",
    "NNPS",
    "PRP",
}
# "IN" is the tag for subordinating conjunctions (it is shared with prepositions)
SUBORDINATING_CONJ = {"IN"}
# "CC" is the tag for coordinating conjunctions
COORDINATING_CONJ = {"CC"}

# punctuation that may precede the first letter of a sentence
PUNCT = '?!.({[]})-–—"\''
# punctuation that may terminate a sentence
CLOSING_PUNCT = '.!?…'
# quotes/brackets that may follow the terminating punctuation
TRAILING_CLOSERS = {'"', "'", ')', ']', '}', '”', '’'}

# map curly quotes and fancy punctuation onto their plain ASCII equivalents
FANCY_TO_ASCII = {
    '“': '"',
    '”': '"',
    '‘': "'",
    '’': "'",
    '—': '-',
    '–': '-',
    '…': '...',
}

In [6]:
def split_sentences(text):
    '''Helper function to split a given post into separate sentences.

    Parameters:
        text (str): the raw text of a post

    Returns:
        list of str: the sentences found by NLTK's sentence tokenizer, in order
    '''

    # the bare name `sent_tokenize` is never imported in this notebook, so call
    # it through the already-imported `nltk` package to avoid a NameError
    return nltk.sent_tokenize(text)

In [7]:
def is_complete_sentence(sentence):
    '''Helper function to determine whether a sentence is complete (heuristic).

    A sentence is treated as complete when it follows these rules:
    -contains at least one subject (a token tagged with one of SUBJECT_TAGS)
    -contains at least one verb tagged with one of FINITE_VERB_TAGS
    -has at least two tokens
    -starts with a capital letter (leading punctuation is skipped)
    -ends with appropriate punctuation (.?!), optionally followed by closing quotes/brackets
    -if it begins with a subordinator (or "When"), has an independent clause after
    -does not end with a coordinating conjunction

    Parameters:
        sentence (str): a single sentence

    Returns:
        bool: True if every rule above holds, False otherwise
    '''

    cleaned = sentence.strip() # removing trailing/leading whitespace
    # account for differences in straight vs. smart quotes
    for fancy, ascii_char in FANCY_TO_ASCII.items():
        cleaned = cleaned.replace(fancy, ascii_char)
    # remove leading/trailing quotes
    cleaned = cleaned.strip('\"')
    cleaned = cleaned.strip('\'')

    # empty string
    if not cleaned:
        return False

    # tokenize sentence
    # NOTE(review): `tokenize` is not defined in the visible part of this notebook;
    # presumably a word-level tokenizer returning a list of strings - confirm
    tokens = tokenize(cleaned)

    # ensure length is appropriate (checked before tagging to skip needless work)
    if len(tokens) < 2:
        return False

    # tag tokens; `tags` is a list of (word, tag) pairs aligned with `tokens`
    tags = pos_tag(tokens)

    # first letter should be capital (skip any leading punctuation)
    j = 0
    while j < len(cleaned) and cleaned[j] in PUNCT:
        j += 1
    if j >= len(cleaned):
        return False
    if not cleaned[j].isalpha() or not cleaned[j].isupper():
        return False

    # last relevant char must be proper punctuation (skip closing quotes/brackets)
    i = len(cleaned) - 1
    while i > 0 and cleaned[i] in TRAILING_CLOSERS:
        i -= 1
    if i <= 0 or cleaned[i] not in CLOSING_PUNCT:
        return False

    # find the first word's tag
    first_word = None
    first_tag = None
    for word, tag in tags:
        if word.isalpha():
            first_word = word
            first_tag = tag
            break
    # if first word is subordinating conjunction (including "when"), need independent clause after
    if first_tag in SUBORDINATING_CONJ or first_word == "When":
        if ',' in tokens: # independent clause will start after a comma
            comma_index = tokens.index(',')
            post_sub_tags = tags[comma_index+1:]
            # check if independent clause is a complete thought; both the verb and
            # the subject must come from the part AFTER the comma - a subject that
            # only appears in the subordinate clause does not count
            has_finite_verb_post_sub = any(tag in FINITE_VERB_TAGS for _, tag in post_sub_tags)
            has_subject_post_sub = any(tag in SUBJECT_TAGS for _, tag in post_sub_tags)
            if not (has_finite_verb_post_sub and has_subject_post_sub):
                return False
        # if no comma separating clauses
        else:
            noun_count = sum(1 for _, tag in tags if tag in SUBJECT_TAGS)
            verb_count = sum(1 for _, tag in tags if tag in FINITE_VERB_TAGS)
            # edge case for when first word is if
            if first_word == "If" and verb_count < 2:
                return False
            # check for two nouns, if not assume fragment
            if noun_count < 2:
                return False

    # find the last word's tag
    last_tag = None
    for word, tag in reversed(tags):
        if word.isalpha():
            last_tag = tag
            break
    # last word cannot be conjunction
    if last_tag in COORDINATING_CONJ:
        return False

    # check if it has finite verb and subject
    has_finite_verb = any(tag in FINITE_VERB_TAGS for _, tag in tags)
    has_subject = any(tag in SUBJECT_TAGS for _, tag in tags)

    return has_finite_verb and has_subject

## Syntactic Analysis Functions

In [8]:
def fragment_ratio(text):
    '''Function to compute the share of a text's sentences that are fragments.

    Returns None when the text contains no sentences; otherwise the number of
    sentences that fail is_complete_sentence divided by the total number of sentences.
    '''

    sentences = split_sentences(text)
    n_sentences = len(sentences)

    # nothing to measure
    if n_sentences == 0:
        return None

    # count the complete sentences; every remaining sentence is a fragment
    n_complete = sum(1 for sent in sentences if is_complete_sentence(sent))

    return (n_sentences - n_complete) / n_sentences

## Corpus Selection

In [6]:
# build the ConvoKit Corpus from the local "../subreddit-teenagers" path
# (relative to the notebook's working directory)
corpus = Corpus(filename="../subreddit-teenagers")

## Data Analysis