Sentence length for synthetic datasets

## Synthetic sentiment

In [2]:
import csv

def mean_sentence_length(file_path):
    """Return the mean whitespace-delimited word count of each CSV column.

    The first row of the file is read as the header. Every later row
    contributes one value per cell to the column at the same position;
    words are counted with ``str.split()``.

    Args:
        file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        A dict mapping each header name to the mean number of words per
        cell in that column (0 for a column that received no cells).
        An empty dict is returned when the file has no header row.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        # Quoted fields may contain commas; the reader keeps them intact.
        reader = csv.reader(file, quotechar='"')

        # A completely empty file has no header: report nothing instead of
        # letting StopIteration escape.
        headers = next(reader, None)
        if headers is None:
            return {}

        total_lengths = dict.fromkeys(headers, 0)
        total_sentences = dict.fromkeys(headers, 0)

        for row in reader:
            # zip stops at the shorter sequence, so cells beyond the header
            # width are ignored rather than raising IndexError, and short
            # rows simply contribute nothing to the missing columns.
            for header, sentence in zip(headers, row):
                total_lengths[header] += len(sentence.split())
                total_sentences[header] += 1

    # Guard against division by zero for columns that never received a cell.
    return {
        header: (
            total_lengths[header] / total_sentences[header]
            if total_sentences[header] > 0
            else 0
        )
        for header in headers
    }

# Targets whose synthetic-sentence files are summarised below.
targets = [
    'abuse',
    'anxiety',
    'depression',
    'mental_health',
    'mental_illness',
    'trauma',
]

# Each input file is located at <base_path><target><file_extension>.
base_path = '../1_sentiment/synthetic/output/all-year/'
file_extension = '_synthetic_sentences.csv'

# Print the per-column mean word count for every target's file.
for target in targets:
    file_path = "".join((base_path, target, file_extension))
    mean_lengths = mean_sentence_length(file_path)
    print(f"Mean sentence lengths for {target}:")
    for column, length in mean_lengths.items():
        print(f"{column}: {length:.2f} words")


Mean sentence lengths for abuse:
baseline: 27.78 words
positive_variation: 29.84 words
negative_variation: 29.42 words
Mean sentence lengths for anxiety:
baseline: 26.76 words
positive_variation: 27.69 words
negative_variation: 27.55 words
Mean sentence lengths for depression:
baseline: 26.51 words
positive_variation: 27.95 words
negative_variation: 27.70 words
Mean sentence lengths for mental_health:
baseline: 28.25 words
positive_variation: 28.72 words
negative_variation: 28.74 words
Mean sentence lengths for mental_illness:
baseline: 27.77 words
positive_variation: 28.43 words
negative_variation: 28.51 words
Mean sentence lengths for trauma:
baseline: 27.90 words
positive_variation: 30.46 words
negative_variation: 30.05 words


## Synthetic intensity

In [4]:
import csv

def mean_sentence_length(file_path):
    """Return the mean whitespace-delimited word count of each CSV column.

    The first row of the file is read as the header. Every later row
    contributes one value per cell to the column at the same position;
    words are counted with ``str.split()``.

    Args:
        file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        A dict mapping each header name to the mean number of words per
        cell in that column (0 for a column that received no cells).
        An empty dict is returned when the file has no header row.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        # Quoted fields may contain commas; the reader keeps them intact.
        reader = csv.reader(file, quotechar='"')

        # A completely empty file has no header: report nothing instead of
        # letting StopIteration escape.
        headers = next(reader, None)
        if headers is None:
            return {}

        total_lengths = dict.fromkeys(headers, 0)
        total_sentences = dict.fromkeys(headers, 0)

        for row in reader:
            # zip stops at the shorter sequence, so cells beyond the header
            # width are ignored rather than raising IndexError, and short
            # rows simply contribute nothing to the missing columns.
            for header, sentence in zip(headers, row):
                total_lengths[header] += len(sentence.split())
                total_sentences[header] += 1

    # Guard against division by zero for columns that never received a cell.
    return {
        header: (
            total_lengths[header] / total_sentences[header]
            if total_sentences[header] > 0
            else 0
        )
        for header in headers
    }

# Targets whose synthetic-sentence files are summarised below.
targets = [
    'abuse',
    'anxiety',
    'depression',
    'mental_health',
    'mental_illness',
    'trauma',
]

# Each input file is located at <base_path><target><file_extension>.
base_path = '../3_intensity/synthetic/output/all-year/'
file_extension = '_synthetic_sentences.csv'

# Print the per-column mean word count for every target's file.
for target in targets:
    file_path = "".join((base_path, target, file_extension))
    mean_lengths = mean_sentence_length(file_path)
    print(f"Mean sentence lengths for {target}:")
    for column, length in mean_lengths.items():
        print(f"{column}: {length:.2f} words")


Mean sentence lengths for abuse:
baseline: 27.77 words
high_intensity: 29.62 words
low_intensity: 29.18 words
Mean sentence lengths for anxiety:
baseline: 26.49 words
high_intensity: 29.05 words
low_intensity: 28.13 words
Mean sentence lengths for depression:
baseline: 26.85 words
high_intensity: 29.88 words
low_intensity: 29.11 words
Mean sentence lengths for mental_health:
baseline: 28.07 words
high_intensity: 31.90 words
low_intensity: 29.45 words
Mean sentence lengths for mental_illness:
baseline: 27.82 words
high_intensity: 31.36 words
low_intensity: 28.66 words
Mean sentence lengths for trauma:
baseline: 27.73 words
high_intensity: 30.47 words
low_intensity: 29.60 words


## Synthetic breadth

In [5]:
import csv

def mean_sentence_length(file_path):
    """Return the mean whitespace-delimited word count of each CSV column.

    The first row of the file is read as the header. Every later row
    contributes one value per cell to the column at the same position;
    words are counted with ``str.split()``.

    Args:
        file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        A dict mapping each header name to the mean number of words per
        cell in that column (0 for a column that received no cells).
        An empty dict is returned when the file has no header row.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for files passed to csv.reader.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        # Quoted fields may contain commas; the reader keeps them intact.
        reader = csv.reader(file, quotechar='"')

        # A completely empty file has no header: report nothing instead of
        # letting StopIteration escape.
        headers = next(reader, None)
        if headers is None:
            return {}

        total_lengths = dict.fromkeys(headers, 0)
        total_sentences = dict.fromkeys(headers, 0)

        for row in reader:
            # zip stops at the shorter sequence, so cells beyond the header
            # width are ignored rather than raising IndexError, and short
            # rows simply contribute nothing to the missing columns.
            for header, sentence in zip(headers, row):
                total_lengths[header] += len(sentence.split())
                total_sentences[header] += 1

    # Guard against division by zero for columns that never received a cell.
    return {
        header: (
            total_lengths[header] / total_sentences[header]
            if total_sentences[header] > 0
            else 0
        )
        for header in headers
    }

# Targets whose synthetic-sentence files are summarised below.
targets = [
    'abuse',
    'anxiety',
    'depression',
    'mental_health',
    'mental_illness',
    'trauma',
]

# Each input file is located at <base_path><target><file_extension>.
base_path = '../2_breadth/synthetic/output/unique_all-year/'
file_extension = '_synthetic_sentences.csv'

# Print the per-column mean word count for every target's file.
for target in targets:
    file_path = "".join((base_path, target, file_extension))
    mean_lengths = mean_sentence_length(file_path)
    print(f"Mean sentence lengths for {target}:")
    for column, length in mean_lengths.items():
        print(f"{column}: {length:.2f} words")


Mean sentence lengths for abuse:
sentence: 27.22 words
label: 1.00 words
year: 1.00 words
Mean sentence lengths for anxiety:
sentence: 26.38 words
label: 1.00 words
year: 1.00 words
Mean sentence lengths for depression:
sentence: 26.62 words
label: 1.00 words
year: 1.00 words
Mean sentence lengths for mental_health:
sentence: 26.12 words
label: 1.00 words
year: 1.00 words
Mean sentence lengths for mental_illness:
sentence: 26.21 words
label: 1.00 words
year: 1.00 words
Mean sentence lengths for trauma:
sentence: 26.34 words
label: 1.00 words
year: 1.00 words
