# Generating TruthyNet Social Media Post Features

In [None]:
import numpy as np
import pandas as pd


def generate_truthy_data(n_samples=2000, true_ratio=0.6, label_noise=0.1, random_state=42):
    """
    Generate a synthetic 'truthy vs non-truthy' social media dataset.

    Each row first receives an "intended" label drawn from
    Bernoulli(true_ratio). Its features are then sampled from
    distributions that depend on that intended label, and finally each
    label is flipped independently with probability ``label_noise``.
    Features therefore always follow the pre-noise label, which is what
    makes the task imperfectly learnable.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    true_ratio : float
        Probability in [0, 1] that a row's intended (pre-noise) label is 1.
        The realised fraction of true rows varies around this value.
    label_noise : float
        Probability in [0, 1] of flipping each label after the features
        have been generated.
    random_state : int
        Seed passed to ``numpy.random.default_rng`` for reproducibility.

    Returns
    -------
    df : pandas.DataFrame
        One row per post with the columns ``post_id`` (1..n_samples),
        ``source_credibility``, ``has_citation``, ``emotional_tone``,
        ``all_caps_ratio``, ``exclamation_count``, ``reading_level``,
        ``user_past_accuracy`` and ``label_is_true`` (the noisy label).

    Raises
    ------
    ValueError
        If ``true_ratio`` or ``label_noise`` is not within [0, 1]
        (NaN is rejected as well).
    """
    # Reject out-of-range probabilities up front. Without this check an
    # invalid label_noise is silently accepted: the `<` comparison below
    # simply flips no labels (negative / NaN) or all of them (> 1).
    for name, value in (("true_ratio", true_ratio), ("label_noise", label_noise)):
        if not 0.0 <= value <= 1.0:
            raise ValueError(
                f"{name} must be a probability in [0, 1], got {value!r}"
            )

    rng = np.random.default_rng(random_state)

    # Start with "intended" labels (before noise)
    base_labels = rng.binomial(1, true_ratio, size=n_samples)

    source_credibility = np.empty(n_samples)
    has_citation = np.empty(n_samples, dtype=int)
    emotional_tone = np.empty(n_samples)
    all_caps_ratio = np.empty(n_samples)
    exclamation_count = np.empty(n_samples, dtype=int)
    reading_level = np.empty(n_samples)
    user_past_accuracy = np.empty(n_samples)

    # NOTE: rows are filled one at a time, and the draws within a row happen
    # in a fixed order, so that a given seed always consumes the random
    # stream in the same sequence. Reordering or vectorising these draws
    # would change the generated values for an existing random_state.
    for i, is_true in enumerate(base_labels):
        if is_true == 1:
            # TRUE POSTS: calm, cited, credible, more accurate users
            source_credibility[i] = rng.beta(5, 1.5)          # skewed high (mean ~0.77)
            has_citation[i] = rng.binomial(1, 0.8)            # cited 80% of the time
            emotional_tone[i] = rng.beta(2, 5)                # mostly low (mean ~0.29)
            all_caps_ratio[i] = rng.beta(1.2, 8)              # close to 0 (mean ~0.13)
            exclamation_count[i] = min(rng.poisson(0.3), 5)   # mostly 0-1, capped at 5
            reading_level[i] = np.clip(rng.normal(8, 1.2), 4, 10)
            user_past_accuracy[i] = rng.beta(5, 1.5)          # usually high (mean ~0.77)
        else:
            # FALSE / MISLEADING POSTS: emotional, shouty, less credible
            source_credibility[i] = rng.beta(1.5, 4.5)        # skewed low (mean 0.25)
            has_citation[i] = rng.binomial(1, 0.2)            # cited 20% of the time
            emotional_tone[i] = rng.beta(4, 2)                # higher emotion (mean ~0.67)
            all_caps_ratio[i] = rng.beta(3, 3)                # symmetric around 0.5
            exclamation_count[i] = min(rng.poisson(1.5), 10)  # more marks, capped at 10
            reading_level[i] = np.clip(rng.normal(5.5, 1.5), 1, 10)
            user_past_accuracy[i] = rng.beta(2, 3.5)          # usually lower (mean ~0.36)

    # Add label noise to keep it realistic (a model can't be perfect):
    # each label is flipped independently with probability label_noise.
    noisy_labels = base_labels.copy()
    flip_mask = rng.random(n_samples) < label_noise
    noisy_labels[flip_mask] = 1 - noisy_labels[flip_mask]

    df = pd.DataFrame({
        "post_id": np.arange(1, n_samples + 1),
        "source_credibility": source_credibility,
        "has_citation": has_citation,
        "emotional_tone": emotional_tone,
        "all_caps_ratio": all_caps_ratio,
        "exclamation_count": exclamation_count,
        "reading_level": reading_level,
        "user_past_accuracy": user_past_accuracy,
        "label_is_true": noisy_labels,
    })

    return df




Wrote social_truthy_dataset.csv with 2000 rows.


In [3]:
# Generate the baseline dataset (60% intended-true rows, 10% label noise)
# and save it to CSV without the DataFrame index column.
df = generate_truthy_data(n_samples=2000, true_ratio=0.6, label_noise=0.1, random_state=67)
df.to_csv("social_truthy_dataset.csv", index=False)

# Report how many rows were written.
print("Wrote social_truthy_dataset.csv with", len(df), "rows.")

Wrote social_truthy_dataset.csv with 2000 rows.


In [2]:
# Generate a harder variant: only 10% intended-true rows and 20% label noise,
# using the same seed as the baseline dataset, then save it to CSV without
# the DataFrame index column.
df = generate_truthy_data(n_samples=2000, true_ratio=0.1, label_noise=0.2, random_state=67)
df.to_csv("less_truthy_dataset.csv", index=False)

# Report how many rows were written.
print("Wrote less_truthy_dataset.csv with", len(df), "rows.")

Wrote less_truthy_dataset.csv with 2000 rows.
