In [1]:
import pandas as pd
import random
import textwrap

In [2]:
# Sentence templates, grouped by topic category.
# Each category maps to a list of templates; `{name}` markers are placeholders
# that are filled in later from the placeholder value table. Every template is
# written as adjacent string literals (one per sentence) that Python joins into
# a single string, so the stored text is one continuous paragraph.
categories = {
    "Urban Life": [
        (
            "As {city} continues to expand, debates arise over how to balance rapid urbanization with sustainability. "
            "The recent rise in {transportation} usage has reshaped how people move, yet issues like {problem} remain unresolved. "
            "Experts argue that innovative city planning must incorporate {urban_solution} to create a livable future."
        ),
        (
            "Public transit systems in cities like {city} have undergone rapid changes due to {factor}. "
            "While ridership increases, concerns over {issue} persist. "
            "Planners suggest that future urban mobility should integrate {solution} to address the challenges posed by {trend}."
        ),
    ],
    "Cultural Trends": [
        (
            "Generational shifts are transforming societal expectations in unexpected ways. "
            "{generation} is redefining {social_aspect}, moving away from traditional norms and embracing {modern_trend}. "
            "This cultural evolution is particularly evident in {area}, where trends such as {example} reflect deeper societal changes."
        ),
        (
            "Social movements like {movement} have significantly shaped public discourse, influencing {sector}. "
            "The rise of {phenomenon} has fueled debates on {issue}, leading to new cultural narratives that redefine {aspect} in contemporary society."
        ),
    ],
    "Education": [
        (
            "With new learning technologies emerging, educators are adapting curricula to meet the demands of {industry}. "
            "Research suggests that {teaching_method} enhances student engagement, yet critics argue that traditional approaches still hold value. "
            "The ongoing debate over {education_topic} highlights the evolving nature of academic institutions."
        ),
        (
            "The integration of {technology} in classrooms has transformed the way students interact with course material. "
            "While some argue that {method} fosters greater independence, others warn of potential drawbacks, including {concern}. "
            "Future policies must strike a balance between {factor} and {factor}."
        ),
    ],
    "Work & Labor Trends": [
        (
            "The increasing shift toward remote work is reshaping corporate structures. "
            "Companies in {sector} have adopted {work_model} to retain talent, yet concerns about {challenge} persist. "
            "Some experts argue that hybrid work environments could be the key to addressing {problem}, ensuring both productivity and employee satisfaction."
        ),
        (
            "Automation in {industry} has sparked debates about job displacement. "
            "While some fear widespread layoffs, others believe that {trend} will create opportunities in {new_field}. "
            "The challenge lies in equipping the workforce with {skills} to remain competitive in a rapidly evolving labor market."
        ),
    ],
    "Consumer Behavior": [
        (
            "The rise of digital shopping has changed consumer expectations. "
            "Studies show that {demographic} prioritize {factor} when making purchasing decisions. "
            "With companies focusing on {strategy}, traditional retail faces pressure to innovate and align with evolving digital habits."
        ),
        (
            "Subscription-based models have surged in popularity across industries such as {industry}. "
            "Consumers increasingly value {factor} over ownership, driving companies to rethink their {business_model}. "
            "This shift reflects deeper changes in {consumer_trend}."
        ),
    ],
    "Technology & Digital Life": [
        (
            "The rapid evolution of social media platforms has changed the way people interact with information. "
            "Concerns about {issue} continue to grow, especially as algorithms increasingly influence {aspect}. "
            "Experts suggest that transparency and user autonomy could help mitigate the negative effects of {technology_trend}."
        ),
        (
            "AI ethics debates have intensified as companies implement {technology} in decision-making processes. "
            "While advocates argue that {benefit} could revolutionize {industry}, critics warn of {risk}. "
            "Policymakers must address these ethical dilemmas to ensure responsible AI deployment."
        ),
    ],
    "Journalism": [
        (
            "News consumption patterns are shifting dramatically as audiences turn to {platform} for information. "
            "While this trend has increased accessibility, concerns over {challenge} remain. "
            "Media outlets must adapt their strategies to maintain credibility while keeping up with changing digital habits."
        ),
        (
            "The decline of traditional journalism is often attributed to {factor}. "
            "As reporting moves online, the challenge lies in balancing {aspect} with {aspect}, ensuring that journalistic integrity is preserved in an era of rapid information dissemination."
        ),
    ],
}

In [3]:
# Sample values for placeholders.
# Maps each placeholder name used in the sentence templates to the list of
# phrases it may be replaced with.
#
# Every key must appear exactly once: a dict literal silently keeps only the
# LAST value for a repeated key, which previously discarded the first lists
# written for "problem", "factor", "trend", "challenge" and "technology".
# Those lists are merged here so every phrase is available again.
# Placeholder names are shared across categories, so a template in any
# category can draw any phrase listed under the name it uses.
sample_values = {
    "city": ["New York", "London", "Tokyo", "Mexico City", "Berlin"],
    "transportation": ["subways", "electric buses", "bike-sharing systems"],
    "problem": [
        "overcrowding",
        "lack of accessibility",
        "high costs",
        "collaboration",
        "communication barriers",
    ],
    "urban_solution": ["smart city technologies", "sustainable infrastructure"],
    "factor": [
        "environmental concerns",
        "economic shifts",
        "technological advancements",
        "sustainability",
        "affordability",
    ],
    "issue": ["data privacy", "content moderation", "digital addiction"],
    "solution": ["public-private partnerships", "automated transit solutions"],
    "trend": [
        "urban migration",
        "remote work culture",
        "machine learning automation",
        "data-driven hiring",
    ],
    "generation": ["Gen Z", "Millennials", "Boomers"],
    "social_aspect": ["family structures", "work ethics"],
    "modern_trend": ["gig economy jobs", "minimalist lifestyles"],
    "area": ["fashion", "media", "education"],
    "example": ["sustainable fashion", "short-form video content"],
    "movement": ["climate activism", "labor unions"],
    "sector": ["corporate policies", "education", "public policy"],
    "phenomenon": ["cancel culture", "online activism"],
    "aspect": ["identity politics", "work-life balance"],
    "industry": ["tech", "finance", "manufacturing"],
    "teaching_method": ["flipped classrooms", "AI-powered tutoring"],
    "education_topic": ["standardized testing", "critical thinking in education"],
    "technology": [
        "AI-driven learning platforms",
        "virtual reality tools",
        "predictive analytics",
        "automated news writing",
    ],
    "method": ["self-paced online courses", "blended learning"],
    "concern": ["lack of personal interaction", "over-reliance on automation"],
    "work_model": ["hybrid schedules", "fully remote teams"],
    "challenge": [
        "employee burnout",
        "team cohesion",
        "misinformation",
        "algorithmic bias",
    ],
    "new_field": ["cybersecurity", "green energy"],
    "skills": ["AI literacy", "digital marketing expertise"],
    "demographic": ["young professionals", "digital natives"],
    "strategy": ["personalized recommendations", "AI-driven marketing"],
    "platform": ["social media", "mobile apps"],
    "technology_trend": ["AI-generated content", "deepfake journalism"],
    "benefit": ["efficiency", "cost reduction"],
    "risk": ["job displacement", "ethical dilemmas"],
    "business_model": ["pricing structures", "customer engagement strategies"],
    # Used by the second "Consumer Behavior" template; without this entry the
    # literal text "{consumer_trend}" leaked into the generated sentences.
    "consumer_trend": ["spending habits", "brand loyalty", "on-demand consumption"],
}

In [4]:
# Generate sentences
num_sentences = 5000
RANDOM_SEED = 42  # fixed seed so a fresh kernel regenerates the same dataset
MAX_SENTENCE_CHARS = 700  # upper bound on the length of a stored sentence


def fill_placeholders(template, values_by_key):
    """Return `template` with every known `{key}` placeholder filled in.

    Args:
        template: Text containing `{key}` markers.
        values_by_key: Mapping from placeholder name to the list of phrases
            that may replace it.

    Returns:
        The template with each occurrence of a known placeholder replaced by a
        randomly chosen phrase. Placeholders whose name is not in
        `values_by_key` are left untouched.

    Each occurrence is filled separately. When a placeholder appears several
    times in one template (e.g. "between {factor} and {factor}"), distinct
    phrases are drawn as long as enough are available, so the sentence does
    not read "between X and X".
    """
    filled = template
    for key, values in values_by_key.items():
        placeholder = "{" + key + "}"
        occurrences = filled.count(placeholder)
        if occurrences == 0:
            continue
        if occurrences <= len(values):
            # Sampling without replacement guarantees distinct phrases.
            picks = random.sample(values, occurrences)
        else:
            # More occurrences than phrases: repeats are unavoidable.
            picks = random.choices(values, k=occurrences)
        for pick in picks:
            # count=1 so each occurrence receives its own phrase.
            filled = filled.replace(placeholder, pick, 1)
    return filled


random.seed(RANDOM_SEED)
category_names = list(categories)  # built once instead of on every iteration
data = []

for i in range(num_sentences):
    category = random.choice(category_names)
    sentence_template = random.choice(categories[category])
    sentence = fill_placeholders(sentence_template, sample_values)

    # textwrap.shorten collapses whitespace and truncates (appending "...")
    # only when the text exceeds the width. It cannot lengthen a sentence, so
    # this enforces a maximum length, not a minimum.
    sentence = textwrap.shorten(sentence, width=MAX_SENTENCE_CHARS, placeholder="...")

    data.append([i + 1, sentence, category])

In [5]:
# Output location for the generated dataset
file_name = "synthetic_societal_sentences.csv"

# Tabulate the generated rows (one row per sentence) and write them to disk
# without the DataFrame's positional index column.
df = pd.DataFrame(data=data, columns=["id", "sentence", "category"])
df.to_csv(path_or_buf=file_name, index=False)

print(f"Dataset created with {len(df)} sentences. Saved as '{file_name}'.")


Dataset created with 5000 sentences. Saved as 'synthetic_societal_sentences.csv'.
