In [7]:
import pandas as pd
import json
from datetime import datetime

class NewsHeadlinesGenerator:
    """Hold sample news headlines grouped by category and export them.

    The headlines live in ``self.headlines``, a dict mapping a category
    name to a list of headline strings. The export methods flatten that
    mapping into one record per headline and write it to disk as JSON or
    CSV; ``print_statistics`` reports per-category counts.
    """

    # Column order shared by the CSV export; naming it explicitly keeps the
    # header present even when there are no headlines to write.
    _CSV_COLUMNS = ('headline', 'category')

    def __init__(self):
        """Populate ``self.headlines`` with the built-in sample data."""
        self.headlines = {
            'economy': [
                "South Africa sees record-breaking employment growth in Q3",
                "JSE reaches new high as investor confidence grows",
                "Manufacturing sector shows signs of recovery",
                "Export numbers exceed expectations for second quarter",
                "Local currency strengthens against major currencies",
                "Small business confidence index rises",
                "Tourism sector reports 40% increase in international visitors",
                "Tech industry investment reaches R2 billion milestone",
                "Agricultural exports boost rural economy",
                "Mining production increases by 15% year-on-year"
            ],
            'infrastructure': [
                "Load shedding disrupts businesses across the country",
                "New solar power plant opens in Northern Cape",
                "Road rehabilitation project begins in Gauteng",
                "Water infrastructure upgrade planned for Eastern Cape",
                "High-speed rail project connects major cities",
                "Port expansion project creates 1000 jobs",
                "Municipal service delivery improves in Western Cape",
                "New bridge construction alleviates traffic congestion",
                "Telecommunications network expands to rural areas",
                "Public transport system undergoes major renovation"
            ],
            'government': [
                "New legislation proposed to address water scarcity",
                "Government launches free public WiFi initiative",
                "Cabinet approves small business support package",
                "Department of Education increases school funding",
                "New anti-corruption task force established",
                "Parliament debates economic reform bill",
                "Local government elections scheduled for next quarter",
                "Minister announces healthcare system overhaul",
                "Government introduces youth employment scheme",
                "New housing development policy implemented"
            ],
            'social': [
                "Community project improves food security",
                "Local school achieves 100% matric pass rate",
                "NGO launches skills development program",
                "Healthcare workers receive additional training",
                "Youth entrepreneurship program shows positive results",
                "Cultural festival attracts international attention",
                "Sports development initiative reaches rural areas",
                "Education technology program expands to more schools",
                "Community safety initiative reduces crime rates",
                "Social housing project completed ahead of schedule"
            ],
            'environment': [
                "Drought conditions worsen in Eastern Cape",
                "Renewable energy project exceeds targets",
                "Conservation effort saves endangered species",
                "Climate change adaptation plan launched",
                "Green technology hub creates employment",
                "Environmental cleanup project shows results",
                "Sustainable farming initiative gains traction",
                "Wind farm project nears completion",
                "Marine protection area established",
                "Air quality improves in major cities"
            ]
        }

    def _iter_headlines(self):
        """Yield ``(category, headline)`` pairs in dict/list order."""
        for category, headlines in self.headlines.items():
            for headline in headlines:
                yield category, headline

    def generate_labelstudio_json(self, filename: str = 'headlines_for_labeling.json') -> list:
        """Write one task per headline to *filename* as JSON and return the tasks.

        Each task is a dict with a ``data`` entry (``text`` and ``category``)
        and a ``meta`` entry carrying a ``timestamp``; this is the layout
        intended for import into Label Studio.

        Args:
            filename: Path of the JSON file to create or overwrite.

        Returns:
            The list of task dicts that was written (empty when there are
            no headlines).
        """
        # Take the timestamp once so every task in this export carries the
        # same value instead of drifting by microseconds between tasks.
        exported_at = datetime.now().isoformat()

        tasks = [
            {
                "data": {
                    "text": headline,
                    "category": category
                },
                # A fresh dict per task so callers can mutate one task's
                # metadata without affecting the others.
                "meta": {
                    "timestamp": exported_at
                }
            }
            for category, headline in self._iter_headlines()
        ]

        with open(filename, 'w', encoding='utf-8') as f:
            # The file is UTF-8, so keep non-ASCII characters readable
            # rather than escaping them as \uXXXX sequences.
            json.dump(tasks, f, indent=2, ensure_ascii=False)

        return tasks

    def generate_csv(self, filename: str = 'headlines_for_labeling.csv') -> pd.DataFrame:
        """Write the headlines to *filename* as CSV and return the DataFrame.

        Args:
            filename: Path of the CSV file to create or overwrite.

        Returns:
            A DataFrame with ``headline`` and ``category`` columns, one row
            per headline. The columns are present even when there are no
            headlines, so the CSV always has its header row.
        """
        rows = [
            {'headline': headline, 'category': category}
            for category, headline in self._iter_headlines()
        ]

        # Passing the columns explicitly keeps the schema stable for empty input.
        df = pd.DataFrame(rows, columns=list(self._CSV_COLUMNS))
        df.to_csv(filename, index=False)
        return df

    def print_statistics(self) -> None:
        """Print the total headline count and the count for each category."""
        total_headlines = sum(len(headlines) for headlines in self.headlines.values())
        print("\nDataset Statistics:")
        print(f"Total headlines: {total_headlines}")
        print("\nHeadlines per category:")
        for category, headlines in self.headlines.items():
            print(f"{category}: {len(headlines)}")

def main():
    """Write both export files with their default names, then report on them."""
    headline_source = NewsHeadlinesGenerator()

    # JSON export first, CSV second; both use the methods' default filenames.
    headline_source.generate_labelstudio_json()
    headline_source.generate_csv()

    headline_source.print_statistics()

    # Tell the user which files were produced and what each one is for.
    summary_lines = (
        "\nFiles generated:",
        "1. headlines_for_labeling.json - Import this into Label Studio",
        "2. headlines_for_labeling.csv - Use this for reference or alternative tools",
    )
    for line in summary_lines:
        print(line)

# Run the export only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Dataset Statistics:
Total headlines: 50

Headlines per category:
economy: 10
infrastructure: 10
government: 10
social: 10
environment: 10

Files generated:
1. headlines_for_labeling.json - Import this into Label Studio
2. headlines_for_labeling.csv - Use this for reference or alternative tools
