# In [None]:
import json
import time
import os
from datetime import datetime
import random

class StructuredStreamingProducer:
    """Periodically drop small JSON-lines files of random student records.

    Each produced file contains one to three records of the form
    ``{"name": <str>, "gpa": <float>}``, one JSON object per line, and is
    written into ``output_dir`` so that a directory-watching consumer can
    pick the files up as they appear.

    Args:
        output_dir: Directory the files are written to; created if missing.
        file_interval: Seconds to sleep between two consecutive files.
        total_files: Number of files created by ``produce_files``.
    """

    def __init__(self, output_dir="./data/structured_streaming_names", file_interval=10, total_files=6):
        self.output_dir = output_dir
        self.file_interval = file_interval  # seconds between files
        self.total_files = total_files

        # Pool of names the random records are drawn from.
        self.names = ["Ana", "Carlos", "Maria", "Juan", "Laura", "Diego", "Sofia", "Pedro", "Elena", "Miguel"]

        # Make sure the target directory exists before anything is written.
        os.makedirs(output_dir, exist_ok=True)

    def generate_student_data(self, file_number):
        """Return a list of one to three random student records.

        Args:
            file_number: Sequence number of the file being produced. It is
                accepted for interface compatibility but does not influence
                the generated data.

        Returns:
            A list of dicts with a ``"name"`` drawn from ``self.names`` and a
            ``"gpa"`` rounded to two decimals, drawn uniformly from 6.0-10.0.
        """
        num_records = random.randint(1, 3)
        return [
            {
                "name": random.choice(self.names),
                "gpa": round(random.uniform(6.0, 10.0), 2),  # GPA between 6.0 and 10.0
            }
            for _ in range(num_records)
        ]

    def _write_atomically(self, filepath, students):
        """Write *students* as JSON lines to *filepath* via temp file + rename."""
        directory, filename = os.path.split(filepath)
        # The temp file lives in the same directory so the final rename stays
        # on one filesystem. The dot prefix keeps it out of the way of
        # directory listers that skip hidden files (Spark's file source does),
        # so a consumer never sees a half-written file.
        tmp_path = os.path.join(directory, f".{filename}.tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                # One JSON object per line (JSON Lines).
                for student in students:
                    json.dump(student, f)
                    f.write("\n")
            os.replace(tmp_path, filepath)
        finally:
            # After a successful replace the temp path no longer exists; this
            # only cleans up when writing or renaming failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def produce_files(self):
        """Create ``total_files`` JSON-lines files, pausing between them.

        Every file is named ``students_<HHMMSS>_<NN>.json`` and appears in
        ``output_dir`` only once it is completely written. Progress is
        reported on standard output. The method sleeps ``file_interval``
        seconds after every file except the last one.
        """
        print("Starting Structured Streaming File Producer")
        print(f"Output directory: {self.output_dir}")
        print(f"File interval: {self.file_interval} seconds")
        print(f"Total files to create: {self.total_files}")
        print("=" * 60)

        for i in range(1, self.total_files + 1):
            students_data = self.generate_student_data(i)

            # The timestamp plus the zero-padded sequence number keeps
            # filenames distinct within a run.
            timestamp = datetime.now().strftime("%H%M%S")
            filename = f"students_{timestamp}_{i:02d}.json"
            filepath = os.path.join(self.output_dir, filename)

            self._write_atomically(filepath, students_data)

            print(f"File {i}/{self.total_files} created: {filename}")
            print(f"Records: {len(students_data)}")
            for student in students_data:
                print(f"  - {student['name']} (GPA: {student['gpa']})")

            # Wait before creating the next file (except after the last one).
            if i < self.total_files:
                print(f"Waiting {self.file_interval} seconds...")
                print("-" * 40)
                time.sleep(self.file_interval)

        print("=" * 60)
        print(f"Production completed! {self.total_files} files created.")
        print("Files are ready for Spark Structured Streaming consumption.")
        print(f"Monitor directory: {self.output_dir}")

def main():
    """Build a producer with the demo settings and run it to completion."""
    settings = {
        "output_dir": "./data/structured_streaming_names",  # directory the consumer watches
        "file_interval": 10,  # seconds between files (convenient for testing)
        "total_files": 6,     # number of files to create
    }
    StructuredStreamingProducer(**settings).produce_files()

# Run the producer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()