In [1]:
import json
import math
import os

def seconds_to_hours_minutes_seconds(seconds):
    """Render a duration given in seconds as a compact clock-style label.

    Args:
        seconds: Duration in seconds (int or float). Fractional parts are
            truncated by ``int()`` after the hour/minute/second split.

    Returns:
        ``"<H>h<MM>m<SS>s"`` when the hour component is positive, otherwise
        ``"<M>m<SS>s"`` (minutes are not zero-padded in the short form).
    """
    hour_count, within_hour = divmod(seconds, 3600)
    hour_part = int(hour_count)
    minute_part = int(within_hour // 60)
    second_part = int(seconds % 60)

    # Short form: no hour component to show.
    if hour_part <= 0:
        return f"{minute_part}m{second_part:02d}s"
    return f"{hour_part}h{minute_part:02d}m{second_part:02d}s"

def create_new_segments(json_data, module, min_duration=30):
    """Merge consecutive transcript segments into longer chunks.

    Starting from each not-yet-consumed segment, following segments are
    absorbed until the chunk spans at least ``min_duration`` seconds or the
    input runs out. The final chunk may therefore be shorter than
    ``min_duration``.

    Args:
        json_data: Mapping with a ``'segments'`` list; every segment is a
            mapping providing ``'start'``, ``'end'`` and ``'text'``.
        module: Label copied into the ``'module'`` field of every chunk.
        min_duration: Minimum span, in seconds, a chunk should cover before
            it stops absorbing further segments.

    Returns:
        A list of dicts with keys ``'module'``, ``'start'``, ``'end'`` (both
        formatted by ``seconds_to_hours_minutes_seconds``) and ``'content'``
        (the absorbed texts joined with single spaces).
    """
    source = json_data['segments']
    total = len(source)
    merged = []

    cursor = 0
    while cursor < total:
        head = source[cursor]
        began = head['start']
        ended = head['end']
        pieces = [head['text']]
        cursor += 1

        # Keep absorbing followers while the chunk is still too short.
        while cursor < total and ended - began < min_duration:
            follower = source[cursor]
            ended = follower['end']
            pieces.append(follower['text'])
            cursor += 1

        merged.append({
            'module': module,
            'start': seconds_to_hours_minutes_seconds(began),
            'end': seconds_to_hours_minutes_seconds(ended),
            'content': ' '.join(pieces),
        })

    return merged

def process_json_file(input_json_file_path, module, output_jsonl_file_path):
    """Merge one transcript JSON file into chunks and append them as JSONL.

    Args:
        input_json_file_path: Path to a JSON file whose top-level object has
            a ``'segments'`` list (see ``create_new_segments``).
        module: Label stored in the ``'module'`` field of every chunk.
        output_jsonl_file_path: JSONL file to append to; it is created when
            it does not exist yet.

    Side effects:
        Appends one JSON object per line to ``output_jsonl_file_path`` and
        prints a confirmation message.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which can mis-decode or fail on non-ASCII transcript text.
    with open(input_json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    new_segments = create_new_segments(json_data, module)

    # Append mode already creates a missing file, so no separate
    # exists-check / empty-write step is needed.
    with open(output_jsonl_file_path, 'a', encoding='utf-8') as file:
        file.writelines(json.dumps(segment) + '\n' for segment in new_segments)

    print(f"New segments appended to: {output_jsonl_file_path}")


def create_new_segments_from_jsonl(jsonl_file_path, module, output_jsonl_file_path, min_duration=30):
    """Merge every transcript in a JSONL file into chunks and append them as JSONL.

    Each input line is parsed as an independent JSON document with a
    ``'segments'`` list and merged with ``create_new_segments``. Lines that
    are not valid JSON are reported on stdout and skipped.

    Args:
        jsonl_file_path: Path to the input file, one JSON document per line.
        module: Label stored in the ``'module'`` field of every chunk.
        output_jsonl_file_path: JSONL file to append to; it is created when
            it does not exist yet.
        min_duration: Minimum span, in seconds, a chunk should cover before
            it stops absorbing further segments.

    Side effects:
        Appends one JSON object per line to ``output_jsonl_file_path`` and
        prints a confirmation message.
    """
    # Explicit UTF-8 avoids depending on the platform's locale encoding.
    with open(jsonl_file_path, 'r', encoding='utf-8') as source:
        # Open the output once, under its own name, rather than reopening it
        # for every input line and rebinding the input handle's name.
        with open(output_jsonl_file_path, 'a', encoding='utf-8') as sink:
            for line_number, line in enumerate(source, start=1):
                # Only the parse can raise JSONDecodeError; keep the try tight.
                try:
                    json_data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"JSON decoding error in file '{jsonl_file_path}', line {line_number}: {str(e)}")
                    continue

                for segment in create_new_segments(json_data, module, min_duration):
                    # json.dumps escapes the whole record exactly once.
                    # Pre-encoding 'content' separately would store a
                    # JSON-inside-JSON string (with literal quotes), unlike
                    # the records written by process_json_file.
                    sink.write(json.dumps(segment) + '\n')

    print(f"New segments appended to: {output_jsonl_file_path}")



# Driver script: merge the transcripts listed below and collect the resulting
# chunks in a single JSONL file.
# NOTE: both functions called here open the output in append mode, so
# re-running this cell adds the same chunks to the file again.

# Specify the path to your output JSONL file
output_jsonl_file_path = "../data/GES824/embeds/summaries_audio_GES824.jsonl"

# Process multiple JSON files with their corresponding modules.
# Each entry is (transcript path, module label stored in every chunk).
json_file_paths_and_modules = [
    ('../data/GES824/transcripts/cours_1_transcription.json', 'cours_1'), 
    ('../data/GES824/transcripts/cours_2-1_transcription.json', 'cours_2-1'), 
    ('../data/GES824/transcripts/cours_2-2_transcription.json', 'cours_2-2'), 
    ('../data/GES824/transcripts/cours_3-1_transcription.json', 'cours_3-1'),
]

# Each of these files is loaded as one JSON document.
for json_file_path, module in json_file_paths_and_modules:
    process_json_file(json_file_path, module, output_jsonl_file_path)


# Transcripts stored as JSONL: every line is parsed as its own JSON document.
# Same (transcript path, module label) layout as above.
jsonl_file_paths_and_modules = [
    ('../data/GES824/transcripts/cours_3-2_transcription.jsonl', 'cours_3-2'), 
    ('../data/GES824/transcripts/cours_3-3_transcription.jsonl', 'cours_3-3'), 
    ('../data/GES824/transcripts/cours_3-4_transcription.jsonl', 'cours_3-4'), 
    ('../data/GES824/transcripts/cours_3-5_transcription.jsonl', 'cours_3-5'),
]

# Chunks are appended to the same output file as the JSON transcripts above.
for jsonl_file_path, module in jsonl_file_paths_and_modules:
    create_new_segments_from_jsonl(jsonl_file_path, module, output_jsonl_file_path)

New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
New segments appended to: ../data/GES824/embeds/summaries_audio_GES824.jsonl
