In [201]:
import os
import json
import pandas as pd
import numpy as np

In [202]:
# Function to calculate median, average cosine_dist, and occurrences for each speaker
def calculate_speaker_stats(data, filter_length=50):
    speakers_cosine_dist = {}
    segments = data if isinstance(data, list) else data.get('segments', [])

    for segment in segments:
        if 'speaker' not in segment:
            continue
        
        speaker = segment['speaker']
        text_length = len(segment['text'])
        cosine_dist = segment['cosine_dist']
        
        if text_length > filter_length:
            if speaker not in speakers_cosine_dist:
                speakers_cosine_dist[speaker] = []
            speakers_cosine_dist[speaker].append(cosine_dist)

    speaker_stats = {}
    for speaker, cosine_dists in speakers_cosine_dist.items():
        median_cosine_dist = np.median(cosine_dists)
        average_cosine_dist = np.mean(cosine_dists)
        occurrences = len(cosine_dists)
        speaker_stats[speaker] = {
            'median_cosine_dist': median_cosine_dist,
            'average_cosine_dist': average_cosine_dist,
            'occurrences': occurrences
        }
    
    return speaker_stats

# Function to process a single JSON file
def process_single_json(data):
    stats = calculate_speaker_stats(data)
    valid_speakers = {s: stat for s, stat in stats.items()}

    sorted_speakers = sorted(valid_speakers.items(), key=lambda item: (item[1]['average_cosine_dist'], item[1]['median_cosine_dist']))
    assistant_speaker = sorted_speakers[0][0]
    
    for segment in data['segments']:
        if 'speaker' not in segment:
            continue
        if segment['speaker'] == assistant_speaker:
            segment['speaker'] = 'assistant'
        else:
            segment['speaker'] = 'user'
    
    return data

def get_speaker_aware_transcript(segments):
    previous_speaker = segments[0]["speaker"]
    transcript = [{'role': 'user' if previous_speaker == 'user' else 'assistant', 'content': ''}]
    
    for segment in segments:
        if 'speaker' not in segment:
            continue
        speaker = segment["speaker"]
        text = segment["text"]
        role = 'user' if speaker == 'user' else 'assistant'

        if role != transcript[-1]['role']:
            transcript.append({'role': role, 'content': text.strip()})
        else:
            transcript[-1]['content'] += ' ' + text.strip()
        
        previous_speaker = speaker

    return transcript

# Function to validate transcript
def validate_transcript(data, min_number_occurrences=30, valid_speakers=2):
    stats = calculate_speaker_stats(data)
    has_enough_speakers = bool(len({s: stat for s, stat in stats.items()}) >= valid_speakers)
    has_enough_ocurrences = bool(sum(stat['occurrences'] for stat in stats.values()) >= min_number_occurrences)
    return has_enough_speakers and has_enough_ocurrences

def load_json(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

In [203]:
# Directory containing the JSON files
directory = '../output/milei'

# List all JSON files in the directory
json_files = [f for f in os.listdir(directory) if f.endswith('.json')]

# Initialize a list to store the data
data_list = []

In [204]:
# Load and process each JSON file
for json_file in json_files:
    file_path = os.path.join(directory, json_file)
    data = load_json(file_path)
    
    # Ensure data is a dictionary
    if isinstance(data, list):
        data = {'segments': data}
    
    # Validate the JSON data
    if not validate_transcript(data):
        continue
    
    # Extract metadata
    metadata = {key: value for key, value in data.items() if key != 'segments'}
    
    # Process and extract messages
    modified_json_data = process_single_json(data)
    messages = get_speaker_aware_transcript(modified_json_data["segments"])
    
    # Create a single row with metadata and messages
    row = metadata
    row['messages'] = messages
    
    # Append to data_list
    data_list.append(row)

In [205]:
# convert the list to a DataFrame
df = pd.DataFrame(data_list)

# fix a few things
df.rename(columns={'id': 'video_id'}, inplace=True)
df.rename(columns={'candidate_name': 'assistant'}, inplace=True)
df.drop(columns=['Index', 'quality'], inplace=True)

df.head()

Unnamed: 0,channel_id,channel,uploader_url,video_id,url,title,duration,view_count,assistant,messages
0,UCz489cQmrgH57sShDiatwfw,El Peluca Milei,https://www.youtube.com/@ElPelucaMilei,tUkiys_DZ6A,https://www.youtube.com/watch?v=tUkiys_DZ6A,MILEI LE ECHÓ EN CARA QUE SU SOCIO POLÍTICO LO...,1520.0,270112.0,milei,"[{'role': 'assistant', 'content': ' Pero yo es..."
1,UCz489cQmrgH57sShDiatwfw,El Peluca Milei,https://www.youtube.com/@ElPelucaMilei,u2HyQ34IvTo,https://www.youtube.com/watch?v=u2HyQ34IvTo,MILEI DESBURRÓ AL BRUTO DE DANIEL SCIOLI,2750.0,938494.0,milei,"[{'role': 'user', 'content': ' Javier Milei, V..."
2,UCz489cQmrgH57sShDiatwfw,El Peluca Milei,https://www.youtube.com/@ElPelucaMilei,HvIPOKufdV4,https://www.youtube.com/watch?v=HvIPOKufdV4,MILEI ROMPIÓ EN LLANTO TRAS PASAR LAS ELECCIONES,4574.0,961932.0,milei,"[{'role': 'assistant', 'content': ' Idea de......"
3,UCz489cQmrgH57sShDiatwfw,El Peluca Milei,https://www.youtube.com/@ElPelucaMilei,5YUXdjZ0q8E,https://www.youtube.com/watch?v=5YUXdjZ0q8E,MILEI SE CRUZÓ CON FEINMANN EN MEDIO DE LA ENT...,3014.0,513444.0,milei,"[{'role': 'user', 'content': ' Diciéndole que ..."
4,UCz489cQmrgH57sShDiatwfw,El Peluca Milei,https://www.youtube.com/@ElPelucaMilei,zS-3QVSIsNo,https://www.youtube.com/watch?v=zS-3QVSIsNo,¿QUÉ POLÍTICO LE OFRECIÓ UNA VALIJA CON PLATA ...,3187.0,137927.0,milei,"[{'role': 'assistant', 'content': ' No voy a e..."


In [206]:
# summary
print(f"nos quedamos con {len(df)} de {len(json_files)} o un {len(df)/len(json_files)*100:.0f}%")

nos quedamos con 207 de 261 o un 79%


In [209]:
# Save the DataFrame to a .csv and .parquet file
output_path = '../data'
df.to_csv(f'{output_path}/huggingface_dataset.csv', index=False)
df.to_parquet(f'{output_path}/huggingface_dataset.parquet')

print(f"Data saved to {output_path}")

Data saved to ../data
