# Transcription to Excel Converter (With Cleaning)

In [None]:
import csv

def format_time(seconds):
    """Return *seconds* rendered as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated (not rounded); the hours field grows
    beyond two digits when the duration is 100 hours or longer.
    """
    whole_hours = int(seconds // 3600)
    whole_minutes = int((seconds % 3600) // 60)
    whole_seconds = int(seconds % 60)
    fields = (whole_hours, whole_minutes, whole_seconds)
    return ":".join(f"{field:02d}" for field in fields)

# Convert the speaker-labelled transcript text file into a CSV with one row
# per utterance.  Each input line is expected to look like:
#     "0000.5s - 0012.7s - SPEAKER_00: Merhaba, ..."
input_file = "transcriptions_with_speakers.txt"
with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()


output_file = "transcriptions_with_speakers.csv"
with open(output_file, "w", encoding="utf-8", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    # header
    csv_writer.writerow(["Start Time", "End Time", "Speaker", "Transcription"])

    for line in lines:
        stripped_line = line.strip()
        if not stripped_line:
            # Blank lines carry no data; skip them without reporting an error.
            continue

        # Split into at most three fields so that a " - " occurring inside the
        # spoken text stays part of the transcription instead of cutting it off.
        parts = stripped_line.split(" - ", 2)
        if len(parts) < 3:
            print(f"Skipping line due to incorrect format: {stripped_line}")
            continue

        try:
            # Drop only a trailing 's' unit marker; any other suffix makes
            # float() fail and the line is reported instead of silently
            # losing its last character.
            start_time = float(parts[0].strip().rstrip("s"))
            end_time = float(parts[1].strip().rstrip("s"))
            formatted_start_time = format_time(start_time)
            formatted_end_time = format_time(end_time)
        except (ValueError, OverflowError) as e:
            # ValueError: non-numeric or NaN timestamp; OverflowError: infinite one.
            print(f"Error processing line: {stripped_line}. Error: {e}")
            continue

        # Separate the speaker label from the spoken text at the first ": ".
        speaker, separator, transcription = parts[2].partition(": ")
        if not separator and speaker.endswith(":"):
            # An empty transcription loses its trailing space to strip(),
            # leaving "SPEAKER_00:"; remove the dangling colon from the label.
            speaker = speaker[:-1]

        csv_writer.writerow([formatted_start_time, formatted_end_time, speaker, transcription])

print(f"CSV file '{output_file}' created successfully.")

# Further cleaning to produce a readable Excel file

In [None]:
import pandas as pd

# Load the CSV file created earlier.
# Every column is read as text and pandas' default NA parsing is disabled so
# that transcriptions such as "NA", "null" or "nan" are kept verbatim instead
# of becoming missing values, and digit-only text is not coerced to numbers.
csv_file = "transcriptions_with_speakers.csv"
df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

def clean_transcription(text):
    """Trim surrounding whitespace and one pair of enclosing double quotes.

    Non-string values (for example missing values) are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    trimmed = text.strip()
    # A value counts as quoted when its first and last characters are both '"'.
    is_quoted = trimmed[:1] == '"' and trimmed[-1:] == '"'
    return trimmed[1:-1] if is_quoted else trimmed

# Clean every value in the 'Transcription' column, then write the result
# (without the index column) to a new CSV file.
cleaned_column = df['Transcription'].apply(clean_transcription)
df['Transcription'] = cleaned_column

cleaned_csv_file = "cleaned_transcriptions_with_speakers.csv"
df.to_csv(cleaned_csv_file, index=False)

print(f"Cleaned CSV file '{cleaned_csv_file}' created successfully.")

In [None]:
# Export the cleaned DataFrame to an Excel workbook, omitting the index column.
df.to_excel(excel_writer="cleaned_transcriptions_with_speakers.xlsx", index=False)