In [None]:
%pip install openai-whisper torch openpyxl pandas chord-extractor autochord

In [1]:
import whisper
import torch
import gc
from chord_extractor.extractors import Chordino
from difflib import SequenceMatcher
import json
import pandas as pd
import os
from openpyxl import load_workbook
from openpyxl.styles import Font
from openpyxl.cell.rich_text import CellRichText, TextBlock, InlineFont
import autochord

# Set up parameters
xlsx_file = "JM.xlsx"
audio_folder = "../Joyful_Melodies"
output_folder = "aligned_output"
chord_output_folder = "chord_transcriptions"

# Ensure output folders exist
os.makedirs(output_folder, exist_ok=True)
os.makedirs(chord_output_folder, exist_ok=True)

2024-10-17 19:35:55.654255: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-17 19:35:55.697882: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


autochord: Initializing...
autochord: Using NNLS-Chroma VAMP plugin in /home/arrupe/.local/lib/python3.8/site-packages/chord_extractor/_lib


2024-10-17 19:35:57.277291: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients_split_2_grad_concat_split_2_split_dim' with dtype int32
	 [[{{node gradients_split_2_grad_concat_split_2_split_dim}}]]
2024-10-17 19:35:57.277401: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients_split_grad_concat_split_split_dim' with dtype int32
	 [[{{node gradients_split_grad_concat_split_split_dim}}]]
2024-10-17 19:35:57.277482: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

autochord: Loaded model from /home/arrupe/.autochord/chroma-seq-bilstm-crf-v1


2024-10-17 19:36:02.197699: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,128,256]
	 [[{{node inputs}}]]
2024-10-17 19:36:02.207434: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype float and shape [?,128,256]
	 [[{{node Placeholder}}]]


In [2]:
def read_lyrics_from_xlsx(file_path, serial_number):
    """Read lyrics from the XLSX file for the given serial number."""
    wb = load_workbook(file_path, rich_text=True)
    sheet = wb.active
    
    for row in sheet.iter_rows(min_row=1, values_only=False):
        if row[0].value == serial_number:
            title = row[1].value
            lyrics_cell = row[2]
            
            chorus = []
            verses = []
            current_verse = []
            
            if isinstance(lyrics_cell.value, CellRichText):
                lines = str(lyrics_cell.value).split('\n')
                for line in lines:
                    if line.strip():
                        is_bold = any(isinstance(run, TextBlock) and run.font and run.font.b 
                                      for run in lyrics_cell.value if str(run) in line)
                        if is_bold:
                            chorus.append(line.strip())
                        else:
                            current_verse.append(line.strip())
                    elif current_verse:
                        verses.append('\n'.join(current_verse))
                        current_verse = []
            else:
                # Fallback to non-rich text handling
                lines = str(lyrics_cell.value).split('\n')
                for line in lines:
                    if line.strip():
                        current_verse.append(line.strip())
                    elif current_verse:
                        verses.append('\n'.join(current_verse))
                        current_verse = []
            
            if current_verse:
                verses.append('\n'.join(current_verse))
            
            # Remove the last "verse" if it's just a number
            if verses and verses[-1].strip().isdigit():
                verses.pop()
            
            return {
                "title": title,
                "chorus": '\n'.join(chorus),
                "verses": verses
            }
    
    return None

def transcribe_audio_whisper(audio_file: str, model_name: str = "base") -> list:
    """Transcribe the given audio file using the Whisper model locally."""
    print(f"Loading Whisper model: {model_name}")
    model = whisper.load_model(model_name).to('cpu')

    print(f"Transcribing audio file: {audio_file}")
    with torch.no_grad():
        audio = whisper.load_audio(audio_file)
        result = model.transcribe(audio, language='en', word_timestamps=True)
    
    transcription = []
    for segment in result['segments']:
        for word in segment['words']:
            transcription.append((word['word'], (word['start'], word['end'])))
    
    del result
    gc.collect()
    return transcription

def extract_and_combine_chords(audio_file):
    autochord_chords = autochord.recognize(audio_file)
    chordino = Chordino()
    chordino_chords = chordino.extract(audio_file)
    
    combined_chords = []
    chordino_index = 0
    
    for i, (start, end, autochord_chord) in enumerate(autochord_chords):
        if autochord_chord == 'N':
            while chordino_index < len(chordino_chords) and chordino_chords[chordino_index].timestamp < end:
                if chordino_chords[chordino_index].timestamp >= start:
                    converted_chord = chordino_chords[chordino_index].chord
                    break
                chordino_index += 1
            else:
                continue
        elif ':' in autochord_chord:
            chord_root, chord_type = autochord_chord.split(':')
            converted_chord = chord_root if chord_type == 'maj' else f"{chord_root}m"
        else:
            converted_chord = autochord_chord
        
        while chordino_index < len(chordino_chords) and chordino_chords[chordino_index].timestamp < end:
            chordino_chord = chordino_chords[chordino_index].chord
            if chordino_chord[0] == converted_chord[0]:
                converted_chord = chordino_chord
                break
            chordino_index += 1
        
        combined_chords.append((start, end, converted_chord))
    
    return combined_chords

def correct_transcription(lyrics, transcription):
    """Corrects the transcription using the lyrics from the XLSX file."""
    lyrics_words = ' '.join(lyrics["chorus"].lower().split() + 
                            [word for verse in lyrics["verses"] for word in verse.lower().split()])
    lyrics_words = lyrics_words.split()
    
    corrected_transcription = []
    lyrics_index = 0
    window_size = 5  # Look at 5 words before and after the current word
    
    for i, (trans_word, (start, end)) in enumerate(transcription):
        trans_word = trans_word.lower()
        
        # Define the search window in lyrics
        window_start = max(0, lyrics_index - window_size)
        window_end = min(len(lyrics_words), lyrics_index + window_size + 1)
        search_window = lyrics_words[window_start:window_end]
        
        # Find the best matching word from lyrics within the window
        best_match = None
        best_ratio = 0
        for j, lyric_word in enumerate(search_window):
            ratio = SequenceMatcher(None, trans_word, lyric_word).ratio()
            if ratio > best_ratio and ratio > 0.8:  # Increased similarity threshold
                best_ratio = ratio
                best_match = lyric_word
                lyrics_index = window_start + j
        
        if best_match:
            corrected_transcription.append((best_match, (start, end)))
            lyrics_index += 1
        else:
            # If no good match found, keep the original word
            corrected_transcription.append((trans_word, (start, end)))
    
    return corrected_transcription

def align_lyrics_and_chords(corrected_transcription, chords, lyrics):
    """Aligns lyrics with corrected transcription and chords, preserving original line structure."""
    aligned_output = []
    chord_index = 0
    
    # Flatten lyrics into lines
    lyric_lines = [line.strip() for verse in lyrics['verses'] for line in verse.split('\n') if line.strip()]
    if lyrics['chorus']:
        lyric_lines = lyrics['chorus'].split('\n') + lyric_lines
    
    word_count = 0
    for lyric_line in lyric_lines:
        line_words = lyric_line.split()
        current_line = []
        last_chord = None
        
        for i, word in enumerate(line_words):
            if word_count >= len(corrected_transcription):
                break
            
            _, start, end = corrected_transcription[word_count]
            
            # Find the chord that occurs just before or at the start of the word
            while chord_index < len(chords) - 1 and chords[chord_index + 1][0] <= start:
                chord_index += 1
            
            current_chord = chords[chord_index][2]  # Use the actual chord name
            chord_start = chords[chord_index][0]
            next_chord_start = chords[chord_index + 1][0] if chord_index < len(chords) - 1 else float('inf')
            
            # Decide whether to place the chord before or after the word
            if abs(chord_start - start) < abs(chord_start - end) and chord_start > start:
                word_with_chord = f"{word}[{current_chord}]"
            elif current_chord != last_chord:
                word_with_chord = f"[{current_chord}]{word}"
                last_chord = current_chord
            else:
                word_with_chord = word
            
            current_line.append(word_with_chord)
            word_count += 1
        
        aligned_output.append(" ".join(current_line))
    
    return aligned_output

def save_chord_transcription(chords, output_file):
    """Save the chord transcription to a file."""
    with open(output_file, 'w') as f:
        for start, end, chord in chords:
            f.write(f"{chord} ({start:.2f} - {end:.2f})\n")

def process_song(serial_number, prefix):
    print(f"\nProcessing song {serial_number}")
    
    try:
        # Read lyrics
        lyrics = read_lyrics_from_xlsx(xlsx_file, serial_number)
        if lyrics is None:
            print(f"No lyrics found for serial number {serial_number}")
            return
        print(f"Lyrics read successfully. Title: {lyrics['title']}")
        
        # Find audio file and transcribe
        audio_files = [f for f in os.listdir(audio_folder) if f.startswith(f"{serial_number:03d}")]
        if not audio_files:
            print(f"No audio file found for serial number {serial_number}")
            return
        audio_file = os.path.join(audio_folder, audio_files[0])
        transcription = transcribe_audio_whisper(audio_file, model_name="medium")
        print(f"Transcription length: {len(transcription)}")
        
        # Correct the transcription
        corrected_transcription = correct_transcription(lyrics, transcription)
        
        # Extract and combine chords
        chords = extract_and_combine_chords(audio_file)
        print(f"Extracted {len(chords)} chords")
        
        # Save combined chord transcription
        chord_output_file = os.path.join(chord_output_folder, f"{prefix}_{serial_number:03d}_chords.txt")
        save_chord_transcription(chords, chord_output_file)
        print(f"Combined chord transcription saved to {chord_output_file}")
        
        # Align lyrics, corrected transcription, and chords
        aligned_output = align_lyrics_and_chords(corrected_transcription, chords, lyrics)
        
        # Save aligned output
        aligned_output_file = os.path.join(output_folder, f"{prefix}_{serial_number:03d}_aligned.txt")
        with open(aligned_output_file, 'w') as f:
            f.write('\n'.join(aligned_output))
        
        print(f"Aligned output saved to {aligned_output_file}")
        
    except Exception as e:
        print(f"Error processing song {serial_number}: {str(e)}")
        import traceback
        traceback.print_exc()

In [None]:
# Define the range of serial numbers to process
start_serial = 3
end_serial = 4  # You can adjust this number as needed
all_serials = range(start_serial, end_serial + 1)

# Set a single prefix for all songs
prefix = "JM"

# Process all songs
for serial in all_serials:
    process_song(serial, prefix)

print("All songs processed.")


Processing song 3
Lyrics read successfully. Title: ABBA, FATHER, LET US BE YOURS
Loading Whisper model: medium


  checkpoint = torch.load(fp, map_location=device)


Transcribing audio file: ../Joyful_Melodies/003 Abba Father - wjl B36.mp3




Transcription length: 140


[src/libmpg123/id3.c:process_comment():587] error: No comment text / valid description?
2024-10-17 19:42:11.870762: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype double and shape [?,128,24]
	 [[{{node inputs}}]]




[src/libmpg123/id3.c:process_comment():587] error: No comment text / valid description?


Extracted 102 chords
Combined chord transcription saved to chord_transcriptions/JM_003_chords.txt
Error processing song 3: not enough values to unpack (expected 3, got 2)

Processing song 4


Traceback (most recent call last):
  File "/tmp/ipykernel_209133/1224115262.py", line 228, in process_song
    aligned_output = align_lyrics_and_chords(corrected_transcription, chords, lyrics)
  File "/tmp/ipykernel_209133/1224115262.py", line 163, in align_lyrics_and_chords
    _, start, end = corrected_transcription[word_count]
ValueError: not enough values to unpack (expected 3, got 2)


Lyrics read successfully. Title: ABIDE WITH ME!
Loading Whisper model: medium
