In [9]:
import re
from pathlib import Path
from typing import Iterable, Union
import argparse

def remove_metadata_blocks_from_text(text: str) -> str:
    """
    Strip scraped metadata from a lyrics text and return the cleaned text.

    Two kinds of residue are removed:

    * Header blocks: a line starting with optional whitespace, a number and
      the word "Contributor"/"Contributors" (e.g. "12 ContributorsTitle Lyrics"
      or "1 ContributorTitle Lyrics"), together with every following line up
      to the next blank line. That blank line itself is kept so the stanza
      break survives.
    * Embed trailers: a "<digits>Embed" counter, or a bare "Embed" glued to
      the end of a word (e.g. "girl!87Embed" -> "girl!", "loveEmbed" ->
      "love"). Only the trailer is removed; the lyric word it is attached to
      is preserved. A standalone word "Embed" is left untouched.

    Args:
        text: Raw lyrics text, possibly containing several metadata blocks.

    Returns:
        The text with metadata blocks and Embed trailers removed. Line endings
        of the kept lines are preserved.
    """
    # 'Contributors?' so that the singular header "1 Contributor..." is caught too.
    header_pat = re.compile(r'^\s*\d+\s+Contributors?')

    # First alternative: a digit run followed by 'Embed' ("87Embed").
    # Second alternative: 'Embed' glued to a preceding non-space character
    # ("loveEmbed"); the lookbehind keeps a standalone "Embed" intact.
    # Neither alternative consumes the lyric text in front of the trailer.
    embed_pat = re.compile(r'\d+Embed\b|(?<=\S)Embed\b', re.IGNORECASE)

    lines = text.splitlines(keepends=True)
    out_lines = []
    i = 0

    while i < len(lines):
        line = embed_pat.sub('', lines[i])

        if not header_pat.match(line):
            out_lines.append(line)
            i += 1
            continue

        # Header found: drop it and every following non-blank line.
        i += 1
        while i < len(lines) and lines[i].strip():
            i += 1

        # The loop above stops either at end of input or on a blank line;
        # in the latter case keep that blank line as the stanza separator.
        if i < len(lines):
            out_lines.append(lines[i])
            i += 1

    return "".join(out_lines)


def process_file(path: Union[str, Path], inplace: bool = True, backup: bool = True, encoding: str = "utf-8") -> str:
    """
    Clean a single lyrics file with remove_metadata_blocks_from_text.

    Args:
        path: File to read.
        inplace: If True, overwrite the file with the cleaned text when the
            cleaning actually changed something.
        backup: If True (and the file is about to be overwritten), first save
            the original text next to it as "<name>.bak".
        encoding: Text encoding used for reading and writing.

    Returns:
        The cleaned text, whether or not the file was rewritten.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    p = Path(path)
    text = p.read_text(encoding=encoding)
    cleaned = remove_metadata_blocks_from_text(text)

    # Only touch the disk when the content changed. Rewriting an already-clean
    # file would also overwrite its .bak with the cleaned text, destroying the
    # backup of the original made by an earlier run.
    if inplace and cleaned != text:
        if backup:
            # Append ".bak" to the full file name ("song.txt" -> "song.txt.bak").
            bak = p.with_name(p.name + ".bak")
            bak.write_text(text, encoding=encoding)
        p.write_text(cleaned, encoding=encoding)

    return cleaned


def process_paths(paths: Iterable[Union[str, Path]], **kwargs) -> None:
    """
    Process multiple files or directories with process_file.

    For a directory, the regular files directly inside it (non-recursive) are
    processed in sorted name order, except:

    * hidden files (name starts with "."), e.g. macOS ".DS_Store", which are
      not lyrics and typically fail to decode as text;
    * "*.bak" files, so backups written by an earlier run are not themselves
      cleaned and re-backed-up.

    A path that points directly at a file is always processed, whatever its
    name. Paths that are neither an existing file nor a directory are ignored.

    Args:
        paths: Files and/or directories to process.
        **kwargs: Passed through unchanged to process_file.
    """
    for entry in map(Path, paths):
        if entry.is_dir():
            # sorted() gives a deterministic processing order; iterdir() alone
            # yields entries in arbitrary order.
            for child in sorted(entry.iterdir()):
                if not child.is_file():
                    continue
                if child.name.startswith(".") or child.name.endswith(".bak"):
                    continue
                process_file(child, **kwargs)
        elif entry.is_file():
            process_file(entry, **kwargs)

In [None]:
# Smoke test: run the cleaner on a single file before touching a whole folder.

# Path to one artist's lyrics file (relative, so it is resolved against the
# notebook's current working directory).
artist_lyrics = "50_Cent.txt"

# Clean the file in place; backup=True asks process_file to keep a .bak copy
# of the original. The returned cleaned text is the last expression of the
# cell, so it is shown as the cell output for visual inspection.
process_file(artist_lyrics, inplace=True, backup=True, encoding="utf-8")

'\n\nLil mama, show me how you move it\nGo \'head, put your back into it\nDo your thing like it ain\'t nothing to it\nShake, sh-sh-shake that ass, girl!\nLil mama, show me how you move it\nGo \'head, put your back into it\nDo your thing like it ain\'t nothing to it\nShake, sh-sh-shake that ass, girl!\n\nGo, go, go, 50 in the house, bounce\nY\'all already know what I\'m about\nThe flow sound sick over Dre drums, nigga\nI ain\'t stupid, I say Doc then my doe come quicker, whoa!\nShorty hips is hypnotic, she moves is so erotic\nI watch her, I\'m like, "Bounce that ass, girl"\nI get it crump in here, I make it jump in here\nFront in here, we\'ll thump in here, oh!\nSo gutter, so ghetto, so hood\nSo gully, so grimey, what\'s good\nOutside the Benz on dubs, I\'m in the club with the snub\nDon\'t start nothing, there won\'t be nothing, uhh (one, two, three, let’s go!)\nSee 50 Cent LiveGet tickets as low as $70You might also like\nLil mama, show me how you move it\nGo \'head, put your back int

In [None]:
# Clean all files directly inside a folder (process_paths does not recurse
# into sub-folders).
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running on another machine.
# NOTE(review): inplace=True with backup=False overwrites the files without
# keeping a copy of the originals.
PATH = '/Users/noa/Desktop/02805 - Social Graphs/artist_lyrics_cleaned'
process_paths([PATH], inplace=True, backup=False, encoding="utf-8")