# Clean des sons après récupération

## Paquets utilisés

In [11]:
import json
import re
import pandas as pd
from typing import List

## Création de plusieurs champs dérivés des paroles originales

### `original_lyrics`
Les paroles originales, celles récupérées sur le site [genius.com](https://genius.com)

### `lyrics`
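Les paroles en minuscules, sans : les annotations entre crochets ou parenthèses (`[Couplet]`), les mots contenant des chiffres (`mot94`, `34`), la ponctuation (hors apostrophes et tirets) et les espaces multiples (`  `)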

### `lyrics_keywords`
Les paroles utilisées pour être comparées avec un dictionnaire ; elles sont normalisées à leur forme la plus simple

In [12]:
def clean_lyrics(lyrics: str) -> str:
    """Return a lowercase, whitespace-normalized copy of raw lyrics.

    The following are removed, in order: parenthesized text (including
    nested groups), square-bracket tags such as ``[Couplet]`` together with
    the whitespace around them, every character that is not a word
    character, a space, an apostrophe or a hyphen, and every token that
    contains a digit. Runs of whitespace are then collapsed to one space.

    Args:
        lyrics: The raw lyrics text.

    Returns:
        The cleaned lyrics, stripped of surrounding whitespace and lowercased.
    """
    # The typographic apostrophe (U+2019) is not part of the allowed
    # character set below; without this mapping an elided word would be
    # split in two at the apostrophe instead of keeping it.
    lyrics = lyrics.replace("\u2019", "'")

    # The pattern only matches innermost groups, so repeat until nothing
    # changes; a single pass would leave the outer text of "(a (b) c)".
    previous = None
    while previous != lyrics:
        previous = lyrics
        lyrics = re.sub(r"\([^()]*\)", " ", lyrics)

    # Bracket tags; `.` does not match newlines, so a tag must sit on one line.
    lyrics = re.sub(r"\s*\[(.*?)\]\s*", " ", lyrics)
    # `\w` already covers letters, digits and underscore; newlines and
    # punctuation fall outside the set and become spaces.
    lyrics = re.sub(r"[^\w '-]", " ", lyrics)
    # Drop any token that contains at least one digit.
    lyrics = re.sub(r"\w*\d\w*", "", lyrics)
    lyrics = re.sub(r"\s{2,}", " ", lyrics)

    return lyrics.strip().lower()

def clean_lyrics_keywords(lyrics: str):
    """Reduce lyrics to bare keywords for comparison against a dictionary.

    Starts from the output of ``clean_lyrics`` and then applies a fixed,
    ordered series of substitutions, each replacing its match with a single
    space.

    Args:
        lyrics: The raw lyrics text.

    Returns:
        The keyword-oriented lyrics, stripped and lowercased.
    """
    # Order matters: each pattern runs on the result of the previous one.
    ordered_patterns = (
        # Leading French elisions (qu', j', l', ...) at the start of a token.
        r"(^|\s)(qu'|j'|l'|t'|c'|t'|d'|s'|n'|y'|m')*",
        # Apostrophes left dangling at the end, then at the start, of a token.
        r"'(\s|$)",
        r"(\s|^)'",
        # Hyphens left dangling at the end, then at the start, of a token.
        r"-(\s|$)",
        r"(\s|^)-",
        # Runs of space-separated single-character tokens.
        r"(^| ).(( ).)*( |$)",
        # Collapse the whitespace introduced by the replacements above.
        r"\s{2,}",
    )

    keywords = clean_lyrics(lyrics)
    for pattern in ordered_patterns:
        keywords = re.sub(pattern, " ", keywords, flags=re.M | re.I)

    keywords = keywords.strip()
    return keywords.lower()

# Location of the raw songs file and of the cleaned output file.
input_path = "./datas/songs.json"
output_path = "./datas/post_clean_songs.json"

# Read the whole input first so the file is closed before any processing.
with open(input_path, "r", encoding="utf8") as file:
    songs_inputs: List[dict] = json.load(file)

songs = pd.DataFrame(songs_inputs)

# Both derived columns are computed from the untouched original lyrics.
songs["lyrics"] = songs["original_lyrics"].apply(clean_lyrics)
songs["lyrics_keywords"] = songs["original_lyrics"].apply(clean_lyrics_keywords)

# force_ascii=False keeps accented characters readable in the output file.
songs_json = songs.to_json(force_ascii=False, orient="records")

# The context manager closes the file; no explicit close() is needed.
with open(output_path, "w", encoding="utf8") as file:
    file.write(songs_json)

print("Output written at", output_path)

Output written at ./datas/post_clean_songs.json


# RENOMMEZ `./datas/post_clean_songs.json` EN `./datas/songs.json` POUR LES PROCHAINS NOTEBOOKS