# Nettoyage des chansons après récupération

## Paquets utilisés

In [33]:
import json
import re
import pandas as pd
from typing import List

In [34]:
# JSON file read at the start of the notebook (loaded with pd.read_json).
input_path: str = "./datas/api_songs.json"
# JSON file written at the end of the notebook with the cleaned songs.
output_path: str = "./datas/songs.json"

In [35]:
# Raw results obtained from Genius, loaded from `input_path`.
songs_raw = pd.read_json(input_path)

# Requested songs: one lowercased query per line of the text file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("./datas/songs.txt", "r", encoding="utf8") as file:
    inputs = pd.Series(file.read().lower().splitlines())

# Rebuild a lowercase "artist - title" query for every result so it can be
# compared with the requested queries. `.str.get` reads the "name" key of each
# artist dict directly instead of expanding every dict into a DataFrame.
artist_names = songs_raw["artist"].str.get("name")
songs_with_query = songs_raw.assign(
    query=artist_names.str.lower() + " - " + songs_raw["name"].str.lower()
)

# Keep only the results whose query was actually requested. The explicit copy
# makes `songs` an independent frame, so adding columns to it later does not
# trigger a SettingWithCopyWarning.
songs = songs_with_query[songs_with_query["query"].isin(inputs)].copy()

# Guard against an empty input file to avoid a ZeroDivisionError.
kept_percent = len(songs) / len(inputs) * 100 if len(inputs) else 0.0

print(f"With {len(inputs)} songs, Genius provided {len(songs)} correct songs")
print(f"We keep {kept_percent:.2f}% of the songs")

# Spot check: display the row of one known song.
songs[songs["query"] == "jul - la pharmacie"]

With 1830 songs, Genius provided 1307 correct songs
We keep 71.42% of the songs


Unnamed: 0,id,name,album,artist,image,url,original_lyrics,date,query
688,6258610,La pharmacie,"{'name': 'Loin du monde', 'id': 698521}","{'name': 'JuL', 'id': 74283, 'url': 'https://g...",https://images.genius.com/e62f9ec698e11fdfc79c...,https://genius.com/Jul-la-pharmacie-lyrics,"[Paroles de ""La pharmacie""] [Intro] Que tu che...",2020-12-18T00:00:00,jul - la pharmacie


In [36]:
# Spot check: the same song must also be present among the requested queries.
inputs.loc[inputs.eq("jul - la pharmacie")]

700    jul - la pharmacie
dtype: object

## Création de plusieurs champs dérivés des paroles originales

### `original_lyrics`
Les paroles originales, celles récupérées sur le site [genius.com](https://genius.com)

### `lyrics`
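Les paroles nettoyées, en minuscules, sans : les balises de section (`[Couplet]`), le texte entre parenthèses, les mots contenant des chiffres (`mot94`, `34`), la ponctuation et les espaces multiples (`  `)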

### `lyrics_keywords`
Les paroles utilisées pour être comparées avec un dictionnaire ; elles sont normalisées à leur forme la plus simple (élisions comme `j'`, `l'`, `qu'` et mots d'une seule lettre supprimés)

In [37]:
def clean_lyrics(lyrics: str) -> str:
    """Normalise raw lyrics into lowercase plain text.

    Steps, in order:
      1. typographic apostrophes are converted to the ASCII apostrophe, so
         elided forms such as "j’ai" stay a single token ("j'ai") instead of
         being split into "j ai";
      2. innermost parenthesised passages are removed;
      3. bracketed tags (e.g. "[Couplet]") and surrounding whitespace are
         removed;
      4. every character that is not a word character, a space, an apostrophe
         or a hyphen becomes a space (this includes newlines);
      5. tokens containing at least one digit are dropped;
      6. runs of whitespace are collapsed, then the text is stripped and
         lowercased.

    Args:
        lyrics: Raw lyrics text. Any non-string value (e.g. None or NaN for a
            song without lyrics) is treated as empty.

    Returns:
        The cleaned lyrics, or an empty string when there is nothing to clean.
    """
    if not isinstance(lyrics, str):
        # Missing lyrics would otherwise make re.sub raise a TypeError.
        return ""

    # Curly / modifier apostrophes -> ASCII apostrophe, before punctuation
    # stripping would turn them into spaces.
    lyrics = re.sub(r"[\u2018\u2019\u02bc]", "'", lyrics)
    lyrics = re.sub(r"\([^()]*\)", " ", lyrics)
    lyrics = re.sub(r"\s*\[(.*?)\]\s*", " ", lyrics)
    # `\w` already covers letters and digits; the trailing hyphen is literal.
    lyrics = re.sub(r"[^\w '-]", " ", lyrics)
    lyrics = re.sub(r"\w*\d\w*", "", lyrics)
    lyrics = re.sub(r"\s{2,}", " ", lyrics)

    return lyrics.strip().lower()

def clean_lyrics_keywords(lyrics: str):
    """Reduce raw lyrics to bare keywords.

    The text first goes through `clean_lyrics`; a fixed sequence of regex
    substitutions is then applied, each match being replaced by one space.
    Finally the result is stripped and lowercased.

    Args:
        lyrics: Raw lyrics text.

    Returns:
        The keyword-oriented version of the lyrics.
    """
    regex_flags = re.M | re.I

    # Order matters: each pattern runs on the output of the previous one.
    substitutions = (
        # whitespace (or line start) followed by any run of elided forms
        r"(^|\s)(qu'|j'|l'|t'|c'|t'|d'|s'|n'|y'|m')*",
        # apostrophe at the end of a token
        r"'(\s|$)",
        # apostrophe at the start of a token
        r"(\s|^)'",
        # hyphen at the end of a token
        r"-(\s|$)",
        # hyphen at the start of a token
        r"(\s|^)-",
        # one or more consecutive single-character tokens
        r"(^| ).(( ).)*( |$)",
        # collapse the whitespace runs left by the substitutions above
        r"\s{2,}",
    )

    text = clean_lyrics(lyrics)
    for pattern in substitutions:
        text = re.sub(pattern, " ", text, flags=regex_flags)

    return text.strip().lower()

# Both derived columns are computed from the untouched original lyrics.
songs["lyrics"] = songs["original_lyrics"].apply(clean_lyrics)
songs["lyrics_keywords"] = songs["original_lyrics"].apply(clean_lyrics_keywords)

# force_ascii=False keeps accented characters as-is instead of \u escapes.
songs_json = songs.to_json(force_ascii=False, orient="records")

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(output_path, "w", encoding="utf8") as file:
    file.write(songs_json)

print("Output written at", output_path)

Output written at ./datas/songs.json


# NOTE : CE NOTEBOOK ÉCRIT DIRECTEMENT `./datas/songs.json` ; SI VOUS AVEZ ENCORE UN ANCIEN `./datas/clean_songs.json`, RENOMMEZ-LE EN `./datas/songs.json` POUR LES PROCHAINS NOTEBOOKS