# Clean des sons après récupération

## Paquets utilisés

In [12]:
import json
import re
import pandas as pd
from typing import List

In [13]:
# Source file loaded with pd.read_json in the loading cell (song records).
input_path = "./datas/api_songs.json"
# Destination file: the cleaned songs are written here as JSON by the last code cell.
output_path = "./datas/clean_songs.json"

In [14]:
# Load the song records; `input_path` is defined in the configuration cell.
songs_raw = pd.read_json(input_path)

# One requested song per line, lower-cased so the comparison below is
# case-insensitive. The `with` block closes the file on exit, so no explicit
# close() call is needed.
with open("./datas/songs.txt", "r", encoding="utf8") as file:
    inputs = pd.Series(file.read().lower().splitlines())

# Each `artist` cell is a dict (see the rendered output); `.str.get` reads its
# "name" entry directly instead of expanding every dict into a DataFrame.
artist_names = songs_raw["artist"].str.get("name")

# Keep only rows whose "artist - title" key was requested, drop repeated hits
# of the same song id, and take a copy so later cells can add columns without
# pandas chained-assignment warnings. Only `query` is kept as a new column.
songs = (
    songs_raw
    .assign(query=artist_names.str.lower() + " - " + songs_raw["name"].str.lower())
    .loc[lambda frame: frame["query"].isin(inputs)]
    .drop_duplicates(subset=["id"])
    .copy()
)

print(f"With {len(inputs)} songs, Genius provided {len(songs)} correct songs")
# Guard the ratio: an empty songs.txt reports 0% instead of raising ZeroDivisionError.
kept_percent = len(songs) / len(inputs) * 100 if len(inputs) else 0.0
print(f"We keep {kept_percent:.2f}% of the songs")

songs

With 1830 songs, Genius provided 1268 correct songs
We keep 69.29% of the songs


Unnamed: 0,id,name,album,artist,image,url,original_lyrics,date,query
0,6408722,F*cked Up 4,"{'name': 'MYSTR J.O.$', 'id': 726527}","{'name': 'Josman', 'id': 153477, 'url': 'https...",https://images.genius.com/5d1737204d5b010efc28...,https://genius.com/Josman-fcked-up-4-lyrics,"[Paroles de ""F*cked Up 4""] [Intro] J.O.S J.O.S...",2021-01-28T00:00:00,josman - f*cked up 4
1,6408724,Doré,"{'name': 'MYSTR J.O.$', 'id': 726527}","{'name': 'Josman', 'id': 153477, 'url': 'https...",https://images.genius.com/b6488c411877313d186d...,https://genius.com/Josman-dore-lyrics,"[Paroles de ""Doré""] [Refrain] Le ciel est doré...",2021-01-29T00:00:00,josman - doré
2,6408725,Décisions,"{'name': 'MYSTR J.O.$', 'id': 726527}","{'name': 'Josman', 'id': 153477, 'url': 'https...",https://images.genius.com/b6488c411877313d186d...,https://genius.com/Josman-decisions-lyrics,"[Paroles de ""Décisions""] [Couplet 1] J'peux pa...",2021-01-29T00:00:00,josman - décisions
3,6408726,SEC,"{'name': 'MYSTR J.O.$', 'id': 726527}","{'name': 'Josman', 'id': 153477, 'url': 'https...",https://images.genius.com/b6488c411877313d186d...,https://genius.com/Josman-sec-lyrics,"[Paroles de ""SEC""] [Intro] Sec comme le miel, ...",2021-01-29T00:00:00,josman - sec
4,6408727,New Hares (Same Sh!t),"{'name': 'MYSTR J.O.$', 'id': 726527}","{'name': 'Josman', 'id': 153477, 'url': 'https...",https://images.genius.com/b6488c411877313d186d...,https://genius.com/Josman-new-hares-same-sh-t-...,"[Paroles de ""New Hares (Same Sh!t)""] [Couplet ...",2021-01-29T00:00:00,josman - new hares (same sh!t)
...,...,...,...,...,...,...,...,...,...
1813,6708599,Follow,"{'name': 'Mektoub', 'id': 760403}","{'name': 'Di-Meh', 'id': 60891, 'url': 'https:...",https://images.genius.com/691df6a4289d4b326502...,https://genius.com/Di-meh-follow-lyrics,"[Paroles de ""Follow""] [Refrain] Ouais j'y vais...",2021-05-14T00:00:00,di-meh - follow
1815,6708601,Turn Up,"{'name': 'Mektoub', 'id': 760403}","{'name': 'Di-Meh', 'id': 60891, 'url': 'https:...",https://images.genius.com/691df6a4289d4b326502...,https://genius.com/Di-meh-turn-up-lyrics,"[Paroles de ""Turn Up""] [Intro] Hmm Yeah Hmm [R...",2021-05-14T00:00:00,di-meh - turn up
1816,6708602,Gâtée,"{'name': 'Mektoub', 'id': 760403}","{'name': 'Di-Meh', 'id': 60891, 'url': 'https:...",https://images.genius.com/691df6a4289d4b326502...,https://genius.com/Di-meh-gatee-lyrics,"[Paroles de ""Gâtée""] [Refrain] Dis-moi pourquo...",2021-05-14T00:00:00,di-meh - gâtée
1817,6708603,Week-end,"{'name': 'Mektoub', 'id': 760403}","{'name': 'Di-Meh', 'id': 60891, 'url': 'https:...",https://images.genius.com/691df6a4289d4b326502...,https://genius.com/Di-meh-week-end-lyrics,"[Paroles de ""Week-end"" ft. Klench Poko] RETRAN...",2021-05-14T00:00:00,di-meh - week-end


## Création de plusieurs champs dérivés des paroles originales

### `original_lyrics`
Les paroles originales, celles récupérées sur le site [genius.com](https://genius.com)

### `lyrics`
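Les paroles nettoyées et mises en minuscules, sans : les balises entre crochets (`[Couplet]`), les passages entre parenthèses, la ponctuation, les mots contenant des chiffres (`mot94`, `34`) et les espaces multiples (`  `)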

### `lyrics_keywords`
Les paroles destinées à être comparées avec un dictionnaire ; elles sont normalisées à leur forme la plus simple

In [15]:
def clean_lyrics(lyrics: str) -> str:
    """Return a lower-cased, plain-text version of raw lyrics.

    Processing steps, in order:
      1. Typographic apostrophes (U+2018 / U+2019) become ASCII ``'`` so that
         elisions such as ``j’ai`` survive the punctuation filter below.
      2. Parenthesised passages are removed, innermost first, until none is
         left (handles nested parentheses).
      3. Bracketed tags such as ``[Couplet 1]`` are removed.
      4. Every character that is not a word character, a space, ``'`` or ``-``
         is replaced by a space.
      5. Tokens containing at least one digit are removed.
      6. Whitespace runs are collapsed, the result is stripped and lower-cased.

    Args:
        lyrics: Raw lyrics text.

    Returns:
        The cleaned lyrics as a single lower-case string.
    """
    # Without this, step 4 would turn "j’ai" into "j ai".
    lyrics = re.sub(r"[\u2018\u2019]", "'", lyrics)

    # The pattern only matches parentheses with no inner parentheses, so it is
    # re-applied until nothing is left to strip the enclosing levels too.
    while True:
        lyrics, removed = re.subn(r"\([^()]*\)", " ", lyrics)
        if not removed:
            break

    lyrics = re.sub(r"\s*\[(.*?)\]\s*", " ", lyrics)
    # `\w` already covers letters (including accented ones), digits and `_`.
    lyrics = re.sub(r"[^\w '-]", " ", lyrics)
    lyrics = re.sub(r"\w*\d\w*", "", lyrics)
    lyrics = re.sub(r"\s{2,}", " ", lyrics)

    return lyrics.strip().lower()

def clean_lyrics_keywords(lyrics: str):
    """Reduce raw lyrics to bare keywords for dictionary comparison.

    The text first goes through ``clean_lyrics``; the substitutions below are
    then applied in order (each one replaces its match with a single space):

      * elided prefixes at the start of a token (``qu'``, ``j'``, ``l'``, ...),
      * apostrophes left dangling at the end or start of a token,
      * hyphens left dangling at the end or start of a token,
      * runs of single-character tokens,
      * repeated whitespace.

    The result is stripped and lower-cased before being returned.
    """
    regex_flags = re.M | re.I
    # Order matters: later patterns rely on what earlier ones left behind.
    ordered_patterns = (
        r"(^|\s)(qu'|j'|l'|t'|c'|t'|d'|s'|n'|y'|m')*",
        r"'(\s|$)",
        r"(\s|^)'",
        r"-(\s|$)",
        r"(\s|^)-",
        r"(^| ).(( ).)*( |$)",
        r"\s{2,}",
    )

    text = clean_lyrics(lyrics)
    for pattern in ordered_patterns:
        text = re.sub(pattern, " ", text, flags=regex_flags)

    text = text.strip()
    return text.lower()

# Derive both cleaned variants from the untouched original lyrics. `assign`
# returns a new frame, so the cell never writes into a filtered view of the
# data and can be re-run safely.
songs = songs.assign(
    lyrics=songs["original_lyrics"].apply(clean_lyrics),
    lyrics_keywords=songs["original_lyrics"].apply(clean_lyrics_keywords),
)

# One JSON object per song; force_ascii=False keeps accented characters as-is.
songs_json = songs.to_json(force_ascii=False, orient="records")

# The context manager closes the file on exit, so no explicit close() is needed.
with open(output_path, "w", encoding="utf8") as file:
    file.write(songs_json)

print("Output written at", output_path)

Output written at ./datas/clean_songs.json


# RENOMMEZ `./datas/clean_songs.json` EN `./datas/songs.json` POUR LES PROCHAINS NOTEBOOKS