In [49]:
import requests
import urllib.parse
from bs4 import BeautifulSoup
import re
import spacy
import json
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import string
import pathlib
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

nlp = spacy.load("en_core_web_sm")
    
def occurences(url, word):
    response = requests.get(url)
    if response.status_code != 200:
        return ("erreur")

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    full_text = ' '.join(p.get_text() for p in paragraphs)

    occurences = 0
    for match in re.finditer(rf'\b{re.escape(word)}\b', full_text, re.IGNORECASE):
        occurences += 1
        print(f"{occurences} starts at: {match.start()} ends at: {match.end()}")

    return occurences



def theme(url, word):
    response = requests.get(url)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = soup.find_all('p')
    full_text = ' '.join(p.get_text() for p in paragraphs)

    themes = []
    occurences = 0

    for match in re.finditer(rf'\b{re.escape(word)}\b', full_text, re.IGNORECASE):
        occurences += 1
        x = match.start()
        y = match.end()
        
        # On selectionne quelques mots qui entourent le mot-clé 
        sentence = full_text[max(0, x - 70): y]

        # On supprime les références wikipedia eventuellement contenue
        sentence = re.sub(r'\[\d+\]', '', sentence)  

        print("> Phrase séléctionnée: ", sentence.strip())

        # On selectionne uniquement le fragment de mots contenus autour de notre mot-clé et 
        # entre 2 signes de ponctuations si presents
        fragments = re.split(r'[.?!;,:\n]', sentence)
        fragment = next((frag.strip() for frag in fragments if word.lower() in frag.lower()), None)
        print("> Fragment: ",fragment)

        # On utilise spaCy pour supprimer les "stopwords" 
        doc = nlp(fragment)
        without_stopwords = [token.text for token in doc if not token.is_stop]
        sentence = ' '.join(without_stopwords)
        print("> Fragment sans stopwords:", sentence)

        # On extrait le theme, en utilisant spaCy
        for mot in doc:
            if mot.text.lower() == word.lower():
                mots_lies = []
                i = mot.i - 1
                while i >= 0:
                    prev = doc[i]
                    if prev.dep_ in ("amod", "compound") and prev.pos_ in ("ADJ", "NOUN"):
                        mots_lies.insert(0, prev.text)
                        i -= 1
                    else:
                        break

                theme = ' '.join(w.lower() for w in mots_lies + [mot.text])
                print("> Theme: ", theme)
                themes.append(theme)
                break  

    return themes



def wikidata_to_wikipedia(wikidata_url, lang='en'):
    entity_id = wikidata_url.strip().split('/')[-1]
    api_url = 'https://www.wikidata.org/w/api.php'
    params = {'action': 'wbgetentities', 'ids': entity_id, 'format': 'json', 'props': 'sitelinks'}

    response = requests.get(api_url, params=params)
    data = response.json()

    sitelinks = data['entities'].get(entity_id, {}).get('sitelinks', {})
    wiki_key = f'{lang}wiki'
    if wiki_key in sitelinks:
        title = sitelinks[wiki_key].get('title')
        if title:
            encoded_title = urllib.parse.quote(title.replace(' ', '_'))
            return f'https://{lang}.wikipedia.org/wiki/{encoded_title}'
    return f"Non trouvee."


#theme("https://en.wikipedia.org/wiki/Rapha%C3%ABl_Coleman","activist")
#wikidata_to_wikipedia("http://www.wikidata.org/entity/Q240573", lang='en')

# G1 = nx.Graph()
# artistes = json.loads(pathlib.Path("artistes.json").read_text())
with open("artistes.json", "r", encoding="utf-8") as f:artistes = json.load(f)
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     wikidata_url = artiste.get("person")
#     wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#     if wikipedia_url.startswith("http"):
#         themes_extraits = theme(wikipedia_url, "activist")
#         if themes_extraits:
#             G1.add_node(nom, bipartite=0)
#             for t in themes_extraits:
#                 G1.add_node(t, bipartite=1)
#                 G1.add_edge(nom, t)


# left_nodes = {n for n, d in G1.nodes(data=True) if d.get("bipartite") == 0}
# right_nodes = {n for n, d in G1.nodes(data=True) if d.get("bipartite") == 1}
# pos = nx.bipartite_layout(G1, left_nodes)
# plt.figure(figsize=(14, 10))
# nx.draw_networkx(
#     G1, pos,
#     node_color=["skyblue" if n in left_nodes else "lightgreen" for n in G1.nodes()],
#     node_size=1200,
#     font_size=9,
#     with_labels=True,
#     edge_color="gray"
# )
# plt.show()

# for theme in right_nodes:
#     connected_artists = list(G1.neighbors(theme))
#     print(f"Le thème '{theme}' est relié à {len(connected_artists)} artiste(s)")


communautes = {"climate" :["climate activist", "environmental activist", "climate change activist", "anti - nuclear activist"],
               "animal rights":["animal rights activist", "vegan activist"],
                "human rights" : ["human rights activist", "civil rights activist"],
                "women & lgbt rights" : ["feminist activist", "gay activist", "lesbian activist", "transgender activist", "sexual purity activist"],
                "political" : ["political activist"],
                "social":["social activist", "social media activist", "yellow vest activist"],
                "health" : ["cancer research activist"]}

# G2 = nx.Graph()
# artiste_to_themes = {}
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     wikidata_url = artiste.get("person")
#     wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#     if wikipedia_url.startswith("http"):
#         themes_extraits = theme(wikipedia_url, "activist")
#         if themes_extraits:
#             artiste_ajoute = False
#             for t in themes_extraits:
#                 for communaute, theme2 in communautes.items():
#                     for i in range (len(theme2)):    
#                         if theme2[i] in t: 
#                             if artiste_ajoute == False:
#                                 G2.add_node(nom, bipartite=0) 
#                                 artiste_ajoute = True
#                                 G2.add_node(communaute, bipartite=1)
#                                 G2.add_edge(nom, communaute)
#                                 artiste_to_themes.setdefault(nom, set()).add(communaute)
#                                 break
#                             else:
#                                 G2.add_node(communaute, bipartite=1)
#                                 G2.add_edge(nom, communaute)
#                                 artiste_to_themes.setdefault(nom, set()).add(communaute)
#                                 break


# left_nodes = {n for n, d in G2.nodes(data=True) if d.get("bipartite") == 0}
# right_nodes = {n for n, d in G2.nodes(data=True) if d.get("bipartite") == 1}
# pos = nx.bipartite_layout(G2, left_nodes, align='vertical', scale=5)
# plt.figure(figsize=(14, 10))
# nx.draw_networkx(
#     G2, pos,
#     node_color=["skyblue" if n in left_nodes else "lightgreen" for n in G2.nodes()],
#     node_size=800,
#     font_size=7,
#     with_labels=True,
#     edge_color="gray"
# )
# plt.show()

# with open("chanteurs.json", "r", encoding="utf-8") as f:chanteurs = json.load(f)
# chanteurs_activists = {}
# for personne in artiste_to_themes:
#     for chanteur in chanteurs:
#         if personne == chanteur.get("personLabel"):
#             chanteurs_activists[personne] = list(artiste_to_themes[personne])
# print (chanteurs_activists)

def related_words(word):
    related = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            related.add(lemma.name())
        for hyper in syn.hypernyms():
            for lemma in hyper.lemmas():
                related.add(lemma.name())
        for hypo in syn.hyponyms():
            for lemma in hypo.lemmas():
                related.add(lemma.name())
    return related

# On commence maintenant l'analyse des chansons 
import lyricsgenius
genius = lyricsgenius.Genius("Fu9F4P_Zft__NzCvberMcG773LOGTEU1bpqA9cdGErZYU41AWnwlmlr649k55FRD", timeout=15)
genius.remove_section_headers = True # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.excluded_terms = ["(Remix)", "(Live)", "(Edit)", "(Version)"] # Exclude songs with these words in their title

#pour obtenir les paroles d'une chanson
def paroles(chanson, chanteur):
    song = genius.search_song(chanson, chanteur)
    return song.lyrics

#pour obtenir les titres des chansons d'un artiste
def chansons(chanteur, max_chansons):
    artiste = genius.search_artist(chanteur, max_songs=max_chansons, sort="popularity")
    return [chanson.title for chanson in artiste.songs]

def texte_paroles(texte):
    texte = texte.lower()
    texte = texte.replace('-', '')
    texte = texte.translate(str.maketrans('', '', string.punctuation))
    return set(word_tokenize(texte))


from gensim.models import KeyedVectors

model_path = "GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True,limit=500000)

def nettoyer_mot(mot):
    mot = mot.replace("_", " ")          
    mot = mot.lower()                     
    mot = re.sub(r"[^\w\s]", "", mot)     
    return mot.strip()

def related_words2(keyword, seuil=0.3):
    try:
        similar = model.most_similar(keyword, topn=5000000)  
        mots = set([word for word, score in similar if score >= seuil])
        return set(nettoyer_mot(mot) for mot in mots)
    except KeyError:
        return set()
    

#attribue un score de similarité entre les paroles d'une chanson et un theme
def score_similarite(chanson, chanteur, theme):
    mots_lies = related_words2(theme)
    lyrics = paroles(chanson, chanteur)
    mots_lyrics = texte_paroles(lyrics)
    mots_communs = mots_lyrics.intersection(mots_lies)
    print(mots_communs)
    #return len(mots_communs) / max(1, len(mots_lyrics))
    if len(mots_communs) >= 20:
        return "A+"
    if 15 <= len(mots_communs) <= 20:
        return "A"
    if 10<=len(mots_communs) <= 15:
        return "B"
    if 5<= len(mots_communs) <= 10:
        return "C"
    if 0 < len (mots_communs)<= 5:
        return "D"
    else: 
        return "E"

chanteurs_themes = {'M.I.A.': ['politics'], 'KRS-One': ['human rights', 'animal rights'], 'Killer Mike': ['social'], 'Michael Cuccione': ['health'], 'L7a9d': ['human rights'], 'Saba Saba': ['social'], 'Rebecca Moore': ['animal rights'], 'Joaquin Phoenix': ['animal rights'], 'Wyclef Jean': ['politics'], 'Adé Bantu': ['social'], 'Tupac Shakur': ['politics', 'human rights'], 'Mai Khôi': ['politics'], 'Jacline Mouraud': ['social'], 'Madeleina Kay': ['politics'], 'Topher': ['politics'], 'Montana Tucker': ['social']}
# for chanteur, themes_associes in chanteurs_themes.items():
#      songs = chansons(chanteur, 5)  
#      for theme in themes_associes:
#          scores = []
#          for chanson in songs:
#              score_chanson = score_similarite(chanson, chanteur, theme)
#              scores.append(score_chanson)
#          print (chanteur, theme, scores)


mots = ["pollution", "forest", "recycling", "plastic", "energy", "biodiversity", "climate", "oil","deforestation", "carbon", "agriculture", "ocean", "sustainable", "windmill", "compost","glacier", "solar", "emission", "development", "hurricane", "car", "computer", "phone","pizza", "football", "music", "school", "mountain", "internet", "coffee"]

def test_seuil(mots, seuil, theme):
    try:
        similar = model.most_similar(theme, topn=5000000)  
        m = set([word for word, score in similar if score >= seuil])
        s = set(nettoyer_mot(mot) for mot in m)
    except KeyError:
        s = set()
    mots = set(mots)
    mots_communs = mots.intersection(s)
    print (mots_communs)
    return len (mots_communs)

# seuils = np.arange(0, 1.1, 0.1)
# longueurs = [test_seuil(mots, seuil, "environmental") for seuil in seuils]
# plt.plot(seuils, longueurs, marker='o')
# plt.title("Nombre de mots détectésen fonction du seuil")
# plt.xlabel("Seuil de similarité")
# plt.ylabel("Nombre de mots en commun")
# plt.grid(True)
# plt.show()

def nationalite(url):
    if not url or not url.startswith("http"):
        return "invalid url"
    response = requests.get(url)
    if response.status_code != 200:
        return "error"
    soup = BeautifulSoup(response.text, 'html.parser')
    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return "no infobox found"
    for row in infobox.find_all("tr"):
        header = row.find("th")
        if header and "nationality" in header.get_text(strip=True).lower():
            data = row.find("td")
            if data:
                return data.get_text(strip=True)
    for row in infobox.find_all("tr"):
        header = row.find("th")
        if header and "born" in header.get_text(strip=True).lower():
            data = row.find("td")
            if data:
                lines = data.get_text(separator="\n", strip=True).split("\n")
                if lines:
                    last_line = lines[-1].strip()
                    if ',' in last_line:
                        parts = [p.strip() for p in last_line.split(',')]
                        return parts[-1]
                    return last_line
    return "not found"

with open("chanteurs.json", "r", encoding="utf-8") as f:chanteurs = json.load(f)
noms_filtres = set(chanteurs_themes.keys())

# chanteurs_nationalite = {}

# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         wikidata_url = chanteur.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             pays = "invalid url"
#         else:
#             pays = nationalite(wikipedia_url)
#         chanteurs_nationalite[nom] = pays
# print(chanteurs_nationalite)

def genres(url):
    if not url or not url.startswith("http"):
        return "invalid url"
    response = requests.get(url)
    if response.status_code != 200:
        return "error"
    soup = BeautifulSoup(response.text, 'html.parser')
    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return "no infobox found"
    for row in infobox.find_all("tr"):
        header = row.find("th")
        if header and "genres" in header.get_text(strip=True).lower():
            data = row.find("td")
            if data:
                return data.get_text(separator=", ", strip=True)
    return "not found"

# chanteurs_genres = {}

# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         wikidata_url = chanteur.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             genre = "invalid url"
#         else:
#             genre = genres(wikipedia_url)
#         chanteurs_genres[nom] = genre

# chanteurs_genres2 = {}

# # Nettoyer la liste obtenue et ne laisser que les artistes qui possèdent des genres
# for artist, genres in chanteurs_genres.items():
#     if genres == "not found" or not genres:
#         continue  
#     cleaned = re.sub(r"[\[\]0-9/]", "", genres)
#     cleaned = re.sub(r",\s*,+", ",", cleaned)  
#     cleaned = re.sub(r"\s+", " ", cleaned) 
#     cleaned = [g.strip().lower() for g in cleaned.split(",") if g.strip()]
#     chanteurs_genres2[artist] = list(set(cleaned)) 
# print (chanteurs_genres2)

with open("femmes.json", "r", encoding="utf-8") as f:femmes = json.load(f)

# chanteurs_gender = {}

# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         if chanteur in femmes:
#             chanteurs_gender[nom] = "femme"
#         else:
#             chanteurs_gender[nom] = "homme"

def naissance(url):
    if not url or not url.startswith("http"):
        return "invalid url"
    response = requests.get(url)
    if response.status_code != 200:
        return "error"
    soup = BeautifulSoup(response.text, 'html.parser')
    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return "no infobox found"
    rows = infobox.find_all("tr")
    for row in rows:
        header = row.find("th")
        if header and "born" in header.text.lower():
            born_cell = row.find("td")
            if born_cell:
                text = born_cell.get_text(" ", strip=True)
                match = re.search(r"\b(19[6-9][0-9]|20[0-9]{2})\b", text)
                if match:
                    return int(match.group(1))
                else:
                    return "no valid year found"
    return "no 'born' field found"
    
# chanteurs_naissance = {}
# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         wikidata_url = chanteur.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             annee = "invalid url"
#         else:
#             annee = naissance(wikipedia_url)
#         chanteurs_naissance[nom] = annee
# print (chanteurs_naissance)

chanteurs_gender2 = {'KRS-One': 'homme', 'Tupac Shakur': 'homme', 'Adé Bantu': 'homme', 'Joaquin Phoenix': 'homme', 'Wyclef Jean': 'homme', 'Killer Mike': 'homme', 'M.I.A.': 'femme', 'Saba Saba': 'homme', 'Mai Khôi': 'femme', 'Michael Cuccione': 'homme', 'L7a9d': 'homme', 'Rebecca Moore': 'femme', 'Madeleina Kay': 'femme', 'Jacline Mouraud': 'femme', 'Topher': 'homme', 'Montana Tucker': 'femme'}    
chanteurs_nationalite2 =  {'KRS-One': 'U.S.', 'Tupac Shakur': 'U.S.', 'Adé Bantu': 'England', 'Joaquin Phoenix': 'Puerto Rico', 'Wyclef Jean': 'Haiti', 'Killer Mike': 'U.S.', 'M.I.A.': 'England', 'Saba Saba': 'Uganda', 'Mai Khôi': 'Vietnam', 'Michael Cuccione': 'Canada', 'L7a9d': 'Morocco', 'Rebecca Moore': 'U.S.', 'Madeleina Kay': 'England', 'Jacline Mouraud': 'France', 'Topher': 'U.S.', 'Montana Tucker': 'U.S.'}
chanteurs_genres3 = {'KRS-One': ['conscious rap', 'hardcore hip-hop', 'political hip-hop', 'east coast hip-hop'], 'Tupac Shakur': ['political hip-hop', 'west coast hip-hop', 'gangsta rap'], 'Adé Bantu': ['hip-hop', 'afrofunk', 'fuji', 'afrobeat'], 'Wyclef Jean': ['east coast hip hop', 'neo soul', 'r&b', 'reggae fusion', 'pop rap'], 'Killer Mike': ['southern hip-hop', 'gospel'], 'M.I.A.': ['hip hop', 'alternative hip hop', 'world', 'dance', 'progressive rap', 'pop', 'electronica', 'experimental'], 'Saba Saba': ['hip hop', 'african hip hop'], 'Mai Khôi': ['pop'], 'L7a9d': ['rap'], 'Rebecca Moore': ['alternative', 'ambient', 'experimental'], 'Topher': ['hip hop']}
chanteurs_naissance2 = {'KRS-One': 1965, 'Tupac Shakur': 1971, 'Adé Bantu': 1971, 'Joaquin Phoenix': 1974, 'Wyclef Jean': 1969, 'Killer Mike': 1975, 'M.I.A.': 1975, 'Saba Saba': 1977, 'Mai Khôi': 1983, 'Michael Cuccione': 1985, 'L7a9d': 1988, 'Rebecca Moore': 1968, 'Madeleina Kay': 1994, 'Jacline Mouraud': 1967, 'Topher': 1991, 'Montana Tucker': 1993}

chanteurs = chanteurs_gender2.keys()
print (chanteurs)
dictionnaire = [
    {'nationalite': chanteurs_nationalite2.get(chanteur), 'gender': chanteurs_gender2.get(chanteur), 'genres':chanteurs_genres3.get(chanteur, []), 'naissance':chanteurs_naissance2.get(chanteur)/1965}
    for chanteur in chanteurs
]
print (dictionnaire)

vec= DictVectorizer()
X = vec.fit_transform(dictionnaire).toarray()

for k in range(2,len(X[0])):
    print("number of clusters: "+ str(k))
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    print(silhouette_score(X, kmeans.labels_))
    print(kmeans.labels_)



dict_keys(['KRS-One', 'Tupac Shakur', 'Adé Bantu', 'Joaquin Phoenix', 'Wyclef Jean', 'Killer Mike', 'M.I.A.', 'Saba Saba', 'Mai Khôi', 'Michael Cuccione', 'L7a9d', 'Rebecca Moore', 'Madeleina Kay', 'Jacline Mouraud', 'Topher', 'Montana Tucker'])
[{'nationalite': 'U.S.', 'gender': 'homme', 'genres': ['conscious rap', 'hardcore hip-hop', 'political hip-hop', 'east coast hip-hop'], 'naissance': 1.0}, {'nationalite': 'U.S.', 'gender': 'homme', 'genres': ['political hip-hop', 'west coast hip-hop', 'gangsta rap'], 'naissance': 1.0030534351145037}, {'nationalite': 'England', 'gender': 'homme', 'genres': ['hip-hop', 'afrofunk', 'fuji', 'afrobeat'], 'naissance': 1.0030534351145037}, {'nationalite': 'Puerto Rico', 'gender': 'homme', 'genres': [], 'naissance': 1.0045801526717557}, {'nationalite': 'Haiti', 'gender': 'homme', 'genres': ['east coast hip hop', 'neo soul', 'r&b', 'reggae fusion', 'pop rap'], 'naissance': 1.0020356234096692}, {'nationalite': 'U.S.', 'gender': 'homme', 'genres': ['south

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



0.14386220038723613
[1 1 1 1 1 1 0 1 0 1 1 0 0 0 1 0]
number of clusters: 3
0.11124885856181935
[2 2 1 1 1 2 0 1 0 1 1 0 0 0 1 0]
number of clusters: 4
0.09201634650459678
[2 2 1 1 1 2 0 1 0 1 1 3 3 3 1 3]
number of clusters: 5
0.06164298999368524
[2 2 1 1 1 2 4 1 0 1 1 3 3 3 1 3]
number of clusters: 6
0.03983702428772151
[5 2 1 1 1 2 4 1 0 1 1 3 3 3 1 3]
number of clusters: 7
0.019427579947971195
[5 6 1 1 1 2 4 1 0 1 1 3 3 3 1 3]
number of clusters: 8
0.052034709036519876
[4 5 7 1 6 5 3 5 0 1 1 2 2 2 5 2]
number of clusters: 9
0.06589590380116987
[4 5 7 1 6 5 3 5 0 1 1 8 2 2 5 2]
number of clusters: 10
0.07115003152449487
[4 9 7 1 6 5 3 5 0 1 1 8 2 2 5 2]
number of clusters: 11
0.07583469931876811
[ 4  9  7  1  6  5  3 10  0  1  1  8  2  2  5  2]
number of clusters: 12
0.06571352615233855
[ 4  9  7  1  6  5  3 10  0  1 11  8  2  2  5  2]
number of clusters: 13
0.057340664269886946
[ 4  9  7  1  6 12  3 10  0  1 11  8  2  2  5  2]
number of clusters: 14
0.022943256408860968
[ 4  9  7  

ValueError: Number of labels is 16. Valid values are 2 to n_samples - 1 (inclusive)