# In [None]:  (notebook cell marker left over from the export)
import requests
import urllib.parse
from bs4 import BeautifulSoup
import re
import spacy
import json
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import string
import pathlib
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
from itertools import combinations
from collections import Counter


# English spaCy pipeline, loaded once at import time; theme() uses it for
# tokenisation, stop-word flags, part-of-speech tags and dependency labels.
nlp = spacy.load("en_core_web_sm")
    
def occurences(url, word):
    """Count whole-word, case-insensitive occurrences of ``word`` on a page.

    Only the text of the page's ``<p>`` elements is searched. The position
    of every match is printed as it is found.

    Args:
        url: Address of the page to download.
        word: Word to look for; it is regex-escaped before matching.

    Returns:
        The number of matches as an int, or the string "erreur" when the
        HTTP status is not 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return "erreur"

    page = BeautifulSoup(response.text, 'html.parser')
    full_text = ' '.join(par.get_text() for par in page.find_all('p'))
    pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)

    total = 0
    for total, match in enumerate(pattern.finditer(full_text), start=1):
        print(f"{total} starts at: {match.start()} ends at: {match.end()}")

    return total



def theme(url, word):
    """Extract the qualifier phrase attached to each occurrence of ``word``.

    The page at ``url`` is downloaded, the text of its ``<p>`` elements is
    joined, and every whole-word, case-insensitive match of ``word`` is
    examined. For each match, the clause that ends on the keyword is parsed
    with spaCy and the run of adjectives / compound nouns immediately before
    the keyword is prepended to it (e.g. "climate activist").

    Args:
        url: Address of the page to analyse.
        word: Keyword to look for; it must be a single spaCy token to be
            recognised in the parsed clause.

    Returns:
        A list of lower-cased theme strings, one per match whose keyword
        token was found in the parsed clause. An empty list is returned when
        the HTTP status is not 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    full_text = ' '.join(p.get_text() for p in soup.find_all('p'))

    keyword = word.lower()
    themes = []

    for match in re.finditer(rf'\b{re.escape(word)}\b', full_text, re.IGNORECASE):
        # Keep a short window of text that ends exactly on the keyword.
        sentence = full_text[max(0, match.start() - 70): match.end()]

        # Drop Wikipedia reference markers such as "[12]".
        sentence = re.sub(r'\[\d+\]', '', sentence)

        print("> Phrase séléctionnée: ", sentence.strip())

        # Keep only the clause (text between punctuation marks) holding the
        # keyword. The window ends on the match, so the relevant clause is
        # the LAST one containing the keyword; an earlier clause may contain
        # it only as a substring (e.g. "activists").
        fragments = re.split(r'[.?!;,:\n]', sentence)
        fragment = next(
            (frag.strip() for frag in reversed(fragments) if keyword in frag.lower()),
            None,
        )
        print("> Fragment: ", fragment)
        if fragment is None:
            # Happens when the keyword itself contains a splitting
            # punctuation mark; there is nothing to parse for this match.
            continue

        # Diagnostic output: the clause with spaCy stop words removed.
        doc = nlp(fragment)
        without_stopwords = [token.text for token in doc if not token.is_stop]
        print("> Fragment sans stopwords:", ' '.join(without_stopwords))

        # Find the keyword token closest to the end of the clause, then walk
        # backwards collecting the adjectives / compound nouns before it.
        for mot in reversed(list(doc)):
            if mot.text.lower() != keyword:
                continue
            mots_lies = []
            i = mot.i - 1
            while (
                i >= 0
                and doc[i].dep_ in ("amod", "compound")
                and doc[i].pos_ in ("ADJ", "NOUN")
            ):
                mots_lies.insert(0, doc[i].text)
                i -= 1

            libelle = ' '.join(w.lower() for w in mots_lies + [mot.text])
            print("> Theme: ", libelle)
            themes.append(libelle)
            break

    return themes



def wikidata_to_wikipedia(wikidata_url, lang='en'):
    """Resolve a Wikidata entity URL to the matching Wikipedia article URL.

    Args:
        wikidata_url: Entity URL whose last path segment is the entity id
            (e.g. ".../entity/Q240573"). A trailing slash is tolerated.
        lang: Wikipedia language code used to pick the ``<lang>wiki``
            sitelink.

    Returns:
        The article URL, or the string "Non trouvee." when the entity has no
        sitelink for that language or the API response has no ``entities``
        section. Callers tell the two cases apart with
        ``startswith("http")``.
    """
    # rstrip('/') keeps a trailing slash from producing an empty entity id.
    entity_id = wikidata_url.strip().rstrip('/').split('/')[-1]
    api_url = 'https://www.wikidata.org/w/api.php'
    params = {'action': 'wbgetentities', 'ids': entity_id, 'format': 'json', 'props': 'sitelinks'}

    response = requests.get(api_url, params=params)
    data = response.json()

    # A payload without 'entities' (e.g. an API error) is treated as "not found"
    # instead of raising KeyError.
    sitelinks = data.get('entities', {}).get(entity_id, {}).get('sitelinks', {})
    title = sitelinks.get(f'{lang}wiki', {}).get('title')
    if title:
        encoded_title = urllib.parse.quote(title.replace(' ', '_'))
        return f'https://{lang}.wikipedia.org/wiki/{encoded_title}'
    return "Non trouvee."


#theme("https://en.wikipedia.org/wiki/Rapha%C3%ABl_Coleman","activist")
#wikidata_to_wikipedia("http://www.wikidata.org/entity/Q240573", lang='en')

# G1 = nx.Graph()
# Artist records read from artistes.json. The encoding is given explicitly,
# like the other JSON loads in this file, so that accented names do not
# depend on the platform's default encoding.
artistes = json.loads(pathlib.Path("artistes.json").read_text(encoding="utf-8"))
# with open("artistes.json", "r", encoding="utf-8") as f:artistes = json.load(f)
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     wikidata_url = artiste.get("person")
#     wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#     if wikipedia_url.startswith("http"):
#         themes_extraits = theme(wikipedia_url, "activist")
#         if themes_extraits:
#             G1.add_node(nom, bipartite=0)
#             for t in themes_extraits:
#                 G1.add_node(t, bipartite=1)
#                 G1.add_edge(nom, t)




# left_nodes = {n for n, d in G1.nodes(data=True) if d.get("bipartite") == 0}
# right_nodes = {n for n, d in G1.nodes(data=True) if d.get("bipartite") == 1}
# pos = nx.bipartite_layout(G1, left_nodes)
# plt.figure(figsize=(14, 10))
# nx.draw_networkx(
#     G1, pos,
#     node_color=["skyblue" if n in left_nodes else "lightgreen" for n in G1.nodes()],
#     node_size=1200,
#     font_size=9,
#     with_labels=True,
#     edge_color="gray"
# )
# plt.show()

# for theme in right_nodes:
#     connected_artists = list(G1.neighbors(theme))
#     print(f"Le thème '{theme}' est relié à {len(connected_artists)} artiste(s)")


# Theme community -> extracted theme phrases that belong to it.
communautes = {
    "climate": [
        "climate activist",
        "environmental activist",
        "climate change activist",
        "anti - nuclear activist",
    ],
    "animal rights": [
        "animal rights activist",
        "vegan activist",
    ],
    "human rights": [
        "human rights activist",
        "civil rights activist",
    ],
    "women & lgbt rights": [
        "feminist activist",
        "gay activist",
        "lesbian activist",
        "transgender activist",
        "sexual purity activist",
    ],
    "political": [
        "political activist",
    ],
    "social": [
        "social activist",
        "social media activist",
        "yellow vest activist",
    ],
    "health": [
        "cancer research activist",
    ],
}

# G2 = nx.Graph()
# artiste_to_themes = {}
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     wikidata_url = artiste.get("person")
#     wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#     if wikipedia_url.startswith("http"):
#         themes_extraits = theme(wikipedia_url, "activist")
#         if themes_extraits:
#             artiste_ajoute = False
#             for t in themes_extraits:
#                 for communaute, theme2 in communautes.items():
#                     for i in range (len(theme2)):    
#                         if theme2[i] in t: 
#                             if artiste_ajoute == False:
#                                 G2.add_node(nom, bipartite=0) 
#                                 artiste_ajoute = True
#                                 G2.add_node(communaute, bipartite=1)
#                                 G2.add_edge(nom, communaute)
#                                 artiste_to_themes.setdefault(nom, set()).add(communaute)
#                                 break
#                             else:
#                                 G2.add_node(communaute, bipartite=1)
#                                 G2.add_edge(nom, communaute)
#                                 artiste_to_themes.setdefault(nom, set()).add(communaute)
#                                 break


# left_nodes = {n for n, d in G2.nodes(data=True) if d.get("bipartite") == 0}
# right_nodes = {n for n, d in G2.nodes(data=True) if d.get("bipartite") == 1}
# pos = nx.bipartite_layout(G2, left_nodes, align='vertical', scale=5)
# plt.figure(figsize=(14, 10))
# nx.draw_networkx(
#     G2, pos,
#     node_color=["skyblue" if n in left_nodes else "lightgreen" for n in G2.nodes()],
#     node_size=800,
#     font_size=7,
#     with_labels=True,
#     edge_color="gray"
# )
# plt.show()

# with open("chanteurs.json", "r", encoding="utf-8") as f:chanteurs = json.load(f)
# chanteurs_activists = {}
# for personne in artiste_to_themes:
#     for chanteur in chanteurs:
#         if personne == chanteur.get("personLabel"):
#             chanteurs_activists[personne] = list(artiste_to_themes[personne])
# print (chanteurs_activists)

def related_words(word):
    """Collect WordNet lemma names related to ``word``.

    For every synset of ``word`` the result gathers the lemma names of the
    synset itself, of its direct hypernyms and of its direct hyponyms.

    Returns:
        A set of lemma names (empty when WordNet has no synset for ``word``).
    """
    related = set()
    for syn in wn.synsets(word):
        for sense in (syn, *syn.hypernyms(), *syn.hyponyms()):
            related.update(lemma.name() for lemma in sense.lemmas())
    return related

# Song analysis starts here.
import os

import lyricsgenius

# SECURITY: the Genius access token used to be hard-coded on this line. It is
# now read from the GENIUS_ACCESS_TOKEN environment variable; the token that
# was committed should be revoked. A missing variable fails fast with KeyError.
genius = lyricsgenius.Genius(os.environ["GENIUS_ACCESS_TOKEN"], timeout=15)
genius.remove_section_headers = True  # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.excluded_terms = ["(Remix)", "(Live)", "(Edit)", "(Version)"]  # Exclude songs with these words in their title

def paroles(chanson, chanteur):
    """Return the lyrics of a song looked up on Genius.

    Args:
        chanson: Song title.
        chanteur: Artist name.

    Returns:
        The lyrics text, or an empty string when the search finds no song
        (previously this case raised AttributeError on ``None.lyrics``).
    """
    song = genius.search_song(chanson, chanteur)
    if song is None:
        return ""
    return song.lyrics

def chansons(chanteur, max_chansons):
    """Return the titles of an artist's most popular songs on Genius.

    Args:
        chanteur: Artist name.
        max_chansons: Maximum number of songs to fetch.

    Returns:
        A list of song titles, or an empty list when the search finds no
        artist (previously this case raised AttributeError on ``None.songs``).
    """
    artiste = genius.search_artist(chanteur, max_songs=max_chansons, sort="popularity")
    if artiste is None:
        return []
    return [chanson.title for chanson in artiste.songs]

def texte_paroles(texte):
    """Return the set of distinct tokens found in a lyrics text.

    The text is lower-cased and stripped of every ``string.punctuation``
    character (hyphens included, so hyphenated words are fused) before
    being tokenised with NLTK's ``word_tokenize``.
    """
    sans_ponctuation = str.maketrans('', '', string.punctuation)
    normalise = texte.lower().translate(sans_ponctuation)
    return set(word_tokenize(normalise))


from gensim.models import KeyedVectors

# Word2vec vectors in binary format, loaded once at import time and used by
# related_words2() and test_seuil(). `limit` caps how many vectors are read
# from the file.
model_path = "GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True,limit=500000)

def nettoyer_mot(mot):
    """Normalise a vocabulary entry into plain lower-case words.

    Underscores become spaces, the text is lower-cased, every character that
    is neither a word character nor whitespace is removed, and surrounding
    whitespace is trimmed.
    """
    normalise = mot.replace("_", " ").lower()
    return re.sub(r"[^\w\s]", "", normalise).strip()

def related_words2(keyword, seuil=0.3):
    """Return the cleaned word2vec neighbours of ``keyword``.

    Args:
        keyword: Word looked up in the module-level ``model``.
        seuil: Minimum similarity score for a neighbour to be kept.

    Returns:
        A set of neighbours normalised with ``nettoyer_mot``; an empty set
        when ``keyword`` is not in the model's vocabulary.
    """
    try:
        voisins = model.most_similar(keyword, topn=5000000)
    except KeyError:
        return set()
    retenus = {terme for terme, similarite in voisins if similarite >= seuil}
    return {nettoyer_mot(terme) for terme in retenus}
    

def score_similarite(chanson, chanteur, theme):
    """Grade how strongly a song's lyrics relate to a theme.

    The lyrics vocabulary is intersected with the word2vec neighbours of
    ``theme`` and the size of the overlap is turned into a letter grade.
    The overlap itself is printed. (A normalised ratio, overlap divided by
    vocabulary size, was tried and left disabled in the original code.)

    Returns:
        "A+" for 20 or more shared words, "A" for 15-19, "B" for 10-14,
        "C" for 5-9, "D" for 1-4 and "E" for none.
    """
    vocabulaire_theme = related_words2(theme)
    vocabulaire_chanson = texte_paroles(paroles(chanson, chanteur))
    mots_communs = vocabulaire_chanson.intersection(vocabulaire_theme)
    print(mots_communs)

    # Thresholds are checked from highest to lowest; the first one reached wins.
    nb_communs = len(mots_communs)
    for minimum, note in ((20, "A+"), (15, "A"), (10, "B"), (5, "C"), (1, "D")):
        if nb_communs >= minimum:
            return note
    return "E"

# Singer -> activism themes attributed to that singer.
chanteurs_themes = {
    'M.I.A.': ['politics'],
    'KRS-One': ['human rights', 'animal rights'],
    'Killer Mike': ['social'],
    'Michael Cuccione': ['health'],
    'L7a9d': ['human rights'],
    'Saba Saba': ['social'],
    'Rebecca Moore': ['animal rights'],
    'Joaquin Phoenix': ['animal rights'],
    'Wyclef Jean': ['politics'],
    'Adé Bantu': ['social'],
    'Tupac Shakur': ['politics', 'human rights'],
    'Mai Khôi': ['politics'],
    'Jacline Mouraud': ['social'],
    'Madeleina Kay': ['politics'],
    'Topher': ['politics'],
    'Montana Tucker': ['social'],
}
# for chanteur, themes_associes in chanteurs_themes.items():
#      songs = chansons(chanteur, 5)  
#      for theme in themes_associes:
#          scores = []
#          for chanson in songs:
#              score_chanson = score_similarite(chanson, chanteur, theme)
#              scores.append(score_chanson)
#          print (chanteur, theme, scores)


# Sample vocabulary used to probe the similarity threshold (see test_seuil).
mots = [
    "pollution", "forest", "recycling", "plastic", "energy",
    "biodiversity", "climate", "oil", "deforestation", "carbon",
    "agriculture", "ocean", "sustainable", "windmill", "compost",
    "glacier", "solar", "emission", "development", "hurricane",
    "car", "computer", "phone", "pizza", "football",
    "music", "school", "mountain", "internet", "coffee",
]

def test_seuil(mots, seuil, theme):
    """Count how many of ``mots`` are word2vec neighbours of ``theme``.

    The neighbour set is obtained from ``related_words2`` rather than being
    recomputed here, so both functions share one definition of "related".
    The shared words are printed.

    Args:
        mots: Iterable of candidate words.
        seuil: Minimum similarity score for a neighbour to count.
        theme: Word whose neighbours are looked up.

    Returns:
        The number of words of ``mots`` found among the neighbours; 0 when
        ``theme`` is not in the model's vocabulary.
    """
    mots_communs = set(mots).intersection(related_words2(theme, seuil))
    print (mots_communs)
    return len(mots_communs)

# seuils = np.arange(0, 1.1, 0.1)
# longueurs = [test_seuil(mots, seuil, "environmental") for seuil in seuils]
# plt.plot(seuils, longueurs, marker='o')
# plt.title("Nombre de mots détectésen fonction du seuil")
# plt.xlabel("Seuil de similarité")
# plt.ylabel("Nombre de mots en commun")
# plt.grid(True)
# plt.show()

def nationalite(url):
    """Guess a person's nationality from the infobox of a Wikipedia page.

    The row whose header contains "nationality" is used when present.
    Otherwise the last line of the "born" row is taken, keeping only the
    text after its last comma.

    Returns:
        The extracted text, or one of the status strings "invalid url",
        "error" (HTTP status other than 200), "no infobox found" or
        "not found".
    """
    if not (url and url.startswith("http")):
        return "invalid url"
    page = requests.get(url)
    if page.status_code != 200:
        return "error"
    infobox = BeautifulSoup(page.text, 'html.parser').find("table", class_="infobox")
    if not infobox:
        return "no infobox found"

    def cellules(etiquette):
        """Yield the <td> of each row whose <th> text contains ``etiquette``."""
        for ligne in infobox.find_all("tr"):
            entete = ligne.find("th")
            if entete and etiquette in entete.get_text(strip=True).lower():
                valeur = ligne.find("td")
                if valeur:
                    yield valeur

    # An explicit "nationality" row takes precedence over the birthplace.
    explicite = next(cellules("nationality"), None)
    if explicite is not None:
        return explicite.get_text(strip=True)

    naissance_td = next(cellules("born"), None)
    if naissance_td is not None:
        derniere_ligne = naissance_td.get_text(separator="\n", strip=True).split("\n")[-1].strip()
        # Text after the last comma, or the whole line when there is none.
        return derniere_ligne.rpartition(',')[2].strip()
    return "not found"

# Singer records, and the names of the singers that have a theme entry.
with open("chanteurs.json", "r", encoding="utf-8") as f:
    chanteurs = json.load(f)
noms_filtres = set(chanteurs_themes)

# chanteurs_nationalite = {}

# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         wikidata_url = chanteur.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             pays = "invalid url"
#         else:
#             pays = nationalite(wikipedia_url)
#         chanteurs_nationalite[nom] = pays
# print(chanteurs_nationalite)

def genres(url):
    """Read the musical genres listed in the infobox of a Wikipedia page.

    Returns:
        The text of the first row whose header contains "genres", with its
        pieces joined by ", "; otherwise one of the status strings
        "invalid url", "error" (HTTP status other than 200),
        "no infobox found" or "not found".
    """
    if not (url and url.startswith("http")):
        return "invalid url"
    page = requests.get(url)
    if page.status_code != 200:
        return "error"
    table = BeautifulSoup(page.text, 'html.parser').find("table", class_="infobox")
    if not table:
        return "no infobox found"
    for ligne in table.find_all("tr"):
        entete = ligne.find("th")
        if not entete or "genres" not in entete.get_text(strip=True).lower():
            continue
        valeur = ligne.find("td")
        if valeur:
            return valeur.get_text(separator=", ", strip=True)
    return "not found"

# chanteurs_genres = {}

# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         wikidata_url = chanteur.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             genre = "invalid url"
#         else:
#             genre = genres(wikipedia_url)
#         chanteurs_genres[nom] = genre

# chanteurs_genres2 = {}

# # Nettoyer la liste obtenue et ne laisser que les artistes qui possèdent des genres
# for artist, genres in chanteurs_genres.items():
#     if genres == "not found" or not genres:
#         continue  
#     cleaned = re.sub(r"[\[\]0-9/]", "", genres)
#     cleaned = re.sub(r",\s*,+", ",", cleaned)  
#     cleaned = re.sub(r"\s+", " ", cleaned) 
#     cleaned = [g.strip().lower() for g in cleaned.split(",") if g.strip()]
#     chanteurs_genres2[artist] = list(set(cleaned)) 
# print (chanteurs_genres2)

# Records read from femmes.json (used by the gender lookup experiments).
with open("femmes.json", "r", encoding="utf-8") as f:
    femmes = json.load(f)

# chanteurs_gender = {}

# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         if chanteur in femmes:
#             chanteurs_gender[nom] = "femme"
#         else:
#             chanteurs_gender[nom] = "homme"

def naissance(url):
    """Read a birth year from the infobox of a Wikipedia page.

    The first "born" row that has a data cell is searched for a four-digit
    year; the pattern only recognises years from 1960 to 2099.

    Returns:
        The year as an int, or one of the status strings "invalid url",
        "error" (HTTP status other than 200), "no infobox found",
        "no valid year found" or "no 'born' field found".
    """
    if not (url and url.startswith("http")):
        return "invalid url"
    page = requests.get(url)
    if page.status_code != 200:
        return "error"
    infobox = BeautifulSoup(page.text, 'html.parser').find("table", class_="infobox")
    if not infobox:
        return "no infobox found"

    motif_annee = re.compile(r"\b(19[6-9][0-9]|20[0-9]{2})\b")
    for ligne in infobox.find_all("tr"):
        entete = ligne.find("th")
        if not entete or "born" not in entete.text.lower():
            continue
        cellule = ligne.find("td")
        if not cellule:
            continue
        trouve = motif_annee.search(cellule.get_text(" ", strip=True))
        return int(trouve.group(1)) if trouve else "no valid year found"
    return "no 'born' field found"
    
# chanteurs_naissance = {}
# for chanteur in chanteurs:
#     nom = chanteur.get("personLabel")
#     if nom in noms_filtres:
#         wikidata_url = chanteur.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             annee = "invalid url"
#         else:
#             annee = naissance(wikipedia_url)
#         chanteurs_naissance[nom] = annee
# print (chanteurs_naissance)

# Hard-coded per-singer attributes: gender, country, musical genres and birth
# year. Presumably a cleaned snapshot of what the commented-out scraping
# loops above printed (to be confirmed).
chanteurs_gender2 = {
    'KRS-One': 'homme',
    'Tupac Shakur': 'homme',
    'Adé Bantu': 'homme',
    'Joaquin Phoenix': 'homme',
    'Wyclef Jean': 'homme',
    'Killer Mike': 'homme',
    'M.I.A.': 'femme',
    'Saba Saba': 'homme',
    'Mai Khôi': 'femme',
    'Michael Cuccione': 'homme',
    'L7a9d': 'homme',
    'Rebecca Moore': 'femme',
    'Madeleina Kay': 'femme',
    'Jacline Mouraud': 'femme',
    'Topher': 'homme',
    'Montana Tucker': 'femme',
}
chanteurs_nationalite2 = {
    'KRS-One': 'U.S.',
    'Tupac Shakur': 'U.S.',
    'Adé Bantu': 'England',
    'Joaquin Phoenix': 'Puerto Rico',
    'Wyclef Jean': 'Haiti',
    'Killer Mike': 'U.S.',
    'M.I.A.': 'England',
    'Saba Saba': 'Uganda',
    'Mai Khôi': 'Vietnam',
    'Michael Cuccione': 'Canada',
    'L7a9d': 'Morocco',
    'Rebecca Moore': 'U.S.',
    'Madeleina Kay': 'England',
    'Jacline Mouraud': 'France',
    'Topher': 'U.S.',
    'Montana Tucker': 'U.S.',
}
chanteurs_genres3 = {
    'KRS-One': ['conscious rap', 'hardcore hip-hop', 'political hip-hop', 'east coast hip-hop'],
    'Tupac Shakur': ['political hip-hop', 'west coast hip-hop', 'gangsta rap'],
    'Adé Bantu': ['hip-hop', 'afrofunk', 'fuji', 'afrobeat'],
    'Wyclef Jean': ['east coast hip hop', 'neo soul', 'r&b', 'reggae fusion', 'pop rap'],
    'Killer Mike': ['southern hip-hop', 'gospel'],
    'M.I.A.': [
        'hip hop', 'alternative hip hop', 'world', 'dance',
        'progressive rap', 'pop', 'electronica', 'experimental',
    ],
    'Saba Saba': ['hip hop', 'african hip hop'],
    'Mai Khôi': ['pop'],
    'L7a9d': ['rap'],
    'Rebecca Moore': ['alternative', 'ambient', 'experimental'],
    'Topher': ['hip hop'],
}
chanteurs_naissance2 = {
    'KRS-One': 1965,
    'Tupac Shakur': 1971,
    'Adé Bantu': 1971,
    'Joaquin Phoenix': 1974,
    'Wyclef Jean': 1969,
    'Killer Mike': 1975,
    'M.I.A.': 1975,
    'Saba Saba': 1977,
    'Mai Khôi': 1983,
    'Michael Cuccione': 1985,
    'L7a9d': 1988,
    'Rebecca Moore': 1968,
    'Madeleina Kay': 1994,
    'Jacline Mouraud': 1967,
    'Topher': 1991,
    'Montana Tucker': 1993,
}

# chanteurs = chanteurs_gender2.keys()
# print (chanteurs)
# dictionnaire = [
#     {'nationalite': chanteurs_nationalite2.get(chanteur), 'gender': chanteurs_gender2.get(chanteur), 'genres':chanteurs_genres3.get(chanteur, []), 'naissance':chanteurs_naissance2.get(chanteur)/1965}
#     for chanteur in chanteurs
# ]
# print (dictionnaire)

# vec= DictVectorizer()
# X = vec.fit_transform(dictionnaire).toarray()

# for k in range(2,len(X[0])):
#     print("number of clusters: "+ str(k))
#     kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
#     print(silhouette_score(X, kmeans.labels_))
#     print(kmeans.labels_)

# Country labels grouped by continent, as they appear in the nationality tables.
america = ['U.S.', 'Canada', 'Haiti', 'Puerto Rico', 'Brazil']
europe = ['England', 'France']
africa = ['Uganda', 'Morocco', 'Nigeria', 'South Africa']
asia = ['India', 'Afghanistan', 'Hong Kong', 'Vietnam', 'Pakistan', 'Bangladesh']


def continent(pays):
    """Return the continent name for a country label, or None if unknown.

    The lookup uses the module-level lists ``america``, ``europe``,
    ``africa`` and ``asia``, checked in that order.
    """
    groupes = (
        ("america", america),
        ("europe", europe),
        ("africa", africa),
        ("asia", asia),
    )
    for nom, membres in groupes:
        if pays in membres:
            return nom
    return None


# G3 = nx.Graph()

# for chanteur in chanteurs_gender2:
#     G3.add_node(chanteur)
# for a, b in combinations(chanteurs_gender2, 2):
#     weight = 0
#     if chanteurs_gender2[a] == chanteurs_gender2[b]:
#         weight += 1
#     nat_a = chanteurs_nationalite2.get(a)
#     nat_b = chanteurs_nationalite2.get(b)
#     if nat_a == nat_b:
#         weight += 2
#     elif continent(nat_a) == continent(nat_b):
#         weight += 1
#     genres_a = set(chanteurs_genres3.get(a, []))
#     genres_b = set(chanteurs_genres3.get(b, []))
#     weight += len(genres_a & genres_b)  
#     y_a = chanteurs_naissance2.get(a)
#     y_b = chanteurs_naissance2.get(b)
#     if y_a == y_b:
#         weight += 2
#     elif abs(y_a - y_b) <= 5:
#         weight += 1
#     if weight > 0:
#         G3.add_edge(a, b, weight=weight)

# pos = nx.spring_layout(G3, seed=42)
# edges = G3.edges(data=True)
# weights = [d['weight'] for (_, _, d) in edges]
# plt.figure(figsize=(13, 11))
# nx.draw_networkx_nodes(G3, pos, node_color='lightblue', node_size=1000)
# nx.draw_networkx_labels(G3, pos, font_size=10)
# nx.draw_networkx_edges(G3, pos, width=weights, edge_color='gray')
# plt.axis('off')
# plt.show()

# louvain=nx.community.louvain_communities(G3, weight='weight')
# print("Louvain Communities:", louvain)

# louvain_dict = {}
# for i, community in enumerate(louvain):
#     for node in community:
#         louvain_dict[node] = i

# pos = nx.spring_layout(G3)
# cmap = plt.cm.get_cmap('tab10')
# nx.draw_networkx_nodes(G3, pos, node_color=[louvain_dict[n] for n in G3.nodes()], cmap=cmap)
# nx.draw_networkx_edges(G3, pos, width=[G3[u][v]['weight'] for u, v in G3.edges()])
# nx.draw_networkx_labels(G3, pos)
# plt.show()

# communautes = [{'Michael Cuccione', 'KRS-One', 'Topher', 'L7a9d', 'Tupac Shakur', 'Saba Saba', 'Killer Mike', 'Wyclef Jean', 'Joaquin Phoenix'},{'Rebecca Moore', 'Montana Tucker', 'Madeleina Kay', 'Adé Bantu', 'M.I.A.', 'Jacline Mouraud', 'Mai Khôi'}]
# themes_par_communaute = []

# for i, comm in enumerate(communautes):
#     themes = []
#     for chanteur in comm:
#         if chanteur in chanteurs_themes:
#             themes.extend(chanteurs_themes[chanteur])
#     themes_uniques = list(set(themes))
#     themes_par_communaute.append(themes_uniques)

# for i, themes in enumerate(themes_par_communaute):
#     print(f"Communauté {i+1} : {themes}")

def analyse_communaute(communaute):
    """Summarise the attributes of the singers in one community.

    Attributes are read from the module-level singer tables
    (``chanteurs_gender2``, ``chanteurs_nationalite2``,
    ``chanteurs_naissance2``, ``chanteurs_genres3``).

    Args:
        communaute: Iterable of singer names.

    Returns:
        A dict with:
        'genre' - Counter of gender labels (None for unknown singers),
        'continent' - Counter of continents (None when unknown),
        'annee_naissance_moyenne' - mean known birth year rounded to one
        decimal, or None when no birth year is known,
        'genres_musicaux_frequents' - the five most common musical genres
        as (genre, count) pairs.
    """
    membres = list(communaute)

    sexes = Counter(chanteurs_gender2.get(nom) for nom in membres)
    origines = Counter(continent(chanteurs_nationalite2.get(nom)) for nom in membres)
    # Singers without a (truthy) birth year are left out of the mean.
    annees = [annee for annee in map(chanteurs_naissance2.get, membres) if annee]
    styles = Counter(style for nom in membres for style in chanteurs_genres3.get(nom, []))

    moyenne = round(np.mean(annees), 1) if annees else None
    return {
        'genre': sexes,
        'continent': origines,
        'annee_naissance_moyenne': moyenne,
        'genres_musicaux_frequents': styles.most_common(5)
    }
# res_1 = analyse_communaute(communautes[0])
# res_2 = analyse_communaute(communautes[1])
# print("Communauté bleu foncé :", res_1)
# print("Communauté bleu clair :", res_2)





# Same analysis, now for all artists rather than singers only.
# Artist -> set of theme communities attributed to that artist.
artiste_to_themes = {
    'M.I.A.': {'political'},
    'Arundhati Roy': {'political'},
    'Dexter Scott King': {'animal rights', 'human rights'},
    'Alexandra Paul': {'animal rights'},
    'Talib Kweli': {'political'},
    'KRS-One': {'animal rights', 'human rights'},
    'Killer Mike': {'social'},
    'Michael Cuccione': {'health'},
    'L7a9d': {'human rights'},
    'Anneka Svenska': {'animal rights'},
    'Sonia Nassery Cole': {'human rights'},
    'Saba Saba': {'social'},
    'Tam Tak-chi': {'social'},
    'Rebecca Moore': {'animal rights'},
    'Tembi Locke': {'human rights'},
    'Joaquin Phoenix': {'animal rights'},
    'Raphaël Coleman': {'climate'},
    'Ben Patrick Johnson': {'women & lgbt rights', 'human rights'},
    'Cyrus Grace Dunham': {'women & lgbt rights'},
    'Wyclef Jean': {'political'},
    'Adé Bantu': {'social'},
    'Luisa Mell': {'animal rights'},
    'Emma Watson': {'climate'},
    'Tupac Shakur': {'political', 'human rights'},
    'Lesego Motsepe': {'social'},
    'Dallas Goldtooth': {'climate'},
    'Mai Khôi': {'political'},
    'Jacline Mouraud': {'social'},
    'Cat Brooks': {'climate'},
    'Madeleina Kay': {'political'},
    'Ronen Rubinstein': {'climate'},
    'Qandeel Baloch': {'women & lgbt rights'},
    'Shitou': {'women & lgbt rights'},
    'Malynda Hale': {'political'},
    'Juliana Olayode': {'women & lgbt rights'},
    'Topher': {'political'},
    'Samina Luthfa': {'social'},
    'Montana Tucker': {'social'},
}
artistes_filtres = set(artiste_to_themes)

# artistes_nationalite = {}

# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     if nom in artistes_filtres:
#         wikidata_url = artiste.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             pays = "invalid url"
#         else:
#             pays = nationalite(wikipedia_url)
#         artistes_nationalite[nom] = pays
# print(artistes_nationalite)

# artistes_naissance = {}
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     if nom in artistes_filtres:
#         wikidata_url = artiste.get("person")
#         wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#         if not wikipedia_url or not wikipedia_url.startswith("http"):
#             annee = "invalid url"
#         else:
#             annee = naissance(wikipedia_url)
#         artistes_naissance[nom] = annee
# print (artistes_naissance)

# with open("femmes2.json", "r", encoding="utf-8") as f:femmes2 = json.load(f)

# artistes_gender = {}
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     if nom in artistes_filtres:
#         if artiste in femmes2:
#             artistes_gender[nom] = "femme"
#         else:
#             artistes_gender[nom] = "homme"
# print (artistes_gender)


# Hard-coded per-artist attributes: country, birth year and gender.
# Presumably a cleaned snapshot of what the commented-out scraping loops
# above printed (to be confirmed). The three tables do not cover exactly
# the same artists.
artistes_nationalite2 = {
    'M.I.A.': 'England',
    'Arundhati Roy': 'India',
    'Dexter Scott King': 'U.S.',
    'Alexandra Paul': 'U.S.',
    'Talib Kweli': 'U.S.',
    'KRS-One': 'U.S.',
    'Killer Mike': 'U.S.',
    'Michael Cuccione': 'Canada',
    'L7a9d': 'Morocco',
    'Anneka Svenska': 'England',
    'Sonia Nassery Cole': 'Afghanistan',
    'Saba Saba': 'Uganda',
    'Tam Tak-chi': 'Hong Kong',
    'Rebecca Moore': 'U.S.',
    'Joaquin Phoenix': 'Puerto Rico',
    'Raphaël Coleman': 'England',
    'Ben Patrick Johnson': 'U.S.',
    'Cyrus Grace Dunham': 'U.S.',
    'Wyclef Jean': 'Haiti',
    'Adé Bantu': 'England',
    'Luisa Mell': 'Brazil',
    'Emma Watson': 'France',
    'Tupac Shakur': 'U.S.',
    'Lesego Motsepe': 'South Africa',
    'Dallas Goldtooth': 'U.S.',
    'Mai Khôi': 'Vietnam',
    'Jacline Mouraud': 'France',
    'Cat Brooks': 'U.S.',
    'Madeleina Kay': 'England',
    'Qandeel Baloch': 'Pakistan',
    'Malynda Hale': 'U.S.',
    'Juliana Olayode': 'Nigeria',
    'Topher': 'U.S.',
    'Samina Luthfa': 'Bangladesh',
    'Montana Tucker': 'U.S.',
}
artistes_naissance2 = {
    'M.I.A.': 1975,
    'Arundhati Roy': 1961,
    'Dexter Scott King': 1961,
    'Alexandra Paul': 1963,
    'Talib Kweli': 1975,
    'KRS-One': 1965,
    'Killer Mike': 1975,
    'Michael Cuccione': 1985,
    'L7a9d': 1988,
    'Anneka Svenska': 1974,
    'Sonia Nassery Cole': 1965,
    'Saba Saba': 1977,
    'Tam Tak-chi': 1973,
    'Rebecca Moore': 1968,
    'Joaquin Phoenix': 1974,
    'Raphaël Coleman': 1994,
    'Ben Patrick Johnson': 1969,
    'Cyrus Grace Dunham': 1992,
    'Wyclef Jean': 1969,
    'Adé Bantu': 1971,
    'Luisa Mell': 1978,
    'Emma Watson': 1990,
    'Tupac Shakur': 1971,
    'Lesego Motsepe': 1974,
    'Dallas Goldtooth': 1983,
    'Mai Khôi': 1983,
    'Jacline Mouraud': 1967,
    'Cat Brooks': 1975,
    'Madeleina Kay': 1994,
    'Qandeel Baloch': 1990,
    'Malynda Hale': 1986,
    'Juliana Olayode': 1995,
    'Topher': 1991,
    'Montana Tucker': 1993,
}
artistes_gender2 = {
    'M.I.A.': 'femme',
    'Arundhati Roy': 'femme',
    'Dexter Scott King': 'homme',
    'Alexandra Paul': 'femme',
    'Talib Kweli': 'homme',
    'KRS-One': 'homme',
    'Killer Mike': 'homme',
    'Michael Cuccione': 'homme',
    'L7a9d': 'homme',
    'Anneka Svenska': 'femme',
    'Sonia Nassery Cole': 'femme',
    'Saba Saba': 'homme',
    'Tam Tak-chi': 'homme',
    'Rebecca Moore': 'femme',
    'Tembi Locke': 'femme',
    'Joaquin Phoenix': 'homme',
    'Raphaël Coleman': 'homme',
    'Ben Patrick Johnson': 'homme',
    'Cyrus Grace Dunham': 'homme',
    'Wyclef Jean': 'homme',
    'Adé Bantu': 'homme',
    'Luisa Mell': 'femme',
    'Emma Watson': 'femme',
    'Tupac Shakur': 'homme',
    'Lesego Motsepe': 'femme',
    'Dallas Goldtooth': 'homme',
    'Mai Khôi': 'femme',
    'Jacline Mouraud': 'femme',
    'Cat Brooks': 'femme',
    'Madeleina Kay': 'femme',
    'Qandeel Baloch': 'femme',
    'Shitou': 'femme',
    'Malynda Hale': 'femme',
    'Juliana Olayode': 'femme',
    'Topher': 'homme',
    'Samina Luthfa': 'femme',
    'Montana Tucker': 'femme',
}
# Only the artists present in all three tables are analysed further.
artistes_communs = set(artistes_nationalite2) & set(artistes_naissance2) & set(artistes_gender2)

# artistes = artistes_gender2.keys()
# print (artistes)
# dictionnaire = [
#     {'nationalite': artistes_nationalite2.get(artiste), 'gender': artistes_gender2.get(artiste), 'naissance':artistes_naissance2.get(artiste)/1961}
#     for artiste in artistes_communs
# ]
# print (dictionnaire)

# vec= DictVectorizer()
# X = vec.fit_transform(dictionnaire).toarray()

# for k in range(2,len(X)):
#     print("number of clusters: "+ str(k))
#     kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
#     print(silhouette_score(X, kmeans.labels_))
#     print(kmeans.labels_)

# G4 = nx.Graph()

# for artiste in artistes_communs:
#     G4.add_node(artiste)
# for a, b in combinations(artistes_communs, 2):
#     weight = 0
#     if artistes_gender2[a] == artistes_gender2[b]:
#         weight += 1
#     nat_a = artistes_nationalite2.get(a)
#     nat_b = artistes_nationalite2.get(b)
#     if nat_a == nat_b:
#         weight += 2
#     elif continent(nat_a) == continent(nat_b):
#         weight += 1
#     y_a = artistes_naissance2.get(a)
#     y_b = artistes_naissance2.get(b)
#     if y_a == y_b:
#         weight += 2
#     elif abs(y_a - y_b) <= 5:
#         weight += 1
#     if weight > 0:
#         G4.add_edge(a, b, weight=weight)

# pos = nx.spring_layout(G4, seed=42)
# edges = G4.edges(data=True)
# weights = [d['weight'] for (_, _, d) in edges]
# plt.figure(figsize=(13, 11))
# nx.draw_networkx_nodes(G4, pos, node_color='lightblue', node_size=1000)
# nx.draw_networkx_labels(G4, pos, font_size=10)
# nx.draw_networkx_edges(G4, pos, width=weights, edge_color='gray')
# plt.axis('off')
# plt.show()

# louvain=nx.community.louvain_communities(G4, weight='weight')
# print("Louvain Communities:", louvain)

# louvain_dict = {}
# for i, community in enumerate(louvain):
#     for node in community:
#         louvain_dict[node] = i

# pos = nx.spring_layout(G4)
# cmap = plt.cm.get_cmap('tab10')
# nx.draw_networkx_nodes(G4, pos, node_color=[louvain_dict[n] for n in G4.nodes()], cmap=cmap)
# nx.draw_networkx_edges(G4, pos, width=0.45)
# nx.draw_networkx_labels(G4, pos)
# plt.show()

# Artist communities as hard-coded sets of names; presumably copied from a
# run of the commented-out Louvain detection above (to be confirmed).
# Note: this rebinds `communautes`, which earlier in the file was a dict.
communautes = [
    {
        'Cyrus Grace Dunham', 'KRS-One', 'Topher', 'Rebecca Moore',
        'Michael Cuccione', 'Tupac Shakur', 'Cat Brooks', 'Ben Patrick Johnson',
        'Talib Kweli', 'Dallas Goldtooth', 'Alexandra Paul', 'Dexter Scott King',
        'Joaquin Phoenix', 'Montana Tucker', 'Wyclef Jean', 'Malynda Hale',
        'Killer Mike',
    },
    {'L7a9d', 'Saba Saba', 'Tam Tak-chi'},
    {
        'Luisa Mell', 'Raphaël Coleman', 'Sonia Nassery Cole', 'Qandeel Baloch',
        'Madeleina Kay', 'Arundhati Roy', 'M.I.A.', 'Juliana Olayode',
        'Emma Watson', 'Adé Bantu', 'Lesego Motsepe', 'Mai Khôi',
        'Anneka Svenska', 'Jacline Mouraud',
    },
]
themes_par_communaute = []

# For each community, gather the distinct themes of its members
# (artists without a theme entry contribute nothing).
for comm in communautes:
    themes = []
    for artiste in comm:
        themes.extend(artiste_to_themes.get(artiste, ()))
    themes_par_communaute.append(list(set(themes)))

# Report the themes found, numbering communities from 1.
for numero, themes in enumerate(themes_par_communaute, start=1):
    print(f"Communauté {numero} : {themes}")