# In [None]:
import requests
import urllib.parse
from bs4 import BeautifulSoup
import re
import spacy
import json
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import string
import pathlib

# Shared spaCy pipeline (small English model), loaded once when the module is
# executed and reused by theme() for tokenisation, stop-word flags and
# dependency/POS labels.
nlp = spacy.load("en_core_web_sm")
    
def occurences(url, word):
    """Count whole-word, case-insensitive occurrences of ``word`` on a web page.

    Only the text found inside ``<p>`` elements is searched. Every match is
    also printed with its running index and its start/end character offsets
    in the concatenated paragraph text.

    Args:
        url: Address of the HTML page to download.
        word: Literal word or phrase to look for (regex metacharacters are
            escaped before matching).

    Returns:
        The number of matches as an ``int``, or the string ``"erreur"`` when
        the page cannot be fetched (network failure, timeout, or an HTTP
        status other than 200).
    """
    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Same failure convention as a non-200 answer.
        return "erreur"
    if response.status_code != 200:
        return "erreur"

    soup = BeautifulSoup(response.text, 'html.parser')
    full_text = ' '.join(p.get_text() for p in soup.find_all('p'))

    pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
    count = 0  # stays 0 when the loop body never runs
    for count, match in enumerate(pattern.finditer(full_text), start=1):
        print(f"{count} starts at: {match.start()} ends at: {match.end()}")

    return count



def _modifier_phrase(doc, token):
    """Return ``token`` prefixed by its contiguous left-hand modifiers, lower-cased.

    Walks leftwards from ``token`` and keeps every directly preceding token
    whose dependency label is ``amod`` or ``compound`` and whose part of
    speech is ``ADJ`` or ``NOUN``; stops at the first token that does not
    qualify.
    """
    modifiers = []
    i = token.i - 1
    while i >= 0:
        prev = doc[i]
        if prev.dep_ not in ("amod", "compound") or prev.pos_ not in ("ADJ", "NOUN"):
            break
        modifiers.append(prev.text)
        i -= 1
    modifiers.reverse()  # collected right-to-left, restore reading order
    return ' '.join(w.lower() for w in modifiers + [token.text])


def theme(url, word):
    """Extract the qualified form of ``word`` for each occurrence on a web page.

    For every whole-word, case-insensitive match of ``word`` in the page's
    ``<p>`` text, the function looks at the (up to) 70 characters preceding
    the match, isolates the punctuation-delimited fragment that ends with the
    match, and uses spaCy to collect the adjectives / compound nouns directly
    preceding the keyword (e.g. "climate activist" for "activist").
    Intermediate results are printed for inspection.

    Args:
        url: Address of the HTML page to download.
        word: Keyword to look for (regex metacharacters are escaped).

    Returns:
        A list with one lower-cased phrase per occurrence for which the
        keyword was found as a token. The list is empty when the page cannot
        be fetched (network failure, timeout, or a status other than 200).
    """
    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    full_text = ' '.join(p.get_text() for p in soup.find_all('p'))

    keyword = word.lower()
    themes = []

    for match in re.finditer(rf'\b{re.escape(word)}\b', full_text, re.IGNORECASE):
        # Keep a few words of left context; the window ends exactly at the
        # end of the current match.
        sentence = full_text[max(0, match.start() - 70): match.end()]

        # Drop Wikipedia reference markers such as "[12]".
        sentence = re.sub(r'\[\d+\]', '', sentence)

        print("> Phrase séléctionnée: ", sentence.strip())

        # Keep only the punctuation-delimited fragment holding the keyword.
        # The current match is at the END of the window, so search from the
        # right: scanning from the left could pick up an earlier occurrence
        # that happens to fall inside the 70-character context.
        fragments = re.split(r'[.?!;,:\n]', sentence)
        fragment = next(
            (frag.strip() for frag in reversed(fragments) if keyword in frag.lower()),
            None,
        )
        print("> Fragment: ", fragment)
        if fragment is None:
            # Happens when the keyword itself contains splitting punctuation;
            # there is nothing to analyse and nlp(None) would raise.
            continue

        # Stop-word-free version of the fragment (displayed only).
        doc = nlp(fragment)
        without_stopwords = [token.text for token in doc if not token.is_stop]
        print("> Fragment sans stopwords:", ' '.join(without_stopwords))

        # Extract the theme from the LAST token equal to the keyword, i.e.
        # the one belonging to the current match.
        for mot in reversed(list(doc)):
            if mot.text.lower() == keyword:
                phrase = _modifier_phrase(doc, mot)
                print("> Theme: ", phrase)
                themes.append(phrase)
                break

    return themes



def wikidata_to_wikipedia(wikidata_url, lang='en'):
    """Resolve a Wikidata entity URL to the matching Wikipedia article URL.

    The entity id is taken from the last path segment of ``wikidata_url`` and
    its sitelinks are requested from the Wikidata ``wbgetentities`` API.

    Args:
        wikidata_url: Entity URL such as
            ``http://www.wikidata.org/entity/Q240573``.
        lang: Wikipedia language code; the sitelink ``"<lang>wiki"`` is used.

    Returns:
        The percent-encoded article URL, or the string ``"Non trouvee."`` when
        no article exists for that language or the lookup fails. Callers can
        tell the two apart with ``result.startswith("http")``.
    """
    not_found = "Non trouvee."

    # rstrip('/') so that a trailing slash does not produce an empty id.
    entity_id = wikidata_url.strip().rstrip('/').split('/')[-1]
    api_url = 'https://www.wikidata.org/w/api.php'
    params = {'action': 'wbgetentities', 'ids': entity_id, 'format': 'json', 'props': 'sitelinks'}

    try:
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(api_url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON.
        return not_found

    # Error answers from the API may carry no "entities" key at all, so every
    # level is read defensively instead of indexing directly.
    entity = data.get('entities', {}).get(entity_id) or {}
    title = entity.get('sitelinks', {}).get(f'{lang}wiki', {}).get('title')
    if not title:
        return not_found

    encoded_title = urllib.parse.quote(title.replace(' ', '_'))
    return f'https://{lang}.wikipedia.org/wiki/{encoded_title}'


#theme("https://en.wikipedia.org/wiki/Rapha%C3%ABl_Coleman","activist")
#wikidata_to_wikipedia("http://www.wikidata.org/entity/Q240573", lang='en')

# G1 = nx.Graph()
# artistes = json.loads(pathlib.Path("artistes.json").read_text())
# Load the artist records from artistes.json (presumably a list of dicts with
# "personLabel" / "person" keys, as read by the commented-out graph code).
with open("artistes.json", mode="r", encoding="utf-8") as f:
    artistes = json.load(f)
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     wikidata_url = artiste.get("person")
#     wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#     if wikipedia_url.startswith("http"):
#         themes_extraits = theme(wikipedia_url, "activist")
#         if themes_extraits:
#             G1.add_node(nom, bipartite=0)
#             for t in themes_extraits:
#                 G1.add_node(t, bipartite=1)
#                 G1.add_edge(nom, t)


# left_nodes = {n for n, d in G1.nodes(data=True) if d.get("bipartite") == 0}
# right_nodes = {n for n, d in G1.nodes(data=True) if d.get("bipartite") == 1}
# pos = nx.bipartite_layout(G1, left_nodes)
# plt.figure(figsize=(14, 10))
# nx.draw_networkx(
#     G1, pos,
#     node_color=["skyblue" if n in left_nodes else "lightgreen" for n in G1.nodes()],
#     node_size=1200,
#     font_size=9,
#     with_labels=True,
#     edge_color="gray"
# )
# plt.show()

# for theme in right_nodes:
#     connected_artists = list(G1.neighbors(theme))
#     print(f"Le thème '{theme}' est relié à {len(connected_artists)} artiste(s)")


# Hand-made grouping of "activist" theme phrases into broader communities:
# community label -> theme phrases assigned to it. The phrases are compared
# against the output of theme() in the commented-out G2 pipeline, which is
# presumably why "anti - nuclear" keeps the spaces around the hyphen.
communautes = {
    "climate": [
        "climate activist",
        "environmental activist",
        "climate change activist",
        "anti - nuclear activist",
    ],
    "animal rights": [
        "animal rights activist",
        "vegan activist",
    ],
    "human rights": [
        "human rights activist",
        "civil rights activist",
    ],
    "women & lgbt rights": [
        "feminist activist",
        "gay activist",
        "lesbian activist",
        "transgender activist",
        "sexual purity activist",
    ],
    "politics": [
        "political activist",
    ],
    "social": [
        "social activist",
        "social media activist",
        "yellow vest activist",
    ],
    "health": [
        "cancer research activist",
    ],
}

# G2 = nx.Graph()
# artiste_to_themes = {}
# for artiste in artistes:
#     nom = artiste.get("personLabel")
#     wikidata_url = artiste.get("person")
#     wikipedia_url = wikidata_to_wikipedia(wikidata_url)
#     if wikipedia_url.startswith("http"):
#         themes_extraits = theme(wikipedia_url, "activist")
#         if themes_extraits:
#             artiste_ajoute = False
#             for t in themes_extraits:
#                 for communaute, theme2 in communautes.items():
#                     for i in range (len(theme2)):    
#                         if theme2[i] in t: 
#                             if artiste_ajoute == False:
#                                 G2.add_node(nom, bipartite=0) 
#                                 artiste_ajoute = True
#                                 G2.add_node(communaute, bipartite=1)
#                                 G2.add_edge(nom, communaute)
#                                 artiste_to_themes.setdefault(nom, set()).add(communaute)
#                                 break
#                             else:
#                                 G2.add_node(communaute, bipartite=1)
#                                 G2.add_edge(nom, communaute)
#                                 artiste_to_themes.setdefault(nom, set()).add(communaute)
#                                 break


# left_nodes = {n for n, d in G2.nodes(data=True) if d.get("bipartite") == 0}
# right_nodes = {n for n, d in G2.nodes(data=True) if d.get("bipartite") == 1}
# pos = nx.bipartite_layout(G2, left_nodes, align='vertical', scale=5)
# plt.figure(figsize=(14, 10))
# nx.draw_networkx(
#     G2, pos,
#     node_color=["skyblue" if n in left_nodes else "lightgreen" for n in G2.nodes()],
#     node_size=800,
#     font_size=7,
#     with_labels=True,
#     edge_color="gray"
# )
# plt.show()

# with open("chanteurs.json", "r", encoding="utf-8") as f:chanteurs = json.load(f)
# chanteurs_activists = {}
# for personne in artiste_to_themes:
#     for chanteur in chanteurs:
#         if personne == chanteur.get("personLabel"):
#             chanteurs_activists[personne] = list(artiste_to_themes[personne])
# print (chanteurs_activists)

def related_words(word):
    """Collect WordNet lemma names related to ``word``.

    For every synset of ``word`` the lemma names of the synset itself, of its
    direct hypernyms and of its direct hyponyms are gathered.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A set of lemma-name strings (empty when WordNet has no synset for
        ``word``).
    """
    lemma_names = set()
    for synset in wn.synsets(word):
        # The synset itself plus its immediate parents and children.
        neighbourhood = [synset, *synset.hypernyms(), *synset.hyponyms()]
        for sense in neighbourhood:
            lemma_names.update(lemma.name() for lemma in sense.lemmas())
    return lemma_names

# The song-lyrics analysis starts here.
import os

import lyricsgenius

# SECURITY NOTE(review): an API token is committed in this file. The token is
# now read from the GENIUS_ACCESS_TOKEN environment variable when it is set;
# the literal below is kept only as a fallback so existing runs keep working.
# It should be revoked and removed from the source.
genius = lyricsgenius.Genius(
    os.environ.get("GENIUS_ACCESS_TOKEN")
    or "Fu9F4P_Zft__NzCvberMcG773LOGTEU1bpqA9cdGErZYU41AWnwlmlr649k55FRD",
    timeout=15,
)
genius.remove_section_headers = True  # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.excluded_terms = ["(Remix)", "(Live)", "(Edit)", "(Version)"]  # Exclude songs with these words in their title

def paroles(chanson, chanteur):
    """Return the lyrics of a song as found on Genius.

    Args:
        chanson: Song title to search for.
        chanteur: Artist name used to narrow the search.

    Returns:
        The lyrics as a string, or an empty string when the search finds no
        song (``search_song`` yields ``None`` in that case, which previously
        made ``song.lyrics`` raise ``AttributeError``).
    """
    song = genius.search_song(chanson, chanteur)
    if song is None:
        return ""
    return song.lyrics

def chansons(chanteur, max_chansons):
    """Return the titles of an artist's most popular songs on Genius.

    Args:
        chanteur: Artist name to search for.
        max_chansons: Maximum number of songs to retrieve.

    Returns:
        A list of song titles sorted by popularity, or an empty list when the
        artist is not found (``search_artist`` yields ``None`` in that case,
        which previously made ``artiste.songs`` raise ``AttributeError``).
    """
    artiste = genius.search_artist(chanteur, max_songs=max_chansons, sort="popularity")
    if artiste is None:
        return []
    return [chanson.title for chanson in artiste.songs]

def texte_paroles(texte):
    """Turn raw lyrics into the set of distinct lower-cased word tokens.

    All characters of ``string.punctuation`` are deleted before tokenising;
    because the hyphen is one of them, hyphenated words are fused
    ("anti-war" becomes "antiwar").

    Args:
        texte: Lyrics as a single string.

    Returns:
        A set of tokens produced by NLTK's ``word_tokenize``.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = texte.lower().translate(strip_punctuation)
    return set(word_tokenize(normalised))


from gensim.models import KeyedVectors

# Word2vec vectors stored in binary format; the relative path is resolved
# against the current working directory.
model_path = "GoogleNews-vectors-negative300.bin"
# `limit` caps how many vectors are read from the file (gensim's documented
# behaviour for this parameter), which bounds memory use and load time.
model = KeyedVectors.load_word2vec_format(model_path, binary=True,limit=500000)

def nettoyer_mot(mot):
    """Normalise a vocabulary entry into plain lower-case text.

    Underscores become spaces, the text is lower-cased, every character that
    is neither a word character nor whitespace is removed, and surrounding
    whitespace is trimmed.

    Args:
        mot: The raw entry, e.g. ``"Climate_Change"``.

    Returns:
        The cleaned string, e.g. ``"climate change"``.
    """
    spaced = mot.lower().replace("_", " ")
    return re.sub(r"[^\w\s]", "", spaced).strip()

def related_words2(keyword, seuil=0.3):
    """Return the cleaned word2vec neighbours of ``keyword`` above a similarity threshold.

    Args:
        keyword: Vocabulary key to look up in the loaded word-vector model.
        seuil: Minimum similarity score a neighbour must reach to be kept.

    Returns:
        A set of neighbours normalised with ``nettoyer_mot``; an empty set
        when the lookup raises ``KeyError`` (presumably because ``keyword``
        is not in the model's vocabulary).
    """
    try:
        # topn is far larger than the number of loaded vectors, so in
        # practice every neighbour is returned and only `seuil` filters.
        voisins = model.most_similar(keyword, topn=5000000)
    except KeyError:
        return set()

    retenus = {mot for mot, score in voisins if score >= seuil}
    return {nettoyer_mot(mot) for mot in retenus}
    

def score_similarite(chanson, chanteur, theme):
    """Grade how strongly a song's lyrics relate to a theme.

    The grade depends on how many distinct lyric tokens also appear among the
    word2vec neighbours of ``theme``; the shared words are printed.

    Args:
        chanson: Song title.
        chanteur: Artist name.
        theme: Theme keyword passed to ``related_words2``.

    Returns:
        A letter grade based on the number of shared words:
        "A+" (20 or more), "A" (15-19), "B" (10-14), "C" (5-9),
        "D" (1-4) or "E" (none).
    """
    vocabulaire_theme = related_words2(theme)
    mots_chanson = texte_paroles(paroles(chanson, chanteur))
    mots_communs = mots_chanson.intersection(vocabulaire_theme)
    print(mots_communs)

    # An earlier ratio-based variant was:
    #   len(mots_communs) / max(1, len(mots_lyrics))
    nb_communs = len(mots_communs)
    # Thresholds are checked from the highest down; the first one reached wins.
    bareme = ((20, "A+"), (15, "A"), (10, "B"), (5, "C"), (1, "D"))
    for minimum, note in bareme:
        if nb_communs >= minimum:
            return note
    return "E"

# Singer name -> community labels associated with that singer (the labels are
# keys of `communautes`; presumably the saved output of the commented-out
# chanteurs_activists step).
chanteurs_themes = {
    'M.I.A.': ['politics'],
    'KRS-One': ['human rights', 'animal rights'],
    'Killer Mike': ['social'],
    'Michael Cuccione': ['health'],
    'L7a9d': ['human rights'],
    'Saba Saba': ['social'],
    'Rebecca Moore': ['animal rights'],
    'Joaquin Phoenix': ['animal rights'],
    'Wyclef Jean': ['politics'],
    'Adé Bantu': ['social'],
    'Tupac Shakur': ['politics', 'human rights'],
    'Mai Khôi': ['politics'],
    'Jacline Mouraud': ['social'],
    'Madeleina Kay': ['politics'],
    'Topher': ['politics'],
    'Montana Tucker': ['social'],
}
# for chanteur, themes_associes in chanteurs_themes.items():
#     songs = chansons(chanteur, 5)  
#     for theme in themes_associes:
#         scores = []
#         for chanson in songs:
#             score_chanson = score_similarite(chanson, chanteur, theme)
#             scores.append(score_chanson)
#         print (chanteur, theme, scores)