In [34]:
import re
import requests
import json
import darklyrics
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import TweetTokenizer
from string import punctuation
from dotenv import dotenv_values
from IPython import display

In [4]:
# Download the NLTK stopword list and the "punkt" tokenizer data used by the tokenization cells
nltk.download('stopwords')
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /home/ph/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ph/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Carrega variáveis de ambiente

In [5]:
# Read the key/value pairs from the local ".env" file; the Last.fm helper looks up config["LAST_FM_KEY"]
config = dotenv_values(".env")

### Informe na variável artist o nome da banda que deseja estudar

Visite o site http://www.darklyrics.com/ para ver a lista das bandas disponíveis

In [6]:
# Band to analyse; this module-level name is read by the Last.fm and darklyrics calls in later cells
artist = "iron maiden"

In [23]:
def last_fm_get_data(method: str, filter: dict, timeout: float = 10.0) -> dict:
    """Call a Last.fm API method and return the decoded JSON response.

    Args:
        method: Last.fm method name, e.g. ``'artist.getInfo'``.
        filter: Extra query parameters for the method (e.g. ``{"artist": ...}``).
            On key collision these override the default ``api_key``/``method``/
            ``format`` entries. The dict passed in is not modified.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise an empty dict.

    Raises:
        KeyError: if ``LAST_FM_KEY`` is missing from ``config``.
    """
    headers = {
        'user-agent': 'Dataquest'
    }

    payload = {
        'api_key': config["LAST_FM_KEY"],
        'method': method,
        'format': 'json'
    }

    # Caller-supplied parameters are merged last so they take precedence.
    payload.update(filter)

    url = 'https://ws.audioscrobbler.com/2.0/'

    r = requests.get(url, headers=headers, params=payload, timeout=timeout)

    if r.status_code == 200:
        return json.loads(r.content)

    # Any non-200 answer is reported to callers as "no data".
    return {}
    

In [20]:
def last_fm_get_artist_info():
    """Return the Last.fm ``artist.getInfo`` response for the notebook-level ``artist``."""
    return last_fm_get_data('artist.getInfo', {"artist": artist})

In [21]:
def last_fm_get_album_info(album: str):
    """Return the Last.fm ``album.getInfo`` response for ``album`` by the notebook-level ``artist``."""
    query = {"artist": artist, "album": album}
    return last_fm_get_data('album.getInfo', query)

In [None]:
# Fetch the artist metadata from Last.fm and print the raw response dict
artist_dict = last_fm_get_artist_info()
print(artist_dict)

## Retorna os álbuns da banda

In [24]:
try:
    albuns = darklyrics.get_albums(artist)

except IndexError as err:
    # Presumably darklyrics raises IndexError when the artist page has no match
    # (TODO confirm). Chain explicitly so the original traceback is kept as the cause.
    raise IndexError("Artista não encontrado") from err

In [25]:
# Report how many albums darklyrics returned for the artist
print(f"{artist} tem {len(albuns)} discos encontrados")

iron maiden tem 28 discos encontrados


## Retornar informações dos álbuns

In [26]:
# Replace each album name in `albuns` with its Last.fm `album.getInfo` response.
new_albuns = []

for album in albuns:
    if isinstance(album, dict):
        # Already a Last.fm response: this cell was run before. Keeping the entry
        # makes a re-run harmless instead of sending a dict as the album name.
        new_albuns.append(album)
    else:
        new_albuns.append(last_fm_get_album_info(album))

albuns = new_albuns

## Imprimir as capas dos álbuns

In [None]:
# Uses the `display` module imported at the top of the notebook (`from IPython import display`)
# instead of importing `Image` here: rebinding the notebook-level name `Image` would shadow
# PIL's `Image`, which the word-cloud cell needs for `Image.open`.
for album in albuns:
    # A failed Last.fm lookup yields {} and some albums may lack artwork; skip those.
    images = album.get("album", {}).get("image", [])
    cover_url = next(
        (img.get("#text") for img in images if img.get("size") == "large" and img.get("#text")),
        None,
    )
    if cover_url:
        display.display(display.Image(url=cover_url))

## Agrupar as músicas por álbum

In [None]:
# Build one {"name", "tracks"} record per album from the Last.fm responses.
album_track = []

for album in albuns:
    album_info = album.get("album")
    if not album_info:
        # The request helper returns {} for failed lookups; nothing to group here.
        continue

    tracks = (album_info.get("tracks") or {}).get("track", [])
    if isinstance(tracks, dict):
        # NOTE(review): Last.fm appears to return a bare dict when an album has a
        # single track; wrap it so the loop below still sees a list. Confirm.
        tracks = [tracks]

    album_track.append({
        "name": album_info["name"],
        "tracks": [{"name": track["name"], "duration": track["duration"]} for track in tracks],
    })

print(json.dumps(album_track, indent=4, sort_keys=True))

In [28]:
# Pretty-print the raw Last.fm album responses as indented JSON with sorted keys
print(json.dumps(albuns, indent=4, sort_keys=True))

[
    {
        "album": {
            "artist": "Iron Maiden",
            "image": [
                {
                    "#text": "https://lastfm.freetls.fastly.net/i/u/34s/72e43a38898e88c285a131f497ae7092.png",
                    "size": "small"
                },
                {
                    "#text": "https://lastfm.freetls.fastly.net/i/u/64s/72e43a38898e88c285a131f497ae7092.png",
                    "size": "medium"
                },
                {
                    "#text": "https://lastfm.freetls.fastly.net/i/u/174s/72e43a38898e88c285a131f497ae7092.png",
                    "size": "large"
                },
                {
                    "#text": "https://lastfm.freetls.fastly.net/i/u/300x300/72e43a38898e88c285a131f497ae7092.png",
                    "size": "extralarge"
                },
                {
                    "#text": "https://lastfm.freetls.fastly.net/i/u/300x300/72e43a38898e88c285a131f497ae7092.png",
                    "size": "mega

### Retorne o nome das músicas

In [None]:
# Fetch the artist's song names from darklyrics
songs = darklyrics.get_songs(artist)

In [None]:
# Number of song names returned, before duplicates are removed
print(f"{artist} tem {len(songs)} músicas encontradas")

### Remove nome de músicas duplicadas

In [None]:
# Collapse duplicate names; a set does not preserve the original ordering
songs = set(songs)

In [None]:
# Number of distinct song names after deduplication
print(f"{artist} tem {len(songs)} músicas distintas encontradas")

### Retorna as letras de todas as músicas da banda

In [None]:
# Fetch all lyrics for the artist; the following cells slice and .replace() the result as one string
lyrics = darklyrics.get_all_lyrics(artist)

In [None]:
# len() here is the length of the whole lyrics text (a character count, assuming `lyrics` is a str)
print(f"{artist} tem a quantidade de {len(lyrics)} letras nas músicas encontradas")

In [None]:
# Preview the first 100 characters of the raw lyrics text
lyrics[:100]

In [None]:
# Flatten the text onto one line: each pair of consecutive newlines (or a lone newline)
# becomes a single space, then surrounding whitespace is trimmed.
lyrics = re.sub(r"\n\n?", " ", lyrics).strip()

In [None]:
# The value is the character count of the flattened lyrics text; nothing was deduplicated,
# so the message no longer claims "distinct" lyrics.
print(f"{artist} tem {len(lyrics)} caracteres nas letras após a remoção das quebras de linha")

## Criar uma nuvem de palavras

Seleciona as palavras mais repetidas e cria uma nuvem de palavras

In [None]:
# Tokens to ignore: NLTK's English stopwords plus every ASCII punctuation character.
stop_words = set(stopwords.words('english'))
stop_words.update(punctuation)  # a str is iterated character by character
stop_words.add('-')

In [None]:
def get_tokens_from_text(text: str, keep_punkt=True):
    """Split ``text`` into a list of tokens.

    ``keep_punkt`` selects the tokenizer: when true, ``nltk.word_tokenize`` is
    used; otherwise a fresh ``TweetTokenizer`` tokenizes the text.
    """
    if not keep_punkt:
        return TweetTokenizer().tokenize(text)
    return nltk.word_tokenize(text)

In [None]:
def get_top_most_words(top=10, text=None, ignored=None):
    """Return the ``top`` most frequent words as ``(word, count)`` pairs.

    Words are the whitespace-separated pieces of the text, lower-cased before
    both filtering and counting so that e.g. "Fear" and "fear" are tallied
    together (previously only the stop-word check ignored case).

    Args:
        top: How many entries to return.
        text: Text to analyse; defaults to the notebook-level ``lyrics``.
        ignored: Collection of lower-case words to skip; defaults to the
            notebook-level ``stop_words``.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    if text is None:
        text = lyrics
    if ignored is None:
        ignored = stop_words

    lowered = (raw.lower() for raw in text.split())
    counts = Counter(word for word in lowered if word not in ignored)

    return counts.most_common(top)
    

In [None]:
# Set the `top` argument of get_top_most_words to choose how many words are returned

most_common = get_top_most_words(top=20)

In [None]:
# Show the (word, count) pairs selected above
print(most_common)

In [None]:
def create_word_cloud(words, mask_path: str = "resources/cloud.png"):
    """Draw a word cloud in which each word is sized by its count.

    Args:
        words: Iterable of ``(word, count)`` pairs, e.g. the output of
            ``Counter.most_common``. Entries without a count weigh 1.
        mask_path: Image file whose pixels are passed to ``WordCloud`` as the mask.

    The previous version joined the words into one string, so every word
    occurred once and the counts had no effect on the rendering.
    """
    # Import locally under an alias so the mask is always opened with PIL, even if
    # the notebook-level name `Image` has been rebound (IPython also has an `Image`).
    from PIL import Image as PILImage

    frequencies = Counter()
    for entry in words:
        weight = entry[1] if len(entry) > 1 else 1
        # Lower-case and split the entry; strip punctuation stuck to a token.
        for raw_token in str(entry[0]).lower().split():
            token = raw_token.strip(punctuation)
            if token and token not in stop_words:
                frequencies[token] += weight

    # Mask image as a pixel array
    custom_image = np.array(PILImage.open(mask_path))

    # Build the cloud from the aggregated counts
    wordcloud = WordCloud(
        background_color='white',
        stopwords=stop_words,
        mask=custom_image).generate_from_frequencies(frequencies)

    # Display the cloud without axes or padding
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Render the word cloud for the most frequent words
create_word_cloud(most_common)

In [None]:
# Tabulate the (word, count) pairs with one row per word
df = pd.DataFrame(most_common, columns=['word', 'total'])

In [None]:
# Display up to the first 20 rows of the word/total table
df.head(20)

In [None]:
def autopct(pct):
    """Format a pie-wedge percentage with two decimals; wedges of 10% or less get no label."""
    if pct > 10:
        return f"{pct:.2f}"
    return ''

In [None]:
%matplotlib inline

my_labels = [d[0] for d in most_common]

ax = df['total'].value_counts().plot(kind='pie', figsize=(30,16), autopct=autopct, labels=None)
ax.axes.get_yaxis().set_visible(False)
plt.legend(loc=5, labels=my_labels)

## Contando as frases que mais se repetem nas letras das músicas

In [None]:
phrase_counter = Counter()
texto = ''  # NOTE(review): not used in this cell; kept in case another cell reads it
# Matches a leading "Word: " prefix and captures the rest of the sentence
non_speaker = re.compile('[A-Za-z]+: (.*)')
length = 3    # number of words per phrase

for sent in nltk.sent_tokenize(lyrics):
    strip_speaker = non_speaker.match(sent)

    if strip_speaker is not None:
        # Drop the "Word: " prefix and keep only the text after it
        sent = strip_speaker.group(1)
    words = get_tokens_from_text(sent, False)

    for phrase in ngrams(words, length):
        # stop_words holds lower-case entries, so compare case-insensitively;
        # otherwise capitalised stopwords such as "I" or "The" are not filtered.
        if all(word.lower() not in stop_words for word in phrase):
            phrase_counter[phrase] += 1

most_common_phrases = phrase_counter.most_common(10)

for k, v in most_common_phrases:
    print(' '.join(k))

## Brincando com o NLTK Frequency Distribution

In [None]:
from nltk import FreqDist

# Build a frequency distribution over the lyric tokens
lyric_freqdist = FreqDist(get_tokens_from_text(lyrics, False))

# Total number of tokens in the corpus; the denominator for normalisation.
# (A bare `most_common()` call used to sit here, but its result was discarded
# because it was not the last expression of the cell.)
total_word_count = sum(lyric_freqdist.values())

# Top 25 tokens by raw count
lyric_top_25 = lyric_freqdist.most_common(25)

# Normalised frequency = token count / total token count
print("Word\t\t\tNormalized Frequency")
for word, count in lyric_top_25:
    normalized_frequency = count / total_word_count
    print("{} \t\t\t {:.4}".format(word, normalized_frequency))

### Identificar quem é o membro da banda que mais escreve música

In [None]:
# Collect every "[...]" span in the lyrics (non-greedy, so each bracket pair is matched separately)
# NOTE(review): this only lists the bracketed text; no per-member tally is computed yet
text_in_brackets = re.findall(r'\[.*?\]', lyrics)
print(text_in_brackets)