In [2]:
!pip install better_profanity

Collecting better_profanity
  Downloading better_profanity-0.7.0-py3-none-any.whl (46 kB)
[?25l[K     |███████                         | 10 kB 19.3 MB/s eta 0:00:01[K     |██████████████▏                 | 20 kB 24.7 MB/s eta 0:00:01[K     |█████████████████████▎          | 30 kB 28.7 MB/s eta 0:00:01[K     |████████████████████████████▍   | 40 kB 20.2 MB/s eta 0:00:01[K     |████████████████████████████████| 46 kB 2.1 MB/s 
[?25hInstalling collected packages: better-profanity
Successfully installed better-profanity-0.7.0


In [28]:
#libreria para medir el profanity.

from better_profanity import profanity
import os

import numpy as np
import pandas as pd
from nltk.tokenize import WordPunctTokenizer


In [55]:
def load_emolex(path = "emolex"):
  """
  Método para cargar el emolex, indica la emoción de una palabra (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust)
  y 2 sentimientos (positive. negative)
  """

 #Realizar unzip al archivo emolex
  !unzip emolex.zip
  vocab = {}
  base_path = path


  #Cargar los emolex en un diccionario
  for lexicon in os.listdir(base_path):
    with open(os.path.join(base_path, lexicon)) as f:
      vocab[lexicon.split(".")[0]] = f.read().split("\n")


  #regresar el vocab con los emolex
  return vocab


In [59]:
def count_profanity_sentences(doc):
  """
  Método para identificar la cantidad de groserias dentro de un texto
  """
  doc_prof = [profanity.censor(word) for word in str(doc).split(" ")]
  doc_prof = [word for word in doc_prof if "*" in word]

  return len(doc_prof)

In [75]:
# Función auxiliar para estimar la distribución de emociones en un texto
def emotion_count(text, vocab):
  """
  Función para estimar la distribuciones de emociones del emolex
  """
  words = WordPunctTokenizer().tokenize(text) #separamos palabras
  counts = {i: 0 for i in list(vocab.keys())}
  for word in words:
      for emo in vocab:
          if word in vocab[emo]:
              counts[emo] += 1
  return counts

In [84]:
# Consolidamos los sentimientos encontrados en cada artista 

songs_sentiment = []

for i in np.unique(songs_df['artist']):
  songs_sentiment.append(emotion_count(" ".join(np.array(songs_df['lyric'])[songs_df['artist']==i]), vocab))

artist_sentiment = pd.DataFrame(songs_sentiment)
artist_sentiment.set_index(np.unique(songs_df['artist']), inplace=True)

In [107]:
def emolex_df(artist, corpus, vocab):
  """
  Dataframe con el % de emociones que se tiene para cada artista, cada columna es un sentimiento

  artist -> listado de artistas
  corpus -> lyrics por artistas
  vocab -> vocabulario con el emolex de palabras y su sentimiento asociado
  """

  #Lista vacia para agregar el conteo de emolexs
  artist_sentiment =[]

  for i in np.unique(artist):
    artist_sentiment.append(emotion_count(" ".join(np.array(corpus)[artist==i]), vocab))

  #Se crea un DataFrame con los valores y se dejan en terminos porcentuales sobre el total de apariciones por artista (normalización)
  artist_sentiment = pd.DataFrame(artist_sentiment)
  artist_sentiment.set_index(np.unique(artist), inplace=True)

  #Dejar en terminos porcentuales el conteo de sentimientos.
  artist_sentiment['Total'] = artist_sentiment.sum(axis=1)

  #iterar en cada columna para obtener el % sobre el total
  for column in artist_sentiment.columns:
    artist_sentiment[f"{column}"] =  (artist_sentiment[f"{column}"] / artist_sentiment['Total'] * 100 )
    
  artist_sentiment.drop(['Total'],axis=1, inplace= True)

  return artist_sentiment

In [56]:
vocab = load_emolex()

Archive:  emolex.zip
replace emolex/sadness.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: emolex/sadness.txt      
  inflating: emolex/negative.txt     
  inflating: emolex/trust.txt        
  inflating: emolex/surprise.txt     
  inflating: emolex/fear.txt         
  inflating: emolex/anticipation.txt  
  inflating: emolex/disgust.txt      
  inflating: emolex/positive.txt     
  inflating: emolex/joy.txt          
  inflating: emolex/anger.txt        


In [96]:
artist_songs_df = pd.read_csv("artist_corpus.csv",)

In [97]:
artist_songs_df = artist_songs_df[['artist','lyric']]

In [108]:
df_artist_emolex = emolex_df(artist_songs_df['artist'], artist_songs_df['lyric'], vocab)

In [110]:
df_artist_emolex.to_csv("df_features_emolex.csv")