In [1]:
from collections import Counter
import pandas as pd
from tqdm import tqdm
import pickle

# Gensim
import gensim
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

In [2]:
# Apply Joris correction of the year of each song

PATH_DF = 'english_cleaned_lyrics.csv'
PATH_CORRECTION = 'indx2newdate.p'

def load_dataset(data_path, path_correction):
    df = pd.read_csv(data_path)
    indx2newdate = pickle.load(open(PATH_CORRECTION, 'rb'))
    df['year'] = df['index'].apply(lambda x: int(indx2newdate[x][0][:4]) if indx2newdate[x][0] != '' else 0)
    return df[df.year > 1960][['song', 'year', 'artist', 'genre', 'lyrics']]
    
dataset = load_dataset(PATH_DF, PATH_CORRECTION)

### 1. Count how many songs of each genre are in the data set, and pick a genre that 1) you think is interesting to explore and 2) has over 5.000 songs. Make a subset of the data set only containing songs of that genre; this is the data set you work with for the rest of these exercises.

In [3]:
dataset.groupby(by="genre").count()["song"].sort_values()

genre
Folk           1373
R&B            2338
Other          2449
Indie          2489
Jazz           5068
Electronic     5194
Country       10545
Hip-Hop       14878
Metal         15671
Pop           23295
Rock          77556
Name: song, dtype: int64

I will choose Hip-Hop

In [4]:
hiphop = dataset[dataset.genre == "Hip-Hop"]

### 2. Inspect the number of songs for each year, either using a data frame or using a visualization. Do you think you have enough songs for each year (at least more than fifty)? If not, filter out the years that do not contain enough songs.

In [5]:
ids = hiphop.groupby(by="year").count()["song"].apply(lambda v: v if v <= 50 else None).dropna().index
ids

Int64Index([1961, 1963, 1966, 1969, 1970, 1971, 1974, 1977, 1978, 1979, 1980,
            1981, 1982, 1983, 1984, 1985, 1986, 1987],
           dtype='int64', name='year')

There isn't any year where the number of songs is lower or equal than 50. So we will keep all years for our analysis.

### 3. Process the texts of your genre (and only your genre!) using Spacy. Extract the lemmatized tokens for each song, and remove stopwords.

In [6]:
# Process the texts
data = hiphop[~hiphop.year.isin(ids)].sample(n=5000).lyrics.values  # taking a small sample because my computer crashes with a bigger one

In [7]:
import spacy
nlp = spacy.load("en_core_web_sm")
processed_text = [nlp(text.lower()) for text in tqdm(data)]

100%|██████████| 5000/5000 [06:08<00:00, 13.55it/s]


In [11]:
import json
lemma = [[token.lemma_ for token in text if not token.is_punct and not token.is_stop] for text in processed_text]
with open('lemma.json', 'w') as file:
    json.dump(lemma, file)

In [10]:
import json
with open('lemma.json') as json_file:
    lemma = json.load(json_file)

### 4. Create a dictionary and filter out the words that occur less than three times, and all words that occur in over 85% of the documents.

In [22]:
# Vectorization

from gensim.corpora import Dictionary

MIN_DF = 3 # minium document frequency
MAX_DF = 0.85 # maximum document frequency
dictionary = Dictionary(lemma) # get the vocabulary
dictionary.filter_extremes(no_below=MIN_DF, 
                           no_above=MAX_DF)
corpus = [dictionary.doc2bow(text) for text in lemma]

### 5. Train a topic model with 50 topics and inspect the output, both using the ten most relevant words for each topics, and using pyLDAvis. Now also run a topic model with 20 topics, and one with 100 topics. Be sure to save the models using lda.save(’folder/to/save’) What number of topics does result in the “best” topics? [Note: how you operationalize “best” is up to you]

In [None]:
# Train the model (50 topics)

from gensim.models.wrappers import LdaMallet

PATH_TO_MALLET = '/home/pablo/Documents/github-repos/Mallet/bin/mallet'
N_TOPICS = 50
N_ITERATIONS = 1000

lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [56]:
lda.save('fifty_topics.txt')

In [57]:
# Train the model (20 topics)

from gensim.models.wrappers import LdaMallet

PATH_TO_MALLET = '/home/pablo/Documents/github-repos/Mallet/bin/mallet'
N_TOPICS = 20
N_ITERATIONS = 1000

lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [59]:
lda.save('twenty_topics.txt')

In [60]:
# Train the model (20 topics)

from gensim.models.wrappers import LdaMallet

PATH_TO_MALLET = '/home/pablo/Documents/github-repos/Mallet/bin/mallet'
N_TOPICS = 100
N_ITERATIONS = 1000

lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [62]:
lda.save('hundred_topics.txt')

In [27]:
# Load models (I added this so I don't have to train them all the time)

lda_twenty = LdaMallet.load('twenty_topics.model')
lda_fifty = LdaMallet.load('fifty_topics.model')
lda_hundred = LdaMallet.load('hundred_topics.model')

In order to decide which models performs best, I will use the coherence score of each of them. This index measures how coherent the topics are with the words of our dataset. The closest it is to 1, the better the model performs.

In [24]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_hundred, texts=lemma, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  nan


  score += np.sum((self.eta - _lambda) * Elogbeta)
  score += np.sum(gammaln(_lambda) - gammaln(self.eta))



Coherence Score:  0.4779881143808568


In [25]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_fifty, texts=lemma, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  nan

Coherence Score:  0.4480110762777182


In [26]:
# Compute Perplexity
lda_conv = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_twenty)
print('\nPerplexity: ', lda_conv.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_twenty, texts=lemma, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  nan

Coherence Score:  0.42100125806247934


Hundred topics seems to work best as it has the highest coherence score.