# Data Mining: lab 3.1

#### Necessary imports

In [1]:
# Counter
from collections import Counter

# gensim
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet

# json
import json

# pandas
import pandas as pd

# pickle
import pickle

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

# tqdm
from tqdm import tqdm, tqdm_notebook

# spacy
import spacy
nlp = spacy.load("en_core_web_sm")

#### Correction of the dataset

In [2]:
PATH_DF = 'english_cleaned_lyrics.csv'
PATH_CORRECTION = 'indx2newdate.p'

def load_dataset(data_path, path_correction):
    df = pd.read_csv(data_path)
    indx2newdate = pickle.load(open(PATH_CORRECTION, 'rb'))
    df['year'] = df['index'].apply(lambda x: int(indx2newdate[x][0][:4]) if indx2newdate[x][0] != '' else 0)
    return df[df.year > 1960][['song', 'year', 'artist', 'genre', 'lyrics']]
    
dataset = load_dataset(PATH_DF, PATH_CORRECTION)

### 1. Count how many songs of each genre are in the data set, and pick a genre that 1) you think is interesting to explore and 2) has over 5.000 songs. Make a subset of the data set only containing songs of that genre; this is the data set you work with for the rest of these exercises.

In [3]:
dataset.groupby(by="genre").count()["song"].sort_values()

genre
Folk           1373
R&B            2338
Other          2449
Indie          2489
Jazz           5068
Electronic     5194
Country       10545
Hip-Hop       14878
Metal         15671
Pop           23295
Rock          77556
Name: song, dtype: int64

I will choose to explore the lyrics of Hip Hop songs.

In [4]:
hiphop = dataset[dataset.genre == "Hip-Hop"]

### 2. Inspect the number of songs for each year, either using a data frame or using a visualization. Do you think you have enough songs for each year (at least more than fifty)? If not, filter out the years that do not contain enough songs.

In [5]:
ids = hiphop.groupby(by="year").count()["song"].apply(lambda v: v if v <= 50 else None).dropna().index
ids

Int64Index([1961, 1963, 1966, 1969, 1970, 1971, 1974, 1977, 1978, 1979, 1980,
            1981, 1982, 1983, 1984, 1985, 1986, 1987],
           dtype='int64', name='year')

In [6]:
data = hiphop[~hiphop.year.isin(ids)].lyrics.values
len(data)

14777

After filtering out of the selection those years where there were less than 50 hiphop songs, there are 14777 songs remaining in the dataset, enough to perform the analysis. 

### 3. Process the texts of your genre (and only your genre!) using Spacy. Extract the lemmatized tokens for each song, and remove stopwords.

I will use nlp.pipe to process the lyrics of the songs.

In [None]:
nlp = spacy.load("en_core_web_sm")

processed_texts = [text 
                   for text in tqdm_notebook(
                       nlp.pipe(
                           data, n_process=-1, disable=["ner", "parser"]
                           ), total=len(data)
                   )
                   ]
lemma = [
         [token.lemma_ for token in text if not token.is_punct and not token.is_stop] 
         for text in processed_texts
         ]

Since the step of processing texts takes a long time, I will save the lemmas as a json file so I don't have to do it again if I restart the runtime.

In [None]:
with open('hiphop_lemma.json', 'w') as file:
    json.dump(lemma, file)

In [8]:
with open('hiphop_lemma.json') as json_file:
    lemma = json.load(json_file)

### 4. Create a dictionary and filter out the words that occur less than three times, and all words that occur in over 85% of the documents.

In [9]:
MIN_DF = 3 # minium document frequency
MAX_DF = 0.85 # maximum document frequency
dictionary = Dictionary(lemma) # get the vocabulary
dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF)
corpus = [dictionary.doc2bow(text) for text in lemma]

### 5. Train a topic model with 50 topics and inspect the output, both using the ten most relevant words for each topics, and using pyLDAvis. Now also run a topic model with 20 topics, and one with 100 topics. Be sure to save the models using lda.save('folder_to_save') What number of topics does result in the “best” topics?

In [10]:
PATH_TO_MALLET = '/home/pablo/Documents/github-repos/Mallet/bin/mallet'
N_TOPICS = 50
N_ITERATIONS = 1000

lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [11]:
lda.save('fifty_topics.model')

In [12]:
PATH_TO_MALLET = '/home/pablo/Documents/github-repos/Mallet/bin/mallet'
N_TOPICS = 20
N_ITERATIONS = 1000

lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [13]:
lda.save('twenty_topics.model')

In [14]:
PATH_TO_MALLET = '/home/pablo/Documents/github-repos/Mallet/bin/mallet'
N_TOPICS = 100
N_ITERATIONS = 1000

lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus,
                id2word=dictionary,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [15]:
lda.save('hundred_topics.model')

In [16]:
lda_twenty = LdaMallet.load('twenty_topics.model')
lda_fifty = LdaMallet.load('fifty_topics.model')
lda_hundred = LdaMallet.load('hundred_topics.model')

In order to decide which models performs best, I will use the coherence score of each of them. This index measures how coherent the topics are with the words of our dataset. The closest it is to 1, the better the model performs.

In [17]:
coherence_model_lda = CoherenceModel(model=lda_hundred, texts=lemma, dictionary=dictionary, coherence='c_v')
coherence_lda_hundred = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda_hundred)


Coherence Score:  0.3919732347888762


In [18]:
coherence_model_lda = CoherenceModel(model=lda_fifty, texts=lemma, dictionary=dictionary, coherence='c_v')
coherence_lda_fifty = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda_fifty)


Coherence Score:  0.4025249677018343


In [19]:
coherence_model_lda = CoherenceModel(model=lda_twenty, texts=lemma, dictionary=dictionary, coherence='c_v')
coherence_lda_twenty = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda_twenty)


Coherence Score:  0.41639203549255577


In our case, the model with twenty topics seems to be the one that performs the best (as it has the highest coherence score)

### 6. Make a change in the preprocessing stage and run the topic model again. This could be: not removing stop words, only selecting nouns (or only nouns, adjectives and verbs – something I do quite often when topic modeling), not using lemmas but tokens, etc. Inspect the output.  Name one benefit and one downside of the change you selected on the preprocessing stage for finding useful topics.

I decided to select the tokens instead of the lemmas, and to also include stopwords in the selection.

In [21]:
with open('hiphop_tokens.json') as json_file:
    tokens = json.load(json_file)

In [22]:
dictionary_tokens = Dictionary(tokens) # get the vocabulary
dictionary_tokens.filter_extremes(no_below=MIN_DF, no_above=MAX_DF)
corpus_tokens = [dictionary.doc2bow(text) for text in tokens]

In [23]:
lda = LdaMallet(PATH_TO_MALLET,
                corpus=corpus_tokens,
                id2word=dictionary_tokens,
                num_topics=N_TOPICS,
                iterations=N_ITERATIONS)

In [31]:
lda.save('twenty_topics_tokens.model')

In [25]:
coherence_model_tokens = CoherenceModel(model=lda, texts=tokens, dictionary=dictionary_tokens, coherence='c_v')
coherence_tokens = coherence_model_tokens.get_coherence()
print('\nCoherence Score: ', coherence_tokens)


Coherence Score:  0.536634905151316
