# Ejercicio 1: Preparar el ambiente de trabajo

In [32]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import glob
import os

import seaborn as sns
import warnings

In [33]:
# Read every CSV stored under dump/ and collect the resulting frames in a list.
# glob returns paths in arbitrary, filesystem-dependent order, so they are
# sorted to keep the row order of the combined data identical across runs.
files_list = []
for filename in sorted(glob.glob('dump/*.csv')):
    # The glob pattern is relative, so the path already resolves against the
    # current working directory. Handing it straight to pandas also means the
    # file is decoded as UTF-8 (read_csv's default) instead of the platform's
    # locale encoding that a bare open() would use.
    # 'Unnamed: 0' is dropped from every file -- presumably the index column
    # written when the CSV was saved; verify against the dump files.
    temp = pd.read_csv(filename).drop('Unnamed: 0', axis=1)
    files_list.append(temp)

In [34]:
# Combine all the per-file frames into one global dataframe.
# A single concat call copies the data once; concatenating inside a loop
# re-copies the accumulated frame on every iteration (quadratic cost).
# pd.concat raises ValueError on an empty list, so the no-files case keeps
# producing an empty frame, as the loop-based version did.
df = pd.concat(files_list, axis=0, ignore_index=True) if files_list else pd.DataFrame()
    

In [35]:
# Relabel the columns positionally with their Spanish names.
column_names = ['artista', 'genero', 'cancion', 'letra']
df.columns = column_names

In [36]:
# Preview the first five rows of the combined data.
df.head(n=5)

Unnamed: 0,artista,genero,cancion,letra
0,Public Enemy,hiphop,You're Gonna Get Yours,"(Flavor Flav) \n Oh-oh Chuck, they out to get ..."
1,Public Enemy,hiphop,Sophisticated Bitch,"That woman in the corner, cold playin' the rol..."
2,Public Enemy,hiphop,Miuzi Weighs A Ton,"Yo Chuck, run a power move on them \n (Yeah) \..."
3,Public Enemy,hiphop,Timebomb,"(Intro - Flavor Flav) \n Hey, Chuck, we got so..."
4,Public Enemy,hiphop,Too Much Posse,(Intro - Flavor Flav) \n What do you got to sa...


# Ejercicio 2: Matriz de ocurrencias

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# Word-count vectorizer configured with scikit-learn's built-in English stop-word list.
count_vectorizer = CountVectorizer(stop_words='english')

In [39]:
# Learn the vocabulary from the lyrics column and build the document-term count matrix.
count_vectorizer_fit = count_vectorizer.fit_transform(df['letra'])

In [40]:
# Vocabulary learned by the vectorizer. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old accessor on releases that predate it.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    words = list(count_vectorizer.get_feature_names_out())
else:
    words = count_vectorizer.get_feature_names()
# Total occurrences of each word across all documents. Summing the sparse
# matrix directly avoids materialising the dense documents-by-vocabulary
# array that .toarray() would allocate; asarray + ravel flattens the
# 1 x n_words result into a plain 1-D array.
words_freq = np.asarray(count_vectorizer_fit.sum(axis=0)).ravel()



In [41]:
# Pair each vocabulary word with its total count. Building the frame from
# named columns keeps 'freq' numeric; transposing a [words, counts] list of
# lists leaves both columns with dtype object.
words_freq_df = pd.DataFrame(
    {'word': list(words), 'freq': np.asarray(words_freq)},
    columns=['word', 'freq'],
)

### 5000 palabras más repetidas

In [42]:
# Label the two columns, then keep the 5000 rows with the highest frequency.
words_freq_df.columns = ['word', 'freq']
words_freq_df = (
    words_freq_df
    .sort_values('freq', ascending=False)
    .head(5000)
)

In [43]:
# Show the ten most frequent words.
words_freq_df.head(10)

Unnamed: 0,word,freq
24950,like,19629
12859,don,17398
23856,know,14962
18439,got,14171
23287,just,13978
25479,love,11268
48591,yeah,11071
25165,ll,10028
29851,oh,9879
7009,cause,8356


# Ejercicio 3: Entrenamiento del Modelo

In [44]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [48]:
# Hyperparameter grid handed to the grid search: candidate values for the
# estimator's n_components and learning_decay.
params = dict(
    n_components=[5, 10, 15],
    learning_decay=[0.7, 0.5],
)

In [49]:
# Grid search over `params` with 5-fold cross-validation, using all available
# cores and verbose progress output.
# LDA's fitting procedure is randomly initialised, so the estimator is seeded
# to make the selected hyperparameters and the fitted topics reproducible
# across kernel restarts.
get_params = GridSearchCV(LatentDirichletAllocation(random_state=42), params, cv=5, n_jobs=-1, verbose=3)

In [50]:
# Fit the grid search on the document-term matrix to obtain the hyperparameters
get_params.fit(count_vectorizer_fit)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Report the winning hyperparameter combination, then its score, one per line.
print(get_params.best_params_, get_params.best_score_, sep='\n')

{'learning_decay': 0.5, 'n_components': 5}
-2573072.776658353


De acuerdo al GridSearch realizado, obtenemos que:

* n_components=5
* learning_decay=0.5

Son la mejor combinación de hiperparámetros para nuestro modelo.

# Ejercicio 4: Inferencia e Identificación de Tópicos

In [51]:
# Keep a handle on the estimator the grid search exposes as its best one
best_estimator = get_params.best_estimator_

In [54]:
# .components_ holds one row per topic with a weight for every vocabulary word.
# Resolve the vocabulary once, outside the loop: the original code re-fetched
# the full feature-name list for every single word index. get_feature_names()
# was also removed in scikit-learn 1.2, so prefer get_feature_names_out() and
# fall back to the old accessor on releases that predate it.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Number of highest-weighted words printed for each topic.
N_TOP_WORDS = 15

for topic_id, topic_weights in enumerate(best_estimator.components_):
    # Topics are reported 1-based.
    print("tópico: {}".format(topic_id + 1))
    # argsort orders indices by ascending weight; the reversed slice takes the
    # last N_TOP_WORDS of them, i.e. the heaviest words first.
    top_word_indices = topic_weights.argsort()[:-N_TOP_WORDS - 1:-1]
    # Map each index back to its word and join the words with dashes.
    print("-".join(feature_names[i] for i in top_word_indices))

tópico: 1
like-got-yeah-don-man-ya-yo-know-rock-cause-ain-let-just-come-make
tópico: 2
like-shit-got-fuck-nigga-ain-don-know-just-niggas-em-cause-yo-man-bitch
tópico: 3
future-moment-song-lyrics-boom-page-random-display-able-hopefully-licensed-unfortunately-ba-bye-chick
tópico: 4
don-love-know-just-oh-ll-yeah-baby-like-got-ve-let-want-time-come
tópico: 5
life-la-god-death-blood-die-dead-eyes-time-world-soul-like-hell-black-war


In [55]:
# List the distinct genre labels present in the data.
df['genero'].unique()

array(['hiphop', 'metal', 'pop', 'rock'], dtype=object)

### Tópicos asociados a cada clase inferida

# Ejercicio 5: Identificación de probabilidades

In [61]:
# Transform the document-term matrix into each document's topic distribution
# (one row per document, one column per topic).
fit_best_lda = best_estimator.transform(count_vectorizer_fit)

# Wrap the distribution, rounded to 3 decimals for display, in a dataframe that
# shares the index of the original data and has one labelled column per topic.
topics_for_each_doc = pd.DataFrame(
    np.round(fit_best_lda, 3),
    index=df.index,
    columns=["T: {}".format(topic) for topic in range(1, best_estimator.n_components + 1)],
)
# Append the per-document topic probabilities to the original data.
concatenated_df = pd.concat([df, topics_for_each_doc], axis=1)
# Dominant topic (1-based) per document. The argmax is taken on the unrounded
# probabilities: rounding first can turn near-equal values into ties, which
# argmax then resolves in favour of the lowest-numbered topic.
concatenated_df['highest_topic'] = np.argmax(fit_best_lda, axis=1) + 1

In [63]:
# Preview the first five rows, now including the topic probabilities and the dominant topic.
concatenated_df.head(n=5)

Unnamed: 0,artista,genero,cancion,letra,T: 1,T: 2,T: 3,T: 4,T: 5,highest_topic
0,Public Enemy,hiphop,You're Gonna Get Yours,"(Flavor Flav) \n Oh-oh Chuck, they out to get ...",0.599,0.027,0.098,0.274,0.001,1
1,Public Enemy,hiphop,Sophisticated Bitch,"That woman in the corner, cold playin' the rol...",0.461,0.191,0.001,0.346,0.001,1
2,Public Enemy,hiphop,Miuzi Weighs A Ton,"Yo Chuck, run a power move on them \n (Yeah) \...",0.877,0.001,0.001,0.001,0.121,1
3,Public Enemy,hiphop,Timebomb,"(Intro - Flavor Flav) \n Hey, Chuck, we got so...",0.9,0.001,0.001,0.071,0.028,1
4,Public Enemy,hiphop,Too Much Posse,(Intro - Flavor Flav) \n What do you got to sa...,0.617,0.103,0.002,0.276,0.002,1


### Matriz de correlaciones: probabilidad tópicos

In [None]:
# Keep the genre label together with every topic-probability column. The
# previous selection asked for a column named '' that does not exist in
# concatenated_df.
# NOTE(review): the intended columns are inferred from the section heading
# (correlation of topic probabilities) -- confirm this is the desired subset.
topic_corr = concatenated_df.loc[:, ['genero'] + list(topics_for_each_doc.columns)]