# Ejercicio 1: Preparar el ambiente de trabajoº

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import glob
import os

import seaborn as sns
import warnings

In [2]:
# lectura archivos alojados en dump - se guardan en lista
files_list = []
for filename in glob.glob('dump/*.csv'):
    with open(os.path.join(os.getcwd(), filename), 'r') as f:
        temp = pd.read_csv(f).drop('Unnamed: 0', axis=1)
        files_list.append(temp)

In [3]:
# todos los datos a un dataframe global
df = pd.DataFrame()
for data_idx in range(0, len(files_list)):
    df = pd.concat([df, files_list[data_idx]], axis=0, ignore_index=True)
    

In [4]:
# renaming columns
df.columns = ['artista', 'genero', 'cancion', 'letra']

In [5]:
df.head()

Unnamed: 0,artista,genero,cancion,letra
0,Public Enemy,hiphop,You're Gonna Get Yours,"(Flavor Flav) \n Oh-oh Chuck, they out to get ..."
1,Public Enemy,hiphop,Sophisticated Bitch,"That woman in the corner, cold playin' the rol..."
2,Public Enemy,hiphop,Miuzi Weighs A Ton,"Yo Chuck, run a power move on them \n (Yeah) \..."
3,Public Enemy,hiphop,Timebomb,"(Intro - Flavor Flav) \n Hey, Chuck, we got so..."
4,Public Enemy,hiphop,Too Much Posse,(Intro - Flavor Flav) \n What do you got to sa...


# Ejercicio 2: Matriz de ocurrencias

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
count_vectorizer=CountVectorizer(stop_words='english')

In [8]:
count_vectorizer_fit = count_vectorizer.fit_transform(df.letra)

In [9]:
words = count_vectorizer.get_feature_names()
words_freq = count_vectorizer_fit.toarray().sum(axis=0)



In [10]:
words_freq_df = pd.DataFrame([list(words), list(words_freq)]).T

### 5000 palabras más repetidas

In [11]:
words_freq_df.columns= ['word', 'freq']
words_freq_df = words_freq_df.sort_values(by='freq', ascending=False).iloc[0:5000, :]

In [12]:
words_freq_df[:10]

Unnamed: 0,word,freq
24950,like,19629
12859,don,17398
23856,know,14962
18439,got,14171
23287,just,13978
25479,love,11268
48591,yeah,11071
25165,ll,10028
29851,oh,9879
7009,cause,8356


# Ejercicio 3: Entrenamiento del Modelo

In [13]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

In [14]:
params = {'n_components': [5, 10, 15],
          'learning_decay': [0.7, 0.5]}

In [15]:
get_params = GridSearchCV(LatentDirichletAllocation(), params, cv=5, n_jobs=-1, verbose=3)

In [16]:
# ajustamos datos para obtener hiperparámetros
get_params.fit(count_vectorizer_fit)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [18]:
print(get_params.best_params_)
print(get_params.best_score_)

{'learning_decay': 0.7, 'n_components': 5}
-2570848.81262933


De acuerdo al GridSearch realizado, obtenemos que:

* n_components=5
* learning_decay=0.7

Son la mejor combinación de hiperparámetros para nuestro modelo.

# Ejercicio 4 : Inferencia e Identificación de Tópicos

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [22]:
# conjuntos de entrenamiento y validacion
X_train, X_test, y_train, y_test = train_test_split(df['letra'], df['genero'], test_size=.33, random_state=99231)

In [23]:
# definimos etapas del proceso
letters_pipeline = Pipeline( [('cvect', CountVectorizer(stop_words='english')),
                      ('lda', LatentDirichletAllocation(n_components=5, learning_decay=0.7, random_state=99131, n_jobs = -1))])

In [24]:
# ajustamos los datos
letters_pipeline.fit(X_train, y_train)

In [26]:
enumerate(letters_pipeline.components_)

AttributeError: 'Pipeline' object has no attribute 'components_'

In [None]:
# mediante .components_ podemos extraer una matriz que entrega la distribución de palabras por cada tópico.
for topic_id, topic_name in enumerate(letters_pipeline.components_):
    # para cada tópico
    print("tópico: {}".format(topic_id + 1))
    # mediante argsort logramos ordenar los elementos por magnitud
    # para los elementos más relevantes ordenados por argsort, buscamos su correlativo en la matriz dispersa y devolvemos el nombre
    # finalmente concatenamos las palabras
    print(" ".join([counter.get_feature_names()[i] for i in
topic_name.argsort()[:-15 - 1: -1]]))