In [14]:
import pandas as pd
from scipy.stats import zscore
import scipy.linalg as la

In [15]:
# Read the scraped CSV dataset (UTF-8 encoded) into a DataFrame.
filepath = 'scraped_data_1_1999.csv'
data = pd.read_csv(filepath_or_buffer=filepath, encoding='utf-8')

## Estilos más comunes del dataset

In [16]:
# Absolute and relative frequency of every style in the dataset.
styles_count = data['style'].value_counts()
popularity = data['style'].value_counts(normalize=True)

# Full list of style names, in the order produced by value_counts().
styles = styles_count.index.tolist()

# The first ten entries of the proportion table and their style names.
top_ten = popularity.iloc[:10]
top_ten_styles = top_ten.index.tolist()

# Report the results.
print('Estilos totales: ', len(styles))
print(" ")
print('Estilos más comunes:')
print(top_ten)

Estilos totales:  185
 
Estilos más comunes:
American IPA              0.151568
American Pale Ale         0.090761
Saison                    0.032221
Imperial IPA              0.028819
American Amber Ale        0.027418
Blonde Ale                0.022282
Witbier                   0.017712
American Stout            0.017412
American Brown Ale        0.016711
Russian Imperial Stout    0.016511
Name: style, dtype: float64


## Limpieza de los datos

(A completar con el set de al menos 100k observaciones)

In [18]:
# Build one summary row per beer style: identifiers, number of recipes kept,
# per-feature means and the flattened inverse covariance matrix of the features.
#
# Reads `data` (the recipe DataFrame) and `styles` (list of style names) from
# earlier cells; produces `clean_data` with the columns listed in `new_cols`.

# Thresholds used when summarising each style.
OUTLIER_Z_MAX = 2    # keep recipes whose |z-score| is below this on every feature
MIN_RECIPES = 100    # styles with fewer remaining recipes are skipped
MIN_COV_DET = 0.01   # covariance determinants below this are treated as singular

# Only these columns enter the statistics. Selecting them explicitly keeps the
# covariance 3x3 (matching s11..s33) and avoids averaging the string columns,
# which raises in pandas >= 2.0.
feature_cols = ['abv', 'ibu', 'color']

cov_mat_cols = ["s{}{}".format(j + 1, k + 1) for j in range(3) for k in range(3)]
new_cols = ['id', 'style', 'label', 'freq', 'u_ABV', 'u_IBU', 'u_Color'] + cov_mat_cols

# Collect plain rows and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
rows = []
for s in styles:
    # Feature values of the recipes belonging to the current style.
    features = data.loc[data['style'] == s, feature_cols]

    # Drop outliers: keep rows whose z-score is within bounds on all features.
    z_scores = zscore(features.to_numpy())
    features = features.loc[(abs(z_scores) < OUTLIER_Z_MAX).all(axis=1)]

    # Skip under-represented styles before doing any further work.
    freq = len(features.index)
    if freq < MIN_RECIPES:
        continue

    # The covariance matrix must be safely invertible.
    cov = features.cov().to_numpy()
    if la.det(cov) < MIN_COV_DET:
        print("Estilo", s, "tiene matriz de covarianza singular")
        continue

    # Row-major flattening so that s{j}{k} is entry (row j, column k) of the
    # inverse; the matrix is symmetric, so this differs from column-major
    # order only by floating-point rounding.
    inv_cov = la.inv(cov)

    rows.append(
        [s.replace(' ', '_'), s, s, freq]   # id, style, label, frequency
        + features.mean().tolist()          # u_ABV, u_IBU, u_Color
        + inv_cov.ravel().tolist()          # s11 .. s33
    )

clean_data = pd.DataFrame(rows, columns=new_cols)
ctr = len(rows)  # number of styles kept; retained for cells that may still read it

In [24]:

# Export the per-style summary as a JSON array of records, then preview the
# first five rows as the cell output.
clean_data.to_json(path_or_buf='beer_styles.json', orient='records')
clean_data.head(n=5)

Unnamed: 0,id,style,label,freq,u_ABV,u_IBU,u_Color,s11,s12,s13,s21,s22,s23,s31,s32,s33
0,American_IPA,American IPA,American IPA,4052,6.451844,68.747359,7.816604,1.637172,-0.009303,-0.028986,-0.009303,0.001511,-0.000674,-0.028986,-0.000674,0.116052
1,American_Pale_Ale,American Pale Ale,American Pale Ale,2530,5.429435,43.059032,6.966644,2.847424,-0.019387,-0.11952,-0.019387,0.004302,-0.001422,-0.11952,-0.001422,0.196843
2,Saison,Saison,Saison,843,6.080225,28.852147,5.824733,1.419164,-0.021945,-0.070311,-0.021945,0.015212,-0.000529,-0.070311,-0.000529,0.182961
3,Imperial_IPA,Imperial IPA,Imperial IPA,728,8.287294,107.537816,9.52544,1.207887,3.9e-05,-0.020785,3.9e-05,0.000568,0.000972,-0.020785,0.000972,0.075823
4,American_Amber_Ale,American Amber Ale,American Amber Ale,754,5.676698,39.938634,13.936989,2.023453,-0.027221,-0.031097,-0.027221,0.003589,-0.002085,-0.031097,-0.002085,0.091292


In [25]:
# Report how many styles are present in the summary table.
print('Cantidad de estilos: ' + str(len(clean_data.index)))

Cantidad de estilos: 61
