In [75]:
import pandas as pd
from scipy.stats import zscore
import scipy.linalg as la

In [76]:
# Load the CSV dataset into a DataFrame and confirm on stdout once it is read.
filepath = 'brewers_friend_database.csv'
data = pd.read_csv(
    filepath,
    encoding='utf-8',
)
print('Archivo cargado')

Archivo cargado


## Estilos más comunes del dataset

In [77]:
# Absolute number of rows per style, most frequent first.
styles_count = data['style'].value_counts()
# Every distinct style name, in the same frequency order.
styles = styles_count.index.tolist()
# Fraction of the dataset that each style represents.
popularity = data['style'].value_counts(normalize=True)
# The ten most frequent styles and their names.
top_ten = popularity.head(10)
top_ten_styles = top_ten.index.tolist()

# Report the results.
print('Estilos totales: ', len(styles))
print(" ")
print('Estilos más comunes:')
print(top_ten)

Estilos totales:  313
 
Estilos más comunes:
American IPA                      0.140651
American Pale Ale                 0.093721
No Profile Selected               0.074814
American Light Lager              0.037365
Specialty IPA: New England IPA    0.033693
Saison                            0.029037
Blonde Ale                        0.027004
American Amber Ale                0.022150
Weissbier                         0.017991
American Stout                    0.014767
Name: style, dtype: float64


## Preprocesamiento de clases

#### Las siguientes clases son equivalentes y deben combinarse:  
"German Pils" -> "German Pilsner (Pils)"  
"Oktoberfest/Märzen" -> "Märzen"  
"California Common Beer" -> "California Common"  
"Weizen/Weissbier" -> "Weissbier"  
"Scottish Export 80/-" -> "Scottish Export"  
"Standard/Ordinary Bitter" -> "Ordinary Bitter"  
"Wee Heavy" -> "Strong Scotch Ale"  
"Light American Lager" -> "American Light Lager"  
"American Wheat Beer" -> "American Wheat or Rye Beer"  
"Special/Best/Premium Bitter" -> "Best Bitter"  

#### Eliminar siguientes clases:  
"No Profile Selected"  
"Dry Mead"  
"Semi-Sweet Mead"  
"Common Cider"  
"Other Fruit Melomel"  

#### Procesar nombres de las siguientes clases:  
"Experimental Beer"  
"Mixed Style Beer"  


In [78]:
# Alternative style names mapped onto the canonical name they are merged into.
# No canonical name is itself a key, so a single simultaneous replacement gives
# the same result as applying the pairs one after another.
equivs = {
    "German Pils": "German Pilsner (Pils)",
    "Oktoberfest/Märzen": "Märzen",
    "California Common Beer": "California Common",
    "Weizen/Weissbier": "Weissbier",
    "Scottish Export 80/-": "Scottish Export",
    "Standard/Ordinary Bitter": "Ordinary Bitter",
    "Wee Heavy": "Strong Scotch Ale",
    "Light American Lager": "American Light Lager",
    "American Wheat Beer": "American Wheat or Rye Beer",
    "Special/Best/Premium Bitter": "Best Bitter"
}
# Assign the result back to the column instead of calling
# `data['style'].replace(..., inplace=True)`: an in-place call on a column
# selection is chained assignment, which warns on recent pandas and does not
# modify `data` at all when Copy-on-Write is enabled.
data['style'] = data['style'].replace(equivs)

In [79]:
# Styles removed from the dataset before computing per-style statistics.
# Note that "Experimental Beer" and "Mixed Style Beer" are dropped here as well.
drop_styles = [
    "No Profile Selected",
    "Dry Mead",
    "Semi-Sweet Mead",
    "Common Cider",
    "Other Fruit Melomel",
    "Experimental Beer",
    "Mixed Style Beer",
]
# Flag every row whose style is in the list and drop them all in one call.
excluded = data['style'].isin(drop_styles)
data.drop(index=data.index[excluded], inplace=True)

In [80]:
# Distinct styles still present in `data` at this point.
l = data['style'].value_counts().index.tolist()
print("Cantidad de estilos restante: {}".format(len(l)))

Cantidad de estilos restante: 297


## Obtener distribución de clases

In [81]:
# Build one summary row per style: id, name, recipe count, mean ABV/IBU/color
# and the nine entries of the INVERSE covariance matrix of those three features
# (columns s11..s33, row-major).

# Thresholds used below.
Z_MAX = 2          # a recipe is kept only if |z-score| < Z_MAX on every feature
MIN_RECIPES = 100  # styles with fewer recipes after outlier removal are skipped
MIN_DET = 0.01     # covariance determinants below this are treated as singular

features = ['abv', 'ibu', 'color']
cov_mat_cols = ["s{}{}".format(j+1,k+1) for j in range(3) for k in range(3)]
new_cols = ['id', 'style', 'freq', 'u_ABV', 'u_IBU', 'u_Color'] + cov_mat_cols

# Rows are collected in a list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
rows = []
for s in styles:
    temp = data.loc[data['style'] == s]  # all recipes of the current style

    # Outlier removal can only shrink the subset, so a style that is already
    # too small (including names that were merged or dropped earlier and now
    # match nothing) can be skipped before any statistics are computed.
    if len(temp.index) < MIN_RECIPES:
        continue

    # Remove outliers: keep rows within Z_MAX standard deviations on all features.
    within = (abs(zscore(temp[features].to_numpy())) < Z_MAX).all(axis=1)
    temp = temp.loc[within]

    freq = len(temp.index)  # number of recipes left for this style
    if freq < MIN_RECIPES:  # do not add styles with too few observations
        continue

    # Restrict the statistics to the three features explicitly, so the result
    # is always 3x3 and non-numeric columns never reach mean()/cov().
    avgs = temp[features].mean()
    cov = temp[features].cov()

    if la.det(cov.to_numpy()) < MIN_DET:  # the covariance matrix must be invertible
        print("Estilo", s, "tiene matriz de covarianza singular")
        continue

    inv_cov = la.inv(cov.to_numpy())  # inverse of the covariance matrix

    # id, style, frequency, the three means, then the inverse covariance
    # flattened row by row (the matrix is symmetric, so s_jk == s_kj).
    rows.append(
        (s.replace(' ', '_'), s, freq)
        + tuple(avgs[v] for v in features)
        + tuple(inv_cov.ravel())
    )

clean_data = pd.DataFrame(rows, columns=new_cols)
print('Listo.')

Listo.


In [82]:

# Write the per-style summary as a JSON array with one object per row,
# then show the first rows as the cell output.
clean_data.to_json(
    'beer_styles.json',
    orient='records',
)
clean_data.head()

Unnamed: 0,id,style,freq,u_ABV,u_IBU,u_Color,s11,s12,s13,s21,s22,s23,s31,s32,s33
0,American_IPA,American IPA,26181,6.352121,61.511807,7.277955,1.332477,-0.008529,-0.055345,-0.008529,0.001284,-0.001072,-0.055345,-0.001072,0.141168
1,American_Pale_Ale,American Pale Ale,17501,5.425265,40.329333,6.588622,2.460109,-0.012689,-0.108097,-0.012689,0.003617,-0.002127,-0.108097,-0.002127,0.213945
2,American_Light_Lager,American Light Lager,6956,5.249731,21.956147,6.526673,0.453282,-0.005236,-0.026822,-0.005236,0.001504,-0.000769,-0.026822,-0.000769,0.04082
3,Specialty_IPA:_New_England_IPA,Specialty IPA: New England IPA,6289,6.507685,44.170059,5.392923,1.253097,-0.003979,-0.21027,-0.003979,0.001281,-0.002322,-0.21027,-0.002322,0.630576
4,Saison,Saison,4980,6.008534,28.213544,5.644341,1.320154,-0.012921,-0.102781,-0.012921,0.0143,0.000138,-0.102781,0.000138,0.169583


In [84]:
# Number of styles in the final summary table (one row per style).
print('Cantidad de estilos final: {}'.format(len(clean_data)))

Cantidad de estilos final: 125
