# Hops Dataset Preprocessing
## 1. Preparation

In [52]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
pd.set_option('display.max_columns', 500)

In [59]:
def split_intervals(df, col, lower, upper):
    df[[col+' '+lower, col+' '+upper]] = df[col].str.split("-", expand=True)
    df[[col+' '+lower, col+' '+upper]] = df[[col+' '+lower, col+' '+upper]].replace(regex=True, to_replace=r'%', value=r'')
    df = df.drop(col, axis=1)
    return df

In [60]:
bittering_hops = pd.read_csv('../data/processed/bittering-hops.tsv', sep='\t')
bittering_hops = bittering_hops.loc[:,~bittering_hops.columns.str.startswith('Unnamed')]
bittering_hops = pd.concat([pd.Series(row.name, row[0].split('\n')) for _, row in bittering_hops.T.iterrows()]).reset_index()
bittering_hops['index'] = [row['index'].split(';') for _, row in bittering_hops.iterrows()]
bittering_hops = pd.concat([pd.DataFrame(bittering_hops['index'].values.tolist(), index=bittering_hops.index), bittering_hops[0]], axis=1)
bittering_hops.columns = ['Category', 'Value', 'Hop']
bittering_hops = bittering_hops.dropna()
bittering_hops = bittering_hops.pivot(index='Hop', columns='Category', values='Value')

In [61]:
bittering_hops = split_intervals(bittering_hops, 'Alpha Acid Composition', 'Low (%)', 'High (%)')
bittering_hops = split_intervals(bittering_hops, 'Beta Acid Composition', 'Low (%)', 'High (%)')
bittering_hops = split_intervals(bittering_hops, 'Caryophyllene Oil', 'Low (%)', 'High (%)')
bittering_hops = split_intervals(bittering_hops, 'Co-Humulone Composition', 'Low (%)', 'High (%)')
bittering_hops = split_intervals(bittering_hops, 'Farnesene Oil', 'Low (%)', 'High (%)')
bittering_hops = split_intervals(bittering_hops, 'Humulene Oil Composition', 'Low (%)', 'High (%)')
bittering_hops = split_intervals(bittering_hops, 'Myrcene Oil Composition', 'Low (%)', 'High (%)')
bittering_hops['Total Oil Composition'] = bittering_hops['Total Oil Composition'].replace(regex=True, to_replace=r' mL/100g| mls/100 grams', value=r'')
bittering_hops = split_intervals(bittering_hops, 'Total Oil Composition', 'Low (mL/100g)', 'High (mL/100g)')
bittering_hops['Yield Amount'] = bittering_hops['Yield Amount'].replace(regex=True, to_replace=r' kg/hectare \(.*\)| kg/hectares \(.*\)', value=r'')
bittering_hops = split_intervals(bittering_hops, 'Yield Amount', 'Low (kg/hectare)', 'High (kg/hectare)')

In [62]:
bittering_hops['Style Guide'] = bittering_hops['Style Guide'].apply(lambda x: x.split(','))
mlb = MultiLabelBinarizer()
bittering_hops = bittering_hops.join(pd.DataFrame(mlb.fit_transform(bittering_hops.pop('Style Guide')),
                                      columns=mlb.classes_,
                                      index=bittering_hops.index))

In [63]:
bittering_hops.index = bittering_hops.reset_index()['Hop'].str.split('/').str[-1]

In [64]:
bittering_hops

Unnamed: 0_level_0,Also Known As,Characteristics,Cone Density,Cone Size,Country,Ease of Harvest,East of Harvest,Growth Rate,Purpose,Resistant to,Seasonal Maturity,Storability,Substitutes,Susceptible to,Alpha Acid Composition Low (%),Alpha Acid Composition High (%),Beta Acid Composition Low (%),Beta Acid Composition High (%),Caryophyllene Oil Low (%),Caryophyllene Oil High (%),Co-Humulone Composition Low (%),Co-Humulone Composition High (%),Farnesene Oil Low (%),Farnesene Oil High (%),Humulene Oil Composition Low (%),Humulene Oil Composition High (%),Myrcene Oil Composition Low (%),Myrcene Oil Composition High (%),Total Oil Composition Low (mL/100g),Total Oil Composition High (mL/100g),Yield Amount Low (kg/hectare),Yield Amount High (kg/hectare),Unnamed: 33_level_0,Ale,Amber Ale,Amecan Lager,American Ale,American Ales,American India Pale Ale,American Lager,American Pale Ale,Australian Lager,Barley Wine,Belgian India Pale Ale,Bitter,Blonde Ale,Bright Ale,Cream Ale,Dark Ale,Doppelbock,English Pale Ale,European Lagers,Experimental Beers,Extra Special Bitter,Fruit Lambic,German Lager,Golden Ale,Hefeweizen,Honey Ale,Imperial India Pale Ale,Imperial Stout,India Pale Ale,Lager,Nut Brown Ale,Pale Ale,Pilsner,Porter,Saison,Stout,Strong Ale,Ale,Amber Ale,American Ale,American Barley Wine,American Lager,American Pale Ale,Bitter,Czech Pilsner,Extra Special Bitter,Imperial Pale Ale,Imperial Stout,India Pale Ale,India Pilsner,Lager,Pale Ale,Pilsner,Stout,Strong Ale,American Pale Ale
Hop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1
Admiral,,Aggressive but smooth bittering with an orange...,Compact,Small to medium,UK,Easy to moderate,,Very high,Bittering,Resistant to verticillium wilt and downy mildew,Mid,Retains 85% alpha acid after 6 months storage ...,,Susceptible to powdery mildew,13.0,16.2,4.8,6.1,6.0,7.0,37.0,45.0,1.8,2.2,23.0,26.0,39.0,48.0,1.0,1.7,1300.0,1900.0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
Agnus,,Strong spicy and herbal notes,,,Czechia,,,,Bittering & Aroma,,,Fair to poor,,,9.0,14,4.0,6.5,8.0,10.0,4.0,6.5,1,,15.0,20.0,40.0,55.0,2.0,3.0,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
Apollo,,"Sharp, clean bittering, grapefruit notes",Compact,Small to medium,US,Fair,,Moderate,Bittering,Resistant to downy mildew,Mid to late,Retains 80%-90% alpha acid after 6 months stor...,,Susceptible to powdery mildew,15.0,20,5.5,8.0,14.0,20.0,23.0,28.0,1,,20.0,35.0,30.0,50.0,1.5,2.5,2900.0,3350.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Banner,,"Moderate bittering potential, pleasant aroma",,,US,,,Moderate to high,Bittering,,Early,Retains 43% alpha acid after 6 months storage ...,"Aquila, Cluster, Galena",Susceptible to downy mildew,8.4,13,5.3,8.0,7.7,,34.0,,Trace,,11.8,,66.4,,2.17,,2017.0,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Bitter-Gold,,"Use for bittering only, has no notable aroma",,,US,,,,Bittering,,,Retains 55.6% alpha acid after 6 months stora...,,,15.4,18.8,6.1,8.0,8.4,,36.0,41.0,1.2,,7.5,,68.2,,0.81,3.92,,,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bravo,,"Spicy, earthy, and lightly floral aroma",Moderate to compact,Medium,US,Difficult,,Very high,Bittering,Resistant to powdery mildew and verticillium wilt,Late,Retains 70% alpha acid after 6 months storage ...,,Susceptible to downy mildew,14.0,17,3.0,5.0,10.0,12.0,29.0,34.0,0,1.0,18.0,20.0,25.0,50.0,1.6,2.4,2700.0,3100.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
Brewers-Gold,,Notes of spice and blackcurrant,Compact,Small,UK,Easy,,Very high,Bittering,"Resistant to verticillium wilt, moderately res...",Late,Retains 60%-70% alpha acid after 6 months stor...,,,7.1,11.3,3.3,6.1,6.5,,3.3,6.1,,,11.6,,66.7,,1.96,,1760.0,2400.0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Brewers-Gold-US,,,,Medium,US,,,,Bittering,"Resistant to verticillium wilt, moderately res...",Late,Poor,,,8.1,13.1,3.7,6.8,35.0,,41.0,,Trace,,35.0,,40.0,,1.8,,1681.0,2690.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
Bullion,,Elements of spice and dark fruits,Compact,Medium,UK,Difficult,,Very high,Bittering,"Resistant to verticillium wilt, moderately res...",Early,Retains 40%-50% alpha acid after 6 months stor...,,Infected with most viruses,6.7,12.9,3.7,9.1,9.0,11.0,39.0,,,,23.0,30.0,45.0,55.0,1.14,2.7,2000.0,2400.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
Bullion-10A,,Same as,,Small to medium,,,,Very high,Bittering,"Resistant to verticillium wilt, moderately res...",Early,Poor,,,8.0,13.8,2.8,6.9,7.0,,42.0,,0.2,,12.0,,63.0,,1.55,,2240.0,2800.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
