# Spark Beer Analysis #
Analyse de dataset de bières, avec Apache Spark.



[Guide de styles BJCP](https://www.brassageamateur.com/wiki/index.php/Guide_de_styles_BJCP) (Beer Judge Certification Program ou "Programme de certification de juge de bière")

[Dictionnaire Anglais-Français des termes brassicoles](https://www.brassageamateur.com/wiki/index.php/Dictionnaire_Anglais-Fran%C3%A7ais_des_termes_brassicoles)

[Lexique français/anglais](https://www.brassageamateur.com/wiki/index.php/Lexique_francais/anglais)


Dataset:
- [Recettes de bière](https://www.kaggle.com/jtrofe/beer-recipes/kernels) extraites de [Brewer's Friend](https://www.brewersfriend.com/)
- [Critiques de bières](https://data.world/socialmediadata/beeradvocate)


## TODO

- [X] Charger le fichier des critiques et trouver les styles de bières les plus appréciés
- [ ] Chercher parmi les résultats des corrélations entre la note, le degré d'alcool et l'amertume
- [ ] Charger le fichier de recettes et filtrer sur les styles trouvés précédemment
- [ ] puis grouper par ABV / IBU


In [1]:
// Obtain the SparkSession for this notebook, configured from the SparkConf of
// `sc` (presumably the SparkContext pre-bound by the notebook kernel — confirm).
val spark = SparkSession.builder().config(sc.getConf).getOrCreate()

spark = org.apache.spark.sql.SparkSession@426a0dd


org.apache.spark.sql.SparkSession@426a0dd

## Reviews

In [2]:
// Load the beer reviews CSV into a DataFrame: the first line is read as the
// header, fields are comma-separated and column types are inferred from the data.
val reviews = spark.read
    .format("csv")
    .options(Map(
        "header" -> "true",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .load("/home/jovyan/work/beer_reviews.csv")

// Print the inferred column names and types.
reviews.printSchema()

root
 |-- brewery_id: integer (nullable = true)
 |-- brewery_name: string (nullable = true)
 |-- review_time: integer (nullable = true)
 |-- review_overall: double (nullable = true)
 |-- review_aroma: double (nullable = true)
 |-- review_appearance: double (nullable = true)
 |-- review_profilename: string (nullable = true)
 |-- beer_style: string (nullable = true)
 |-- review_palate: double (nullable = true)
 |-- review_taste: double (nullable = true)
 |-- beer_name: string (nullable = true)
 |-- beer_abv: double (nullable = true)
 |-- beer_beerid: integer (nullable = true)



reviews = [brewery_id: int, brewery_name: string ... 11 more fields]


[brewery_id: int, brewery_name: string ... 11 more fields]

In [4]:
// Keep only the reviews scoring strictly above 4 for taste, palate and aroma and
// strictly above 3.5 overall, then count the remaining reviews per beer style,
// largest count first.
val bestReviewBeerStyles = reviews
    .filter(
        $"review_taste" > 4 &&
        $"review_palate" > 4 &&
        $"review_aroma" > 4 &&
        $"review_overall" > 3.5
    )
    .groupBy("beer_style")
    .count()
    .orderBy($"count".desc)

// Print every row (the row count is used as the display limit);
// truncate = false prints long cell values in full instead of shortening them.
bestReviewBeerStyles.show(bestReviewBeerStyles.count().toInt, truncate = false)
//reviews.show()

+-----------------------------------+-----+
|beer_style                         |count|
+-----------------------------------+-----+
|American Double / Imperial IPA     |14327|
|American Double / Imperial Stout   |11698|
|Russian Imperial Stout             |10329|
|American IPA                       |10017|
|Quadrupel (Quad)                   |4279 |
|Belgian Strong Dark Ale            |4170 |
|American Strong Ale                |4064 |
|American Porter                    |3593 |
|American Wild Ale                  |3562 |
|American Barleywine                |3264 |
|Tripel                             |3039 |
|Belgian Strong Pale Ale            |2829 |
|Saison / Farmhouse Ale             |2499 |
|American Pale Ale (APA)            |2360 |
|Doppelbock                         |2064 |
|Dubbel                             |1982 |
|Hefeweizen                         |1922 |
|American Stout                     |1909 |
|English Barleywine                 |1819 |
|Old Ale                        

bestReviewBeerStyles = [beer_style: string, count: bigint]


[beer_style: string, count: bigint]

In [5]:
%%dataframe
bestReviewBeerStyles

beer_style,count
American Double / Imperial IPA,14327
American Double / Imperial Stout,11698
Russian Imperial Stout,10329
American IPA,10017
Quadrupel (Quad),4279
Belgian Strong Dark Ale,4170
American Strong Ale,4064
American Porter,3593
American Wild Ale,3562
American Barleywine,3264


## Recipes

### Styles

In [6]:
// Load the style lookup CSV into a DataFrame: the first line is read as the
// header, fields are comma-separated and column types are inferred from the data.
val styles = spark.read
    .format("csv")
    .options(Map(
        "header" -> "true",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .load("/home/jovyan/work/styleData.csv")

// Print the inferred column names and types.
styles.printSchema()

root
 |-- Style: string (nullable = true)
 |-- StyleID: integer (nullable = true)



styles = [Style: string, StyleID: int]


[Style: string, StyleID: int]

In [7]:
%%dataframe
styles

Style,StyleID
Altbier,1
Alternative Grain Beer,2
Alternative Sugar Beer,3
American Amber Ale,4
American Barleywine,5
American Brown Ale,6
American IPA,7
American Lager,8
American Light Lager,9
American Pale Ale,10


In [12]:
// Bring the style names of the review ranking back to the driver as a Seq[String].
// Equivalent alternative without the typed encoder:
//   bestReviewBeerStyles.select("beer_style").map(_.getString(0)).collect.toSeq
val bestStyles = bestReviewBeerStyles
    .select($"beer_style")
    .as[String]
    .collect()
    .toSeq

// Show the rows of `styles` whose Style name also appears in the review ranking.
styles.where($"Style".isin(bestStyles: _*)).show()

+-------------------+-------+
|              Style|StyleID|
+-------------------+-------+
|            Altbier|      1|
|American Barleywine|      5|
| American Brown Ale|      6|
|       American IPA|      7|
|    American Porter|     11|
|     American Stout|     12|
|American Strong Ale|     13|
|      Baltic Porter|     19|
|   Belgian Pale Ale|     24|
|            Braggot|     32|
|          Cream Ale|     45|
|         Doppelbock|     54|
|       Dunkelweizen|     59|
|            Eisbock|     62|
| English Barleywine|     63|
|     English Porter|     66|
|   Flanders Red Ale|     71|
|               Gose|     82|
|             Gueuze|     83|
|      Irish Red Ale|     92|
+-------------------+-------+
only showing top 20 rows



bestStyles = WrappedArray(American Double / Imperial IPA, American Double / Imperial Stout, Russian Imperial Stout, American IPA, Quadrupel (Quad), Belgian Strong Dark Ale, American Strong Ale, American Porter, American Wild Ale, American Barleywine, Tripel, Belgian Strong Pale Ale, Saison / Farmhouse Ale, American Pale Ale (APA), Doppelbock, Dubbel, Hefeweizen, American Stout, English Barleywine, Old Ale, American Amber / Red Ale, Lambic - Fruit, Oatmeal Stout, Fruit / Vegetable Beer, American Brown Ale, Scotch Ale / Wee Heavy, Weizenbock, Baltic Porter, Gueuze, Belgian IPA, Belgian Pale Ale, Flanders Red Ale, Witbier, Rye Beer, Milk / Sweet Stout, American Black Ale, English Porter, Winter Warmer, Pumpkin Ale, American Pale Wheat Ale, German Pilsener, Eisbock, Extra Speci...


WrappedArray(American Double / Imperial IPA, American Double / Imperial Stout, Russian Imperial Stout, American IPA, Quadrupel (Quad), Belgian Strong Dark Ale, American Strong Ale, American Porter, American Wild Ale, American Barleywine, Tripel, Belgian Strong Pale Ale, Saison / Farmhouse Ale, American Pale Ale (APA), Doppelbock, Dubbel, Hefeweizen, American Stout, English Barleywine, Old Ale, American Amber / Red Ale, Lambic - Fruit, Oatmeal Stout, Fruit / Vegetable Beer, American Brown Ale, Scotch Ale / Wee Heavy, Weizenbock, Baltic Porter, Gueuze, Belgian IPA, Belgian Pale Ale, Flanders Red Ale, Witbier, Rye Beer, Milk / Sweet Stout, American Black Ale, English Porter, Winter Warmer, Pumpkin Ale, American Pale Wheat Ale, German Pilsener, Eisbock, Extra Speci...

### Recipe

In [15]:
// Load the recipes CSV into a DataFrame: the first line is read as the header,
// fields are comma-separated and column types are inferred from the data.
val recipes = spark.read
    .format("csv")
    .options(Map(
        "header" -> "true",
        "delimiter" -> ",",
        "inferSchema" -> "true"
    ))
    .load("/home/jovyan/work/recipeData.csv")

// Print the inferred column names and types.
recipes.printSchema()

root
 |-- BeerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- StyleID: integer (nullable = true)
 |-- Size(L): double (nullable = true)
 |-- OG: double (nullable = true)
 |-- FG: double (nullable = true)
 |-- ABV: double (nullable = true)
 |-- IBU: double (nullable = true)
 |-- Color: double (nullable = true)
 |-- BoilSize: double (nullable = true)
 |-- BoilTime: integer (nullable = true)
 |-- BoilGravity: string (nullable = true)
 |-- Efficiency: double (nullable = true)
 |-- MashThickness: string (nullable = true)
 |-- SugarScale: string (nullable = true)
 |-- BrewMethod: string (nullable = true)
 |-- PitchRate: string (nullable = true)
 |-- PrimaryTemp: string (nullable = true)
 |-- PrimingMethod: string (nullable = true)
 |-- PrimingAmount: string (nullable = true)
 |-- UserId: integer (nullable = true)



recipes = [BeerID: int, Name: string ... 21 more fields]


[BeerID: int, Name: string ... 21 more fields]

In [None]:
// Collect every StyleID of the `styles` DataFrame to the driver as a Seq[Int].
// NOTE(review): `styles` is the unfiltered style table here, so the filter below
// keeps every recipe whose StyleID exists in it; the notebook TODO suggests the
// intent is to restrict to the best-reviewed styles — confirm before relying on it.
val stylesIds = styles.select("StyleID").as[Int].collect.toSeq

// Keep only the rows of the `recipes` DataFrame whose StyleID is in that list.
recipes.filter($"StyleID".isin(stylesIds: _*))