# Spark Beer Analysis #
Analyse de dataset de bières, avec Apache Spark.



[Guide de styles BJCP](https://www.brassageamateur.com/wiki/index.php/Guide_de_styles_BJCP) (Beer Judge Certification Program ou "Programme de certification de juge de bière")

[Dictionnaire Anglais-Français des termes brassicoles](https://www.brassageamateur.com/wiki/index.php/Dictionnaire_Anglais-Fran%C3%A7ais_des_termes_brassicoles)

[Lexique francais/anglais](https://www.brassageamateur.com/wiki/index.php/Lexique_francais/anglais)


Dataset:
- [Recettes de bière](https://www.kaggle.com/jtrofe/beer-recipes/kernels) scraped from [Brewer's Friend](https://www.brewersfriend.com/)
- [Critiques de bières](https://data.world/socialmediadata/beeradvocate)


## TODO

- [X] Charger le fichier des critiques et trouver les styles de bières les plus appréciés
- [ ] Chercher parmi les résultats des corrélations entre note et degré d'alcool et amertume
- [ ] Charger le fichier de recettes et filtrer sur les styles trouvés précédemment
- [ ] puis grouper par ABV / IBU


In [1]:
// Reuse (or create) the SparkSession configured from the notebook's existing SparkContext `sc`.
val spark = SparkSession.builder().config(sc.getConf).getOrCreate()

spark = org.apache.spark.sql.SparkSession@426a0dd


org.apache.spark.sql.SparkSession@426a0dd

## Reviews

In [4]:
// Load the beer reviews CSV into a DataFrame.
// The first line is used as the header and column types are inferred from the data.
val reviews = {
  val csvOptions = Map(
    "header" -> "true",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  spark.read.format("csv").options(csvOptions).load("./data/beer_reviews.csv")
}

// Print the inferred schema to check column names and types.
reviews.printSchema()

root
 |-- brewery_id: integer (nullable = true)
 |-- brewery_name: string (nullable = true)
 |-- review_time: integer (nullable = true)
 |-- review_overall: double (nullable = true)
 |-- review_aroma: double (nullable = true)
 |-- review_appearance: double (nullable = true)
 |-- review_profilename: string (nullable = true)
 |-- beer_style: string (nullable = true)
 |-- review_palate: double (nullable = true)
 |-- review_taste: double (nullable = true)
 |-- beer_name: string (nullable = true)
 |-- beer_abv: double (nullable = true)
 |-- beer_beerid: integer (nullable = true)



reviews = [brewery_id: int, brewery_name: string ... 11 more fields]


lastException: Throwable = null


[brewery_id: int, brewery_name: string ... 11 more fields]

In [26]:
// A review counts as "good" when taste, palate and aroma are each rated above 4
// and the overall rating is above 3.5.
val reviewFilter =
  ($"review_taste" > 4) &&
    ($"review_palate" > 4) &&
    ($"review_aroma" > 4) &&
    ($"review_overall" > 3.5)

// Number of good reviews per beer style, most frequently praised styles first.
val goodBeerCountByStyle = reviews
  .where(reviewFilter)
  .groupBy("beer_style")
  .count()
  .orderBy($"count".desc)

// Total number of reviews per beer style; the count column is renamed to "total"
// so that it does not clash with the good-review "count" after the join.
val countByStyle = reviews
  .groupBy("beer_style")
  .count()
  .withColumnRenamed("count", "total")

// Join both counts on the style name and compute the share of good reviews (in percent).
// Equivalent explicit form: join(countByStyle, Seq("beer_style"), "inner");
// possible join types: inner, outer, left_outer, right_outer, leftsemi.
val bestStyles = goodBeerCountByStyle
  .join(countByStyle, "beer_style")
  .withColumn("percentage", goodBeerCountByStyle.col("count") / countByStyle.col("total") * 100)
  .orderBy($"percentage".desc)

// Display every row; truncate = false prints full cell values instead of shortening long strings.
bestStyles.show(bestStyles.count().toInt, truncate = false)


+-----------------------------------+-----+------+-------------------+
|beer_style                         |count|total |percentage         |
+-----------------------------------+-----+------+-------------------+
|Quadrupel (Quad)                   |4279 |18083 |23.663108997400872 |
|American Double / Imperial Stout   |11649|50238 |23.187626895975157 |
|American Wild Ale                  |3562 |17751 |20.06647512816179  |
|Gueuze                             |1174 |6009  |19.537360625728077 |
|Eisbock                            |513  |2663  |19.263987983477282 |
|Russian Imperial Stout             |10314|53819 |19.16423567884948  |
|Lambic - Unblended                 |186  |1106  |16.817359855334537 |
|American Double / Imperial IPA     |14114|85071 |16.590847644908372 |
|Flanders Red Ale                   |942  |6386  |14.751017851550266 |
|English Barleywine                 |1818 |13703 |13.26716777348026  |
|Weizenbock                         |1222 |9396  |13.005534269902085 |
|Lambi

myFilter = ((((review_taste > 4) AND (review_palate > 4)) AND (review_aroma > 4)) AND (review_overall > 3.5))
goodBeerCountByStyle = [beer_style: string, count: bigint]
countByStyle = [beer_style: string, total: bigint]
bestStyles = [beer_style: string, count: bigint ... 2 more fields]


[beer_style: string, count: bigint ... 2 more fields]

In [19]:
%%dataframe
bestStyles

beer_style,count,total,percentage
Quadrupel (Quad),4279,18083,23.663108997400872
American Double / Imperial Stout,11649,50238,23.18762689597516
American Wild Ale,3562,17751,20.06647512816179
Gueuze,1174,6009,19.53736062572808
Eisbock,513,2663,19.26398798347728
Russian Imperial Stout,10314,53819,19.16423567884948
Lambic - Unblended,186,1106,16.817359855334537
American Double / Imperial IPA,14114,85071,16.590847644908372
Flanders Red Ale,942,6386,14.751017851550266
English Barleywine,1818,13703,13.26716777348026


## Recipes

### Styles

In [22]:
// Load the style reference table (style name -> StyleID) from CSV.
// The first line is used as the header and column types are inferred from the data.
val styles = {
  val csvOptions = Map(
    "header" -> "true",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  spark.read.format("csv").options(csvOptions).load("./data/styleData.csv")
}

// Print the inferred schema to check column names and types.
styles.printSchema()

root
 |-- Style: string (nullable = true)
 |-- StyleID: integer (nullable = true)



styles = [Style: string, StyleID: int]


lastException: Throwable = null


[Style: string, StyleID: int]

In [23]:
%%dataframe
styles

Style,StyleID
Altbier,1
Alternative Grain Beer,2
Alternative Sugar Beer,3
American Amber Ale,4
American Barleywine,5
American Brown Ale,6
American IPA,7
American Lager,8
American Light Lager,9
American Pale Ale,10


In [24]:
// Collect the style names from the ranked review results to the driver as a Seq[String].
// (An equivalent Row-based form is .select("beer_style").map(_.getString(0)).collect.toSeq.)
val bestStylesList = bestStyles
  .select("beer_style")
  .as[String]
  .collect()
  .toSeq

// Show the recipe styles whose name exactly matches one of the reviewed style names.
styles.where($"Style".isin(bestStylesList: _*)).show()

+-------------------+-------+
|              Style|StyleID|
+-------------------+-------+
|            Altbier|      1|
|American Barleywine|      5|
| American Brown Ale|      6|
|       American IPA|      7|
|    American Porter|     11|
|     American Stout|     12|
|American Strong Ale|     13|
|      Baltic Porter|     19|
|   Belgian Pale Ale|     24|
|            Braggot|     32|
|          Cream Ale|     45|
|         Doppelbock|     54|
|       Dunkelweizen|     59|
|            Eisbock|     62|
| English Barleywine|     63|
|     English Porter|     66|
|   Flanders Red Ale|     71|
|               Gose|     82|
|             Gueuze|     83|
|      Irish Red Ale|     92|
+-------------------+-------+
only showing top 20 rows



bestStylesList = WrappedArray(Quadrupel (Quad), American Double / Imperial Stout, American Wild Ale, Gueuze, Eisbock, Russian Imperial Stout, Lambic - Unblended, American Double / Imperial IPA, Flanders Red Ale, English Barleywine, Weizenbock, Lambic - Fruit, American Strong Ale, Old Ale, American Barleywine, Belgian Strong Dark Ale, Bière de Champagne / Bière Brut, Baltic Porter, Tripel, Dubbel, Flanders Oud Bruin, Doppelbock, Rye Beer, Belgian Strong Pale Ale, Wheatwine, American IPA, Belgian IPA, Saison / Farmhouse Ale, American Stout, Oatmeal Stout, American Black Ale, Scotch Ale / Wee Heavy, Hefeweizen, American Double / Imperial Pilsner, American Porter, Rauchbier, English Porter, Milk / Sweet Stout, Bière de Garde, Foreign / Export Stout, Belgian Pale Ale, Roggenbier...


WrappedArray(Quadrupel (Quad), American Double / Imperial Stout, American Wild Ale, Gueuze, Eisbock, Russian Imperial Stout, Lambic - Unblended, American Double / Imperial IPA, Flanders Red Ale, English Barleywine, Weizenbock, Lambic - Fruit, American Strong Ale, Old Ale, American Barleywine, Belgian Strong Dark Ale, Bière de Champagne / Bière Brut, Baltic Porter, Tripel, Dubbel, Flanders Oud Bruin, Doppelbock, Rye Beer, Belgian Strong Pale Ale, Wheatwine, American IPA, Belgian IPA, Saison / Farmhouse Ale, American Stout, Oatmeal Stout, American Black Ale, Scotch Ale / Wee Heavy, Hefeweizen, American Double / Imperial Pilsner, American Porter, Rauchbier, English Porter, Milk / Sweet Stout, Bière de Garde, Foreign / Export Stout, Belgian Pale Ale, Roggenbier...

### Recipe

In [15]:
// Load the recipe dataset from CSV.
// The first line is used as the header and column types are inferred from the data.
// NOTE(review): this path is absolute, unlike the ./data/ relative paths used by the
// other loads in this notebook — confirm it is intended.
val recipes = {
  val csvOptions = Map(
    "header" -> "true",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  )
  spark.read.format("csv").options(csvOptions).load("/home/jovyan/work/recipeData.csv")
}

// Print the inferred schema to check column names and types.
recipes.printSchema()

root
 |-- BeerID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- StyleID: integer (nullable = true)
 |-- Size(L): double (nullable = true)
 |-- OG: double (nullable = true)
 |-- FG: double (nullable = true)
 |-- ABV: double (nullable = true)
 |-- IBU: double (nullable = true)
 |-- Color: double (nullable = true)
 |-- BoilSize: double (nullable = true)
 |-- BoilTime: integer (nullable = true)
 |-- BoilGravity: string (nullable = true)
 |-- Efficiency: double (nullable = true)
 |-- MashThickness: string (nullable = true)
 |-- SugarScale: string (nullable = true)
 |-- BrewMethod: string (nullable = true)
 |-- PitchRate: string (nullable = true)
 |-- PrimaryTemp: string (nullable = true)
 |-- PrimingMethod: string (nullable = true)
 |-- PrimingAmount: string (nullable = true)
 |-- UserId: integer (nullable = true)



recipes = [BeerID: int, Name: string ... 21 more fields]


[BeerID: int, Name: string ... 21 more fields]

In [None]:
// Collect every StyleID of the style reference table to the driver.
// NOTE(review): the ids come from the full `styles` table, not from the subset of styles
// matched against bestStylesList — confirm whether only the best-rated styles should be kept.
val stylesIds = styles.select("StyleID").as[Int].collect.toSeq

// Keep only the recipes whose StyleID is in the collected list.
// Uses the `recipes` DataFrame loaded above; the previous `recipie` identifier was undefined.
recipes.filter($"StyleID" isin (stylesIds: _*))