# Spark Beer Analysis #
Analyse de jeux de données sur les bières avec Apache Spark.



[Guide de styles BJCP](https://www.brassageamateur.com/wiki/index.php/Guide_de_styles_BJCP) (Beer Judge Certification Program ou "Programme de certification de juge de bière")

[Dictionnaire Anglais-Français des termes brassicoles](https://www.brassageamateur.com/wiki/index.php/Dictionnaire_Anglais-Fran%C3%A7ais_des_termes_brassicoles)

[Lexique français/anglais](https://www.brassageamateur.com/wiki/index.php/Lexique_francais/anglais)


Dataset:
- [Recettes de bière](https://www.kaggle.com/jtrofe/beer-recipes/kernels) extraites de [Brewer's Friend](https://www.brewersfriend.com/)
- [Critiques de bières](https://data.world/socialmediadata/beeradvocate)


## TODO

- [X] Charger le fichier des critiques
- [ ] Faire correspondre sur un graphique les notes, avec le degré d'alcool et le style
- [X] Créer les features
- [ ] Déterminer le type d'algorithme nécessaire pour effectuer un apprentissage
- [ ] Créer un modèle
- [ ] Passer les recettes dans le modèle, et trouver la recette la plus prometteuse

In [1]:
// Obtain (or create) the SparkSession from the configuration of `sc`.
// NOTE(review): `sc` and `SparkSession` are not defined or imported in this cell;
// presumably both are provided by the notebook kernel — confirm.
val spark = SparkSession
  .builder()
  .config(sc.getConf)
  .getOrCreate()

// Use the current directory ("./") as Spark's checkpoint directory.
val savePath = "./"
spark.sparkContext.setCheckpointDir(savePath)

spark = org.apache.spark.sql.SparkSession@5adf4cb5
savePath = ./


./

# Reviews

In [2]:
// Load the beer reviews CSV: the first line is treated as a header, fields are
// comma-separated, and column types are inferred by Spark.
val reviews = spark.read
  .options(Map(
    "header" -> "true",
    "delimiter" -> ",",
    "inferSchema" -> "true"
  ))
  .csv("./data/beer_reviews.csv")

// Print the resulting column names and types.
reviews.printSchema()

root
 |-- brewery_id: integer (nullable = true)
 |-- brewery_name: string (nullable = true)
 |-- review_time: integer (nullable = true)
 |-- review_overall: double (nullable = true)
 |-- review_aroma: double (nullable = true)
 |-- review_appearance: double (nullable = true)
 |-- review_profilename: string (nullable = true)
 |-- beer_style: string (nullable = true)
 |-- review_palate: double (nullable = true)
 |-- review_taste: double (nullable = true)
 |-- beer_name: string (nullable = true)
 |-- beer_abv: double (nullable = true)
 |-- beer_beerid: integer (nullable = true)



reviews = [brewery_id: int, brewery_name: string ... 11 more fields]


[brewery_id: int, brewery_name: string ... 11 more fields]

## Features

In [7]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StandardScaler, StringIndexer, VectorAssembler}

// Step 1: map the `beer_style` string to a numeric index ("styleIndex"),
// then encode that index as a one-hot vector ("styleVec").
val indexer = new StringIndexer()
  .setInputCol("beer_style")
  .setOutputCol("styleIndex")

val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array(indexer.getOutputCol))
  .setOutputCols(Array("styleVec"))

val oneHotStylesPipeline = new Pipeline().setStages(Array(indexer, encoder))

// Fit the style pipeline on the reviews and apply it to the same DataFrame.
val withOneHotStyle = {
  val fittedStyles = oneHotStylesPipeline.fit(reviews)
  fittedStyles.transform(reviews)
}

// Step 2: assemble the alcohol-by-volume column and the one-hot style vector
// into a single "features" vector column.
// NOTE(review): the printed schema reports `beer_abv` as nullable; confirm how
// null values should be handled before assembling.
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(Array("beer_abv", "styleVec"))

// Scales "features" by standard deviation without centering on the mean.
// Declared here but NOT added to `featurizePipeline` below.
val scaler = new StandardScaler()
  .setWithMean(false)
  .setWithStd(true)
  .setOutputCol("scaledFeatures")
  .setInputCol("features")

// Only the assembler is part of the pipeline for now; the scaler is left out.
val featurizePipeline = new Pipeline().setStages(Array(assembler))

// Fit the featurizing pipeline on the one-hot encoded reviews and apply it.
val featurizedReviews = {
  val fittedFeaturizer = featurizePipeline.fit(withOneHotStyle)
  fittedFeaturizer.transform(withOneHotStyle)
}

indexer = strIdx_db13a7ca55df
encoder = oneHotEncoder_e725625576e6
oneHotStylesPipeline = pipeline_c6747131259d
withOneHotStyle = [brewery_id: int, brewery_name: string ... 13 more fields]
assembler = vecAssembler_25633ee62630
scaler = stdScal_07e71bc9999e
featurizePipeline = pipeline_2a6826e55fe5
featurizedReviews = [brewery_id: int, brewery_...


lastException: Throwable = null


[brewery_id: int, brewery_...

In [5]:
%%dataframe
featurizedReviews

brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,styleIndex,styleVec,features
10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,15.0,"(103,[15],[1.0])","(104,[0,16],[5.0,1.0])"
10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,73.0,"(103,[73],[1.0])","(104,[0,74],[6.2,1.0])"
10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,69.0,"(103,[69],[1.0])","(104,[0,70],[6.5,1.0])"
10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,22.0,"(103,[22],[1.0])","(104,[0,23],[5.0,1.0])"
1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,1.0,"(103,[1],[1.0])","(104,[0,2],[7.7,1.0])"
1075,Caldera Brewing Company,1325524659,3.0,3.5,3.5,oline73,Herbed / Spiced Beer,3.0,3.5,Caldera Ginger Beer,4.7,52159,50.0,"(103,[50],[1.0])","(104,[0,51],[4.7,1.0])"
1075,Caldera Brewing Company,1318991115,3.5,3.5,3.5,Reidrover,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159,50.0,"(103,[50],[1.0])","(104,[0,51],[4.7,1.0])"
1075,Caldera Brewing Company,1306276018,3.0,2.5,3.5,alpinebryant,Herbed / Spiced Beer,2.0,3.5,Caldera Ginger Beer,4.7,52159,50.0,"(103,[50],[1.0])","(104,[0,51],[4.7,1.0])"
1075,Caldera Brewing Company,1290454503,4.0,3.0,3.5,LordAdmNelson,Herbed / Spiced Beer,3.5,4.0,Caldera Ginger Beer,4.7,52159,50.0,"(103,[50],[1.0])","(104,[0,51],[4.7,1.0])"
1075,Caldera Brewing Company,1285632924,4.5,3.5,5.0,augustgarage,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159,50.0,"(103,[50],[1.0])","(104,[0,51],[4.7,1.0])"
