<a href="https://colab.research.google.com/github/pkrosoff/wine_machine_learning/blob/justans_branch/wine_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.0.1'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:4 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Reading package lists... Done


In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [3]:
# Read in data from Google Drive and put the data into a Panda's dataframe
from pyspark import SparkFiles
import pandas as pd

path = "/content/drive/MyDrive/winemag-data_first150k.csv"
df = pd.read_csv(path)

In [4]:
df.dropna()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz
2,2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley
3,3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi
8,8,US,This re-named vineyard was formerly bottled as...,Silice,95,65.0,Oregon,Chehalem Mountains,Willamette Valley,Pinot Noir,Bergström
9,9,US,The producer sources from two blocks of the vi...,Gap's Crown Vineyard,95,60.0,California,Sonoma Coast,Sonoma,Pinot Noir,Blue Farm
...,...,...,...,...,...,...,...,...,...,...,...
150889,150889,US,A bizarre style of wine. The aromas are Port-l...,Lafond Vineyard,82,35.0,California,Santa Ynez Valley,Central Coast,Pinot Noir,Lafond
150892,150892,US,"A light, earthy wine, with violet, berry and t...",Coastal,82,10.0,California,California,California Other,Merlot,Callaway
150914,150914,US,"Old-gold in color, and thick and syrupy. The a...",Late Harvest Cluster Select,94,25.0,California,Anderson Valley,Mendocino/Lake Counties,White Riesling,Navarro
150915,150915,US,"Decades ago, Beringer’s then-winemaker Myron N...",Nightingale,93,30.0,California,North Coast,North Coast,White Blend,Beringer


In [5]:
# Create a second data fram to hold only the columns we want

df_2 = df[['description', 'points']]

In [6]:
# Pandas to Spark
wine_scores = spark.createDataFrame(df_2)

wine_scores.show()

+--------------------+------+
|         description|points|
+--------------------+------+
|This tremendous 1...|    96|
|Ripe aromas of fi...|    96|
|Mac Watson honors...|    96|
|This spent 20 mon...|    96|
|This is the top w...|    95|
|Deep, dense and p...|    95|
|Slightly gritty b...|    95|
|Lush cedary black...|    95|
|This re-named vin...|    95|
|The producer sour...|    95|
|Elegance, complex...|    95|
|From 18-year-old ...|    95|
|A standout even i...|    95|
|This wine is in p...|    95|
|With its sophisti...|    95|
|First made in 200...|    95|
|This blockbuster,...|    95|
|Nicely oaked blac...|    95|
|Coming from a sev...|    95|
|This fresh and li...|    95|
+--------------------+------+
only showing top 20 rows



In [7]:
# Import regexp_replace to replace any characters that are not alphanumeric
from pyspark.sql.functions import regexp_replace, col

# Utitlize regexp_replace on the description column of the data frame
df_3 = wine_scores.withColumn("description", regexp_replace(col("description"), "[^a-zA-Z\d\s:]", ""));

In [8]:
from pyspark.sql.functions import length

# Create a length column to be used as a future feature 
wine_df = df_3.withColumn('length', length(df_3['description']))

# Show new wine_df with length column
wine_df.show()

+--------------------+------+------+
|         description|points|length|
+--------------------+------+------+
|This tremendous 1...|    96|   345|
|Ripe aromas of fi...|    96|   309|
|Mac Watson honors...|    96|   273|
|This spent 20 mon...|    96|   370|
|This is the top w...|    95|   364|
|Deep dense and pu...|    95|   305|
|Slightly gritty b...|    95|   309|
|Lush cedary black...|    95|   347|
|This renamed vine...|    95|   289|
|The producer sour...|    95|   300|
|Elegance complexi...|    95|   365|
|From 18yearold vi...|    95|   250|
|A standout even i...|    95|   282|
|This wine is in p...|    95|   297|
|With its sophisti...|    95|   412|
|First made in 200...|    95|   271|
|This blockbuster ...|    95|   268|
|Nicely oaked blac...|    95|   329|
|Coming from a sev...|    95|   361|
|This fresh and li...|    95|   253|
+--------------------+------+------+
only showing top 20 rows



In [9]:
from pyspark.sql.functions import isnan, when, count, col, isnull

# Find any null values
wine_df.select([count(when(isnull(c), c)).alias(c) for c in wine_df.columns]).show()

+-----------+------+------+
|description|points|length|
+-----------+------+------+
|          0|     0|     0|
+-----------+------+------+



In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
# Create label column for predictor
wine_points = StringIndexer(inputCol='points',outputCol='label')
tokenizer = Tokenizer(inputCol="description", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [12]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[wine_points, tokenizer, stopremove, hashingTF, idf, clean_up])

In [13]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(wine_df)
cleaned = cleaner.transform(wine_df)

In [14]:
# Show label, points, and resulting features. We can see that points range from 0-21. 0 being the most common score which is 87.
cleaned.select(['label','features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
| 16.0|(262145,[2701,471...|
| 16.0|(262145,[11481,33...|
| 16.0|(262145,[10077,12...|
| 16.0|(262145,[1546,153...|
| 13.0|(262145,[1546,181...|
| 13.0|(262145,[8408,104...|
| 13.0|(262145,[5561,114...|
| 13.0|(262145,[4235,121...|
| 13.0|(262145,[2306,316...|
| 13.0|(262145,[3354,584...|
| 13.0|(262145,[3848,939...|
| 13.0|(262145,[21534,32...|
| 13.0|(262145,[4176,894...|
| 13.0|(262145,[8297,120...|
| 13.0|(262145,[2306,584...|
| 13.0|(262145,[1546,439...|
| 13.0|(262145,[5847,189...|
| 13.0|(262145,[2701,104...|
| 13.0|(262145,[1546,124...|
| 13.0|(262145,[3572,311...|
+-----+--------------------+
only showing top 20 rows



In [15]:
from pyspark.ml.classification import NaiveBayes

# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [16]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------------------+------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|         description|points|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| Big lively and v...|    91|   288|  7.0|[, big, lively, a...|[, big, lively, i...|(262144,[18911,45...|(262144,[18911,45...|(262145,[18911,45...|[-1075.0828573832...|[2.41174227496105...|       2.0|
| Christoph Neumei...|    93|   223| 10.0|[, christoph, neu...|[, christoph, neu...|(262144,[18176,38...|(262144,[18176,38...|(262145,[18176,38...|[-1366.0720883603...|[2.37164265011818...|      1

In [17]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.355120
