In [1]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type":"native",
        "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv"
    }
}

In [2]:
sc.install_pypi_package("boto3==1.19.2")
sc.install_pypi_package("pandas==1.0.5")
sc.install_pypi_package("scipy==1.4.1")
sc.install_pypi_package("matplotlib==3.2.1")
sc.install_pypi_package("seaborn==0.10.1")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,application_1685139367557_0003,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting boto3==1.19.2
  Using cached https://files.pythonhosted.org/packages/59/60/163503f24cf09553d0bb6c37db9ff3254f6cda812cab00430602867d03f5/boto3-1.19.2-py3-none-any.whl
Collecting botocore<1.23.0,>=1.22.2 (from boto3==1.19.2)
  Using cached https://files.pythonhosted.org/packages/6a/73/552b27e3a1b4f83630907c4958be78e9d4c906e73efd554ebd5e21cb1692/botocore-1.22.12-py3-none-any.whl
Collecting s3transfer<0.6.0,>=0.5.0 (from boto3==1.19.2)
  Using cached https://files.pythonhosted.org/packages/7b/9c/f51775ebe7df5a7aa4e7c79ed671bde94e154bd968aca8d65bb24aba0c8c/s3transfer-0.5.2-py3-none-any.whl
Collecting python-dateutil<3.0.0,>=2.1 (from botocore<1.23.0,>=1.22.2->boto3==1.19.2)
  Using cached https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl
Collecting urllib3<1.27,>=1.25.4 (from botocore<1.23.0,>=1.22.2->boto3==1.19.2)
  Using cached https://files.pythonhosted.org/packages/c5/05/c214b

In [3]:
train = spark.read.json('s3://amazon-reviews-ml/json/train/dataset_en_train.json')
test = spark.read.json('s3://amazon-reviews-ml/json/test/dataset_en_test.json')
dev = spark.read.json('s3://amazon-reviews-ml/json/dev/dataset_en_dev.json')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
comb = train.union(dev).persist()
comb = comb.union(test).persist()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
print('Total Columns: %d' % len(comb.dtypes))
print('Total Rows: %d' % comb.count())
comb.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total Columns: 8
Total Rows: 210000
root
 |-- language: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_title: string (nullable = true)
 |-- reviewer_id: string (nullable = true)
 |-- stars: string (nullable = true)

In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
regexTokenizer1 = RegexTokenizer(inputCol="review_body", outputCol="body", pattern="\\W")
regexTokenizer2 = RegexTokenizer(inputCol="review_title", outputCol="title", pattern="\\W")
label_stringIdx3 = StringIndexer(inputCol = "product_category", outputCol = "feature3")
label_stringIdx4 = StringIndexer(inputCol = "stars", outputCol = "label")
countVectors1 = CountVectorizer(inputCol="body", outputCol="feature1", minDF=5)
countVectors2 = CountVectorizer(inputCol="title", outputCol="feature2", minDF=5)
pipeline = Pipeline(stages=[regexTokenizer1, regexTokenizer2, 
                            label_stringIdx3,label_stringIdx4,
                            countVectors1, countVectors2])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
pipelineFit = pipeline.fit(comb)
transformed_data = pipelineFit.transform(comb)
data = transformed_data.select("feature1","feature2","feature3","label").persist()
data.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+--------+-----+
|            feature1|            feature2|feature3|label|
+--------------------+--------------------+--------+-----+
|(18079,[0,1,2,3,4...|(6805,[0,2,4,7,10...|    21.0|  0.0|
|(18079,[0,38,51,5...|(6805,[1,51,588],...|    11.0|  0.0|
|(18079,[0,1,2,3,8...|(6805,[0,13,16,19...|     0.0|  0.0|
|(18079,[1,2,3,4,5...|(6805,[17,50,91,5...|     2.0|  0.0|
|(18079,[3,13,15,2...| (6805,[1656],[1.0])|    14.0|  0.0|
+--------------------+--------------------+--------+-----+
only showing top 5 rows

In [9]:
features = ["feature1","feature2","feature3"]
assembler = VectorAssembler(inputCols = features, outputCol = 'features')
transformed_data = assembler.transform(data.na.drop())
train, test = transformed_data.randomSplit([0.7, 0.3], seed = 1203)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
stars = (train.groupBy('label')
             .count()
             .sort('label', ascending=False)
        )
stars.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+-----+
|label|count|
+-----+-----+
|  4.0|29344|
|  3.0|29328|
|  2.0|29409|
|  1.0|29436|
|  0.0|29420|
+-----+-----+

In [10]:
lr = LogisticRegression(standardization = True)
grid = ParamGridBuilder().addGrid(lr.regParam, np.arange(0,1,.1)) \
.addGrid(lr.elasticNetParam, [0, 0.5, 1]).build()
evaluator = MulticlassClassificationEvaluator()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
    parallelism=320, numFolds = 5)
cvModel = cv.fit(train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
print(cvModel.bestModel.getRegParam())
print(cvModel.bestModel.getElasticNetParam())
evaluationSummary = cvModel.bestModel.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(evaluationSummary)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.5
0.0
0.5642606529585728

In [35]:
evaluationSummary.repartition(1).write.json("s3://finalproject1203/jupyter/jovyan/logit_all/", mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
test_1 = test.filter("label == 0.0").persist()
test_2 = test.filter("label == 1.0").persist()
test_3 = test.filter("label == 2.0").persist()
test_4 = test.filter("label == 3.0").persist()
test_5 = test.filter("label == 4.0").persist()
test_labels = [test_1,test_2,test_3,test_4,test_5]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [37]:
for subsets in test_labels:
    evaluationSummary = cvModel.bestModel.transform(subsets)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    evaluator.evaluate(evaluationSummary)
    evaluationSummary.repartition(1).write.json("s3://finalproject1203/jupyter/jovyan/logit_5/", mode="append")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.8349694387849603
0.6084067120784183
0.6023976023976023
0.6507666098807496
0.882225656877898

In [11]:
from pyspark.ml.classification import RandomForestClassifier

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
rf = RandomForestClassifier(maxDepth=30, labelCol="label", seed=1203,leafCol="leafId")
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [10, 50, 100]).build()
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction"),
                          numFolds=3) 
cvModel = crossval.fit(train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
evaluationSummary = cvModel.bestModel.transform(test)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(evaluationSummary)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.5343987457546678

In [15]:
for subsets in test_labels:
    evaluationSummary = cvModel.bestModel.transform(subsets)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    evaluator.evaluate(evaluationSummary)
    evaluationSummary.repartition(1).write.json("s3://finalproject1203/jupyter/jovyan/trees_5/", mode="append")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.8669218644449449
0.5138396025550035
0.5585575271894677
0.6229828850855745
0.8894348894348895

In [20]:
from pyspark.ml.classification import NaiveBayes 

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
nb = NaiveBayes(modelType="multinomial",labelCol="label", featuresCol="features")
nbparamGrid = (ParamGridBuilder()
               .addGrid(nb.smoothing, np.arange(1, 40, 1))
               .build())
nbevaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                                predictionCol="prediction", 
                                                metricName="accuracy")
nbcv = CrossValidator(estimator = nb,
                      estimatorParamMaps = nbparamGrid,
                      evaluator = nbevaluator,
                      numFolds = 5)
nbmodel = nbcv.fit(train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [38]:
nbpredictions = nbmodel.transform(test)
print('Accuracy:', nbevaluator.evaluate(nbpredictions))
nbpredictions.repartition(1).write.json("s3://finalproject1203/jupyter/jovyan/naive_bayes/", mode="overwrite")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy: 0.564990564990565

In [39]:
for subsets in test_labels:
    evaluationSummary = nbmodel.bestModel.transform(subsets)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    evaluator.evaluate(evaluationSummary)
    evaluationSummary.repartition(1).write.json("s3://finalproject1203/jupyter/jovyan/bayes_5/", mode="append")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

0.830148323801553
0.595730412428747
0.6297001686891223
0.6419462008359232
0.8625353884869456