In [1]:
import os
import sys
# Bootstrap PySpark inside this kernel: point the environment at the Spark 2
# installation, expose its Python packages, then run Spark's own shell script.
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
os.environ["SPARK_HOME"] = '/opt/cloudera/parcels/SPARK2/lib/spark2/'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Each insert goes to the front of sys.path, so the py4j zip ends up first.
# The py4j version is part of the file name and must match the archive that
# actually ships under SPARK_HOME.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.6-src.zip'))

# Execute shell.py in this namespace (it is what makes `spark` available).
# The context manager closes the file handle, and compiling with the real
# file name keeps tracebacks pointing at shell.py instead of "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(compile(shell_file.read(), shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0.cloudera2
      /_/

Using Python version 3.4.3 (default, Nov 17 2016 01:08:31)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml.feature import *
from pyspark.ml import Pipeline

In [3]:
# Load the train and test splits of the Amazon review data from Parquet.
TRAIN_PATH = '/data/amazon/train.parquet'
TEST_PATH = '/data/amazon/test.parquet'

train_df = spark.read.parquet(TRAIN_PATH)
test_df = spark.read.parquet(TEST_PATH)

In [4]:
# Echo the DataFrame so its column names and types are shown as cell output.
train_df

DataFrame[asin: string, reviewerID: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerName: string, summary: string, unixReviewTime: bigint]

In [5]:
# Stage 1: turn the raw review text into a "words" token column.
tokenizer = Tokenizer(
    outputCol="words",
    inputCol="reviewText",
)

In [6]:
# Stage 2: remove the default English stop words from the tokenizer's output.
stop_words = StopWordsRemover.loadDefaultStopWords("english")
swr = StopWordsRemover(
    stopWords=stop_words,
    inputCol=tokenizer.getOutputCol(),
    outputCol="words_filtered",
)

In [7]:
# Apply the tokenizer on its own to both splits.
# NOTE(review): no later cell visible here reads train_tok / test_tok (the
# pipeline runs the tokenizer itself), so this cell may be removable - confirm.
train_tok, test_tok = map(tokenizer.transform, (train_df, test_df))

In [8]:
# Stage 3: hash the filtered words into a fixed-size (10000) feature vector.
hasher = HashingTF(
    inputCol=swr.getOutputCol(),
    outputCol="word_vector",
    numFeatures=10000,
    binary=True,
)

In [9]:
from pyspark.ml.regression import LinearRegression

In [10]:
# Stage 4: regress the "overall" rating on the hashed word vector.
lr = LinearRegression(
    labelCol="overall",
    featuresCol=hasher.getOutputCol(),
    maxIter=15,
)

In [11]:
# Assemble the stages in execution order: tokenize -> drop stop words ->
# hash to features -> linear regression.
pipeline_stages = [tokenizer, swr, hasher, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [12]:
# Fit every pipeline stage on the training split.
pipeline_model = pipeline.fit(dataset=train_df)

In [14]:
# Persist the fitted pipeline. Overwrite any model left by a previous run so
# the notebook can be re-executed; a plain save() raises when the "model"
# path already exists.
pipeline_model.write().overwrite().save("model")

In [45]:
# Score the training split with the fitted pipeline.
predictions = pipeline_model.transform(dataset=train_df)

In [46]:
# Preview the first ten scored rows, including the intermediate columns.
predictions.show(n=10)

+----------+--------------+-------+-------+--------------------+-----------+---------------+--------------------+--------------+--------------------+--------------------+--------------------+------------------+
|      asin|    reviewerID|helpful|overall|          reviewText| reviewTime|   reviewerName|             summary|unixReviewTime|               words|      words_filtered|         word_vector|        prediction|
+----------+--------------+-------+-------+--------------------+-----------+---------------+--------------------+--------------+--------------------+--------------------+--------------------+------------------+
|0972683275|A2IDCSC6NVONIZ| [1, 1]|    5.0|This mount is jus...|04 30, 2013|        2Cents!|             Perfect|    1367280000|[this, mount, is,...|[mount, needed., ...|(10000,[778,945,1...| 4.628821908491464|
|0972683275| A7060R14RJF3I| [0, 0]|    4.0|Took a bit of wor...|05 18, 2013|Amazon Customer|               Works|    1368835200|[took, a, bit, of...|[took, 

In [50]:
from pyspark.ml.evaluation import  RegressionEvaluator
# Compare the model output with the true "overall" rating. The pipeline writes
# its output to the "prediction" column (singular), so the evaluator must read
# that name - a "predictions" column does not exist in the scored DataFrame.
# The metric is spelled out explicitly so the reported number is unambiguous.
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="overall", metricName="rmse")

In [52]:
# Evaluate the scored rows with the evaluator's configured metric.
# NOTE(review): `predictions` was computed from train_df, so this is a
# training-set score rather than a held-out estimate.
train_score = evaluator.evaluate(dataset=predictions)
train_score

0.9595187571498409

In [53]:
# Score the test split with the same fitted pipeline.
test_predictions = pipeline_model.transform(dataset=test_df)

In [99]:
from pyspark.sql import functions as sf
from pyspark.sql.functions import *

def create_submission(predictions, path='submission.csv', mode='overwrite'):
    """Write scored rows as a two-column ``id,rating`` CSV submission.

    The ``id`` column is ``asin`` and ``reviewerID`` joined with ``+``; the
    ``rating`` column is the model's ``prediction`` value. The first ten rows
    are printed as a preview before writing.

    Args:
        predictions: DataFrame containing ``asin``, ``reviewerID`` and
            ``prediction`` columns.
        path: Output location handed to the Spark CSV writer.
        mode: Spark save mode. Defaults to ``'overwrite'`` so the cell can be
            re-run without failing on output left by a previous execution;
            pass ``'error'`` to refuse to replace existing output.

    Returns:
        None. The result is written to ``path``.
    """
    # Build both output columns in one projection. Everything goes through the
    # `sf` alias so the function does not depend on star-imported names.
    # NOTE(review): the raw regression output is written as-is; a sample run
    # showed a value above 5, so clipping to the rating scale may be wanted.
    solution = predictions.select(
        sf.concat(sf.col('asin'), sf.lit('+'), sf.col('reviewerID')).alias('id'),
        sf.col('prediction').alias('rating'),
    )
    solution.show(10)

    # A single partition keeps the output in one part file.
    solution.repartition(1).write.csv(path, header=True, mode=mode)

In [100]:
# Build and write the submission from the test-split predictions.
create_submission(predictions=test_predictions)

+--------------------+------------------+
|                  id|            rating|
+--------------------+------------------+
|B000MCGF1O+A3NF14...| 4.815537286702539|
|B008X9Z8NE+A1045N...| 4.504974166336569|
|B00B99JU5M+AIA5WY...| 4.603447831343314|
|B002IPHA3A+AXX57L...|4.7297543125722274|
|B00884WH74+A4WEZJ...| 4.966771318775946|
|B001FY0B90+A140J1...| 4.001172535008404|
|B007WTAJTO+A2XW4C...|5.7656135361364695|
|B006UV6YMQ+A226R1...| 3.413559215767966|
|B0015DYMVO+A2UOHA...| 4.279609146380762|
|B0042J9BQE+A3N48N...|   4.0152214935724|
+--------------------+------------------+
only showing top 10 rows

