In [1]:
# library imports 
import sys
from importlib import reload
import findspark
import customHelpers as helper
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, count, countDistinct
from pyspark.sql.types import StructType, StructField, StringType,IntegerType, FloatType,BooleanType,DateType,ArrayType,LongType,DoubleType
from pyspark.sql.window import Window
import pyspark.sql.functions as F
import numpy as np

reload(helper)

W0513 16:06:24.330469 4586583488 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


<module 'customHelpers' from '/Users/nagibshah/dev/COMP5349_AmazonProductReviewAnalysis/notebooks/customHelpers.py'>

In [2]:
# initialise the session 

spark = SparkSession \
    .builder \
    .appName("Amazon Product Review Analysis") \
    .getOrCreate()

#sc = SparkContext(appName="Amazon Product Review Analysis")

## Load Dataset 

| Column | Description | 
| :--- | :--- |
| marketplace | 2 letter country code of the marketplace where the review was written. |
| customer_id | Random identifier that can be used to aggregate reviews written by a single author. |
| review_id | The unique ID of the review. |
| product_id | The unique Product ID the review pertains to. In the multilingual dataset the reviews for the same product in different countries can be grouped by the same product_id. | 
| product_parent | Random identifier that can be used to aggregate reviews for the same product. |
| product_title | Title of the product. | 
| product_category | Broad product category that can be used to group reviews (also used to group the dataset into  coherent parts). | 
| star_rating | the 1-5 star rating of the review. | 
| helpful_votes | Number of helpful votes. | 
| total_votes | Number of total votes the review received. | 
| vine | Review was written as part of the Vine program. |
| verified_purchase | The review is on a verified purchase. |
| review_headline | The title of the review. |
| review_body | The review text. |
| review_date | The date the review was written | 


DATA FORMAT
Tab ('\t') separated text file, without quote or escape characters.
First line in each file is header; 1 line corresponds to 1 record.


In [3]:
# load the data set 
#review_data = '../data/sample_us.tsv'
# actual data load - PERFORMANCE WARNING ON LOCAL MACHINE
review_data = '../data/amazon_reviews_us_Music_v1_00.tsv'

aws_product_review_schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id",StringType(),True),
    StructField("product_parent",StringType(),False),
    StructField("product_title", StringType(), False),
    StructField("product_category", StringType(), False),
    StructField("star_rating", IntegerType(), False),
    StructField("helpful_votes",IntegerType(),False),
    StructField("total_votes", IntegerType(), False),
    StructField("vine",StringType(),False),
    StructField("verified_purchase", StringType(), False),
    StructField("review_headline", StringType(), False),
    StructField("review_body", StringType(), False),
    StructField("review_date",DateType(),False)])

aws_product_review_schema_limited = StructType([
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id",StringType(),True),
    StructField("product_title", StringType(), False),
    StructField("product_category", StringType(), False),
    StructField("star_rating", IntegerType(), False),
    StructField("helpful_votes",IntegerType(),False),
    StructField("total_votes", IntegerType(), False),
    StructField("review_headline", StringType(), False),
    StructField("review_body", StringType(), False),
    StructField("review_date",DateType(),False)])

%time awsProductReview_raw_data = spark.read.csv(review_data,header=True,sep="\t",schema=aws_product_review_schema)



CPU times: user 2.17 ms, sys: 894 µs, total: 3.06 ms
Wall time: 1.16 s


In [4]:
# when testing in local machine only 
print(awsProductReview_raw_data.count())
# limit to 1 mil
awsProductReview_raw_data = awsProductReview_raw_data.limit(1000000)
print(awsProductReview_raw_data.count())

4751577
1000000


In [5]:
awsProductReview_raw_data.show(5)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   10140119|R3LI5TRP3YIDQL|B00TXH4OLC|     384427924|Whatever's for Us...|           Music|          5|            0|          0|   N|                Y|          Five Stars|Love this CD alon...| 2015-08-31|
|         US|   27664622|R3LGC3EKEG84PX|B00B6QXN6U|     831769051|Same Trailer Diff...|           Music|          5|    

In [6]:
dfProductReview = awsProductReview_raw_data.drop('vine').drop('verified_purchase') \
                    .drop('product_parent').drop('marketplace')
dfProductReview.show(5)


+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|customer_id|     review_id|product_id|       product_title|product_category|star_rating|helpful_votes|total_votes|     review_headline|         review_body|review_date|
+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|   10140119|R3LI5TRP3YIDQL|B00TXH4OLC|Whatever's for Us...|           Music|          5|            0|          0|          Five Stars|Love this CD alon...| 2015-08-31|
|   27664622|R3LGC3EKEG84PX|B00B6QXN6U|Same Trailer Diff...|           Music|          5|            0|          0|A new fave in our...|This is the album...| 2015-08-31|
|   45946560| R9PYL3OYH55QY|B001GCZXW6| Soaring (Jazz Club)|           Music|          5|            0|          1|          Five Stars|  Excellent / 

In [7]:
# remove rows with no review text 
print("number of rows before filter: {0}".format(dfProductReview.count()))
dfFilteredReviews = dfProductReview.na.drop(subset=["review_body"])
print("number of rows after filter: {0}".format(dfFilteredReviews.count()))

number of rows before filter: 1000000
number of rows after filter: 999701


## Stage One: Overall statistics

### Produce overall summary statistics of the data set, in particular,

* the total number of reviews
* the number of unique users
* the number of unique products

In [8]:
dfOverallStats = dfProductReview.agg(countDistinct("customer_id").alias("unique_customers"), \
                    countDistinct("product_id").alias("unique_products"), \
                    count(col="review_id").alias("total_reviews")) \

dfOverallStats.show()

+----------------+---------------+-------------+
|unique_customers|unique_products|total_reviews|
+----------------+---------------+-------------+
|          461198|         297940|      1000000|
+----------------+---------------+-------------+



### For user-review distribution, you are asked to find out:

* the largest number of reviews published by a single user
* the top 10 users ranked by the number of reviews they publish
* the median number of reviews published by a user

In [9]:
dfUserReviewCounts = helper.distributionStats(dfRecords=dfProductReview.select("customer_id", "review_id"), \
                                              partitionBy="customer_id",countBy="review_id", \
                                              returnCountName="total_reviews")
print("Top Reviewer:")
dfUserReviewCounts.show(1)
print("Top 10 Reviewers:")
dfUserReviewCounts.show(10)

Top Reviewer:
+-----------+-------------+
|customer_id|total_reviews|
+-----------+-------------+
|   38214553|         1497|
+-----------+-------------+
only showing top 1 row

Top 10 Reviewers:
+-----------+-------------+
|customer_id|total_reviews|
+-----------+-------------+
|   38214553|         1497|
|   15536614|         1224|
|   18116317|         1053|
|    7080939|          921|
|   14539589|          723|
|   42836721|          652|
|   47924228|          604|
|   15725862|          594|
|   47423754|          558|
|   50736950|          513|
+-----------+-------------+
only showing top 10 rows



In [10]:
# median reviews 
# no median finder in spark... do we need to implement using RDD?

user_review_median=dfUserReviewCounts.approxQuantile("total_reviews", [0.50], 0)[0]
print("median number of {0} reviews published by user".format(user_review_median))

median number of 1.0 reviews published by user


### For product-review distribution, you are asked to find out:
    
* the largest number of reviews written for a single product
* the top 10 products ranked by the number of reviews they have
* the median number of reviews a product has

In [11]:
dfProductReviewCounts = helper.distributionStats(dfRecords=dfProductReview.select("product_id", "review_id"), \
                                              partitionBy="product_id",countBy="review_id", \
                                              returnCountName="total_reviews")
print("Top Product By Review:")
dfProductReviewCounts.show(1)
print("Top 10 Products by Reviews:")
dfProductReviewCounts.show(10)

Top Product By Review:
+----------+-------------+
|product_id|total_reviews|
+----------+-------------+
|B00MIA0KGY|         2699|
+----------+-------------+
only showing top 1 row

Top 10 Products by Reviews:
+----------+-------------+
|product_id|total_reviews|
+----------+-------------+
|B00MIA0KGY|         2699|
|B00NEJ7MMI|         2420|
|B00MRHANNI|         1513|
|B00H3GZMIE|         1277|
|B00MU79IL8|         1172|
|B00UCFVIDQ|         1114|
|B00KLF5J64|         1038|
|B00EDY5KTA|         1008|
|B00NQKWAIQ|          997|
|B00007KWHG|          826|
+----------+-------------+
only showing top 10 rows



In [12]:
# median reviews 
# no median finder in spark... do we need to implement using RDD?

product_review_median=int(dfProductReviewCounts.approxQuantile("total_reviews", [0.5], 0)[0])
print("median number of {0} reviews per product".format(product_review_median))

median number of 1 reviews per product


## Stage Two: Filtering Unwanted Data

filter reviews based on length, reviewer and product feature. In particular, the following reviews should be removed:

* reviews with less than two sentences in the review body.
* reviews published by users with less than median number of reviews published
* reviews from products with less than median number of reviews received

NOTE: Sentence Segmentation Using: NLTK

In [13]:
# reviews with less than 2 sentences in review_body
# convert to RDD and carry out a filter to remove rows with less than 2 sentences 

print("number of rows before filter: {0}".format(dfFilteredReviews.count()))

dfFilteredReviews = dfFilteredReviews.filter(helper.FilterSentences('review_body'))

#reviewRdd = dfProductReview.rdd.map(list).filter(helper.FilterSentences)
#print(reviewRdd.take(1))
# convert back to DF 
#dfFilteredReviews = spark.createDataFrame(reviewRdd, aws_product_review_schema_limited)
#dfFilteredReviews = reviewRdd.toDF(schema=aws_product_review_schema_limited)
dfFilteredReviews.show(1)
dfFilteredReviews.cache()

print("number of rows post filter: {0}".format(dfFilteredReviews.count()))

number of rows before filter: 999701
+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|customer_id|     review_id|product_id|       product_title|product_category|star_rating|helpful_votes|total_votes|     review_headline|         review_body|review_date|
+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|   27664622|R3LGC3EKEG84PX|B00B6QXN6U|Same Trailer Diff...|           Music|          5|            0|          0|A new fave in our...|This is the album...| 2015-08-31|
+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
only showing top 1 row

number of rows post filter: 329468


In [14]:
# user review filter 
print("number of rows before filter: {0}".format(dfFilteredReviews.count()))

window = Window.partitionBy("customer_id")
dfFilteredReviews = dfFilteredReviews \
    .withColumn("review_count", count("review_id") \
    .over(window)) \
    .filter(col("review_count") >= user_review_median) \
    .drop("review_count")

print("number of rows post filter: {0}".format(dfFilteredReviews.count()))

number of rows before filter: 329468
number of rows post filter: 329468


In [15]:
# product review filter
print("number of rows before filter: {0}".format(dfFilteredReviews.count()))

window = Window.partitionBy("product_id")
dfFilteredReviews = dfFilteredReviews \
    .withColumn("review_count", count("review_id") \
    .over(window)) \
    .filter(col("review_count") >= product_review_median) \
    .drop("review_count")

print("number of rows post filter: {0}".format(dfFilteredReviews.count()))

number of rows before filter: 329468
number of rows post filter: 329468


In [16]:
dfFilteredReviews.show(5)

+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|customer_id|     review_id|product_id|       product_title|product_category|star_rating|helpful_votes|total_votes|     review_headline|         review_body|review_date|
+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|   16615744|R36G0ETK84RMNN|5552936752|Scherrer/Fritz: S...|           Music|          4|            1|          1|Swiss Orchestral ...|Little is known a...| 2015-08-08|
|   40083442| R6A3LEKK38HG6|B0000004IT|        Singles 1-12|           Music|          5|            0|          0| Melvins Starter Kit|Some might say, i...| 2014-07-03|
|   20890474|R3RQWT6V0NHDSY|B0000005PU|        Mi Vida Loca|           Music|          4|            2|          2|    Delightful Music|Gaffney was a 

In [17]:
print("Original Number of Rows before cleanup: {0}".format(dfProductReview.count()))
print("Number of rows after all filters applied: {0}".format(dfFilteredReviews.count()))

Original Number of Rows before cleanup: 1000000
Number of rows after all filters applied: 329468


In [18]:
dfFilteredReviews.cache()

DataFrame[customer_id: string, review_id: string, product_id: string, product_title: string, product_category: string, star_rating: int, helpful_votes: int, total_votes: int, review_headline: string, review_body: string, review_date: date]

#### After filtering out the above, find out:

* top 10 users ranked by median number of sentences in the reviews they have published
* top 10 products ranked by median number of sentences in the reviews they have received

In [19]:
# top 10 users ranked by median number of sentences in the reviews they have published
#dfTop10UsersBySents = helper.getTopBySentNumber(dfRecords=dfFilteredReviews, topnCol="customer_id", \
#                                                textCol="review_body",n=10)

dfTop10UsersBySents = helper.getTopBySentMedian(dfRecords=dfFilteredReviews,partitionBy="customer_id", \
                                                textCol="review_body",medianColName="median_sents",n=10)
#dfTop10UsersBySents.select("customer_id","product_id","product_title","review_body","median_sents").show(10)
dfTop10UsersBySents.show()


+-----------+------------+
|customer_id|median_sents|
+-----------+------------+
|   41838529|         984|
|   51970720|         454|
|   51865782|         440|
|   52672392|         311|
|   50595705|         211|
|   17821650|         183|
|   36934717|         171|
|   29705444|         163|
|   13551370|         157|
|   14678937|         155|
+-----------+------------+



In [20]:
# top 10 products ranked by median number of sentences in the reviews they have received
#dfTop10ProductsBySents = helper.getTopBySentNumber(dfRecords=dfFilteredReviews, topnCol="product_id", \
#                                                textCol="review_body",n=10)
dfTop10ProductsBySents = helper.getTopBySentMedian(dfRecords=dfFilteredReviews,partitionBy="product_id", \
                                                textCol="review_body",medianColName="median_sents",n=10)

dfTop10ProductsBySents.show(10)

+----------+------------+
|product_id|median_sents|
+----------+------------+
|B00LTQ5EVY|         984|
|B00000411D|         302|
|B000001G8Z|         295|
|B002QZPVBK|         289|
|B00HSP0P0U|         274|
|B005MJVMO2|         274|
|B005OZBTWE|         273|
|B000005986|         271|
|B000004119|         269|
|B000003425|         266|
+----------+------------+



In [21]:
# save the cleaned and filtered dataframe to file system 

#dfFilteredReviews.coalesce(1).write.format("parquet") \
#    .option("header", "true").saveAsTable('filteredReviews',mode="overwrite")
dfFilteredReviews.coalesce(1).write.csv("../output",mode="overwrite",header=True,sep="\t")

## Stage 3 Similarity analysis with Sentence Embedding

perform similarity analysis on the review sentences. The analysis involves segmenting review body into multiple sentences; encoding each sentence as vector so that the distance between pair of sentences can be computed.

### Positive vs. Negative Reviews

* pick a product from the top 10 products in stage 1
* Create a positive and negative class of reviews using the rating 
    - Positive Class - rate >=4 
    - Negative Class - rate <= 2
    - for each review, extracting the review body part and segment it into multiple sentences.
    - encode the sentences using google universal encoder


In [3]:
# load the base/filtered dataset created earlier 
filtered_data = "../output/part-*.csv"
dfBaseDataset = spark.read.csv(filtered_data,header=True,sep="\t",schema=helper.AWS_PRODUCT_REVIEW_SCHEMA_LIMITED)

In [4]:
# product from top 10 by review number 
base_product_id = "B00MIA0KGY"

dfSelectedProduct = dfBaseDataset.where((col("product_id") == base_product_id))
dfPositiveClass = dfBaseDataset.where((col("product_id") == base_product_id) & (col("star_rating") >= 4))
dfNegativeClass = dfBaseDataset.where((col("product_id") == base_product_id) & (col("star_rating") <= 2))

In [5]:
print("number of reviews from {0}: {1}".format(base_product_id, dfSelectedProduct.count()))
print("number of positives reviews from {0}: {1}".format(base_product_id,dfPositiveClass.count()))
print("number of negatives reviews from {0}: {1}".format(base_product_id,dfNegativeClass.count()))

number of reviews from B00MIA0KGY: 638
number of positives reviews from B00MIA0KGY: 549
number of negatives reviews from B00MIA0KGY: 50


### Extract the sentences - similar to flatMap

In [6]:
# for each review, extract the review body part and segment it into multiple sentences
#extract the positive sentences
dfPosSents = dfPositiveClass.select("review_id","review_body") \
    .withColumn("sentences", helper.GenerateSentences("review_body")) \
    .select("review_id", F.explode_outer("sentences").alias("sentence")) \
    .na.drop(subset=["sentence"])

print("number of sentences from positive reviews: {0}".format(dfPosSents.count()))

number of sentences from positive reviews: 3018


In [7]:
# extract the negative sentences
dfNegSents = dfNegativeClass.select("review_id","review_body") \
    .withColumn("sentences", helper.GenerateSentences("review_body")) \
    .select("review_id", F.explode_outer("sentences").alias("sentence")) \
    .na.drop(subset=["sentence"])

print("number of sentences from negative reviews: {0}".format(dfNegSents.count()))

number of sentences from negative reviews: 314


In [8]:
dfPosSents.show(5)
dfNegSents.show(5)

+--------------+--------------------+
|     review_id|            sentence|
+--------------+--------------------+
|R33AKM6TMGP62U| Barbra at her best!|
|R33AKM6TMGP62U|Absolutely amazin...|
|R33AKM6TMGP62U|Can't pick a favo...|
|R338L3ESXHT0XJ|                Wow!|
|R338L3ESXHT0XJ|The absolutely pe...|
+--------------+--------------------+
only showing top 5 rows

+--------------+--------------------+
|     review_id|            sentence|
+--------------+--------------------+
|R2P2KVK3GRJBHP|     Love her voice!|
|R2P2KVK3GRJBHP|Few surprises, sh...|
|R2P2KVK3GRJBHP|Pairing with a pa...|
|R2P2KVK3GRJBHP|Her son has a bea...|
|R1HY9W9AU5S4WB|   So disappointed!!|
+--------------+--------------------+
only showing top 5 rows



### Encoding the sentences - google universal encoder 

In [9]:
# bug with tensorflow if no limit is set
# bypass by setting df count as limit 

# get the negative embeddings + dense vectors 
rddTemp = dfNegSents.limit(dfNegSents.count()).select("review_id","sentence").rdd.map(list) \
            .mapPartitions(helper.vectorizeSents).cache()
print(rddTemp.take(2))
dfNegSentsVectorised = spark.createDataFrame(rddTemp, helper.VECTOR_SCHEMA)
    

[('R2P2KVK3GRJBHP', DenseVector([-0.0506, 0.002, -0.0256, -0.0092, -0.0636, 0.0052, -0.0069, 0.0087, 0.0357, 0.0315, 0.0764, -0.0637, 0.0233, 0.0507, 0.0457, 0.0611, -0.0274, -0.0113, -0.0015, 0.0041, 0.0306, -0.0607, 0.0022, -0.0587, 0.014, 0.0033, -0.0296, -0.0767, -0.0062, -0.0664, 0.0265, -0.0523, 0.028, -0.0001, 0.0277, -0.0082, -0.0735, 0.02, -0.016, 0.0119, 0.0343, 0.0356, 0.0755, -0.0602, -0.0555, 0.0448, 0.0425, -0.0005, 0.0235, 0.0455, 0.0136, 0.0656, -0.0124, 0.0063, 0.0662, 0.0729, 0.0339, 0.0283, -0.0391, 0.0359, 0.0375, -0.0849, -0.0092, -0.0265, -0.0098, -0.0041, 0.0551, -0.0489, 0.0791, 0.0737, -0.009, -0.0613, 0.0225, -0.01, 0.0285, 0.0366, 0.0845, -0.0057, -0.0537, 0.0481, 0.0798, -0.0531, 0.0883, -0.0268, 0.0542, -0.0147, -0.0166, -0.0352, 0.0137, -0.076, 0.0692, -0.0102, -0.0055, 0.0447, 0.0428, -0.0311, -0.0179, -0.0239, 0.0334, 0.0237, -0.0476, -0.0364, -0.0103, 0.0401, 0.0612, 0.0294, -0.0075, 0.0102, -0.0591, -0.0244, 0.0584, 0.029, -0.0042, -0.0461, -0.0675, 0.

In [10]:
# get the positive embeddings + dense vectors 
rddTemp = dfPosSents.limit(dfPosSents.count()).select("review_id","sentence").rdd.map(list) \
            .mapPartitions(helper.vectorizeSents).cache()
print(rddTemp.take(2))
dfPosSentsVectorised = spark.createDataFrame(rddTemp, helper.VECTOR_SCHEMA)

[('R33AKM6TMGP62U', DenseVector([-0.0017, 0.0142, -0.0067, 0.0075, -0.0665, -0.0347, -0.048, -0.0526, -0.0696, 0.0228, 0.0516, -0.0209, 0.0988, 0.0027, 0.0323, 0.0299, -0.0071, -0.0124, 0.0232, 0.0738, -0.0476, -0.0139, -0.0436, -0.04, -0.0136, -0.0242, 0.0415, -0.0891, -0.0438, -0.0116, -0.0058, -0.0417, -0.0294, -0.047, 0.0148, 0.019, -0.0706, 0.003, 0.0183, -0.0214, 0.0743, 0.0351, 0.0822, -0.0077, -0.0337, -0.008, 0.0417, 0.0264, 0.0105, 0.016, 0.024, 0.0345, -0.0015, 0.0162, 0.0935, -0.0248, 0.0534, -0.0423, -0.0079, 0.013, -0.0576, -0.0624, -0.0273, -0.0317, 0.0428, -0.0209, 0.0415, -0.049, 0.1022, 0.0646, -0.0355, -0.0107, -0.0092, -0.0205, -0.0371, -0.0142, 0.0951, 0.0105, -0.0166, 0.0162, 0.0231, -0.0595, 0.0993, 0.0106, -0.0087, -0.0187, -0.0097, -0.0108, 0.0534, -0.0695, 0.0449, 0.0211, -0.0317, -0.0452, 0.0536, -0.0245, 0.0165, -0.0295, -0.0461, 0.0792, 0.002, -0.0547, -0.0637, 0.0045, -0.0002, 0.0365, -0.0146, 0.0193, -0.007, -0.013, -0.0085, -0.0287, -0.0009, -0.0087, -0.

In [11]:
# print some lines 
print("Total embedded and vectorised positive sentences {0}".format(dfPosSentsVectorised.count()))
print("Pos vector sample:")
dfPosSentsVectorised.show(1)

print("Total embedded and vectorised negative sentences {0}".format(dfNegSentsVectorised.count()))
print("Neg vector sample:")
dfNegSentsVectorised.show(1)

Total embedded and vectorised positive sentences 3018
Pos vector sample:
+--------------+--------------------+
|     review_id|             vectors|
+--------------+--------------------+
|R33AKM6TMGP62U|[-0.0016616235952...|
+--------------+--------------------+
only showing top 1 row

Total embedded and vectorised negative sentences 314
Neg vector sample:
+--------------+--------------------+
|     review_id|             vectors|
+--------------+--------------------+
|R2P2KVK3GRJBHP|[-0.0506365746259...|
+--------------+--------------------+
only showing top 1 row



### Intra-Class Similarity

We want to find out if sentences in the same category are closely related with each other. The closeness is measured by average distance between points in the class. In our case, point refers to the sentence encoding and pair-wise distance is measured by Cosine distance. Cosine distance is computed as “1 − CosineSimilarity”. It has a value between 0 and 2.

In [12]:
reload(helper)

<module 'customHelpers' from '/Users/nagibshah/dev/COMP5349_AmazonProductReviewAnalysis/notebooks/customHelpers.py'>

In [None]:
pos_vect_index = positive_review_embedding.map(Vectors.dense).zipWithIndex()

pos_distances = pos_vect_index.cartesian(pos_vect_index) \
                       .map(helper.CosineDistance) \
                       .filter(lambda x: x[0]!=0)

pos_result = pos_distances.collect()
pos_avg = sum(i[0] for i in pos_result)/len(pos_result)

print("Average cosine distance between positive reviews: {0}".format(pos_avg))

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 51471)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/nagibshah/anaconda/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/Users/nagibshah/anaconda/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/nagibshah/anaconda/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/usr/local/Cellar/apache-spark/2.4.0

E0511 13:34:17.143673 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.143154 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.142732 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.142318 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.141869 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.141463 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.141002 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.140602 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.140213 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.139814 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.139337 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.138952 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.138154 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.135258 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.134078 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.131485 4481263040 java_gateway.py:1003] Exception while sending command.
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".",

E0511 13:34:17.183728 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.183375 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.183091 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.182815 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.182528 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.182187 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.181916 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.181647 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.181383 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.180993 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.180748 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.180261 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.179798 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.179340 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

E0511 13:34:17.178535 4481263040 java_gateway.py:1078] An error occurred while trying to connect to the Java server (127.0.0.1:51437)
Traceback (most recent call last):
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-095fda5f6d23>", line 7, in <module>
    pos_result = pos_distances.collect()
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/rdd.py", line 816, in collect
    sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/usr/local/Cellar/apache-spark/2.4.0/libexec/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/nagibshah/anaconda/lib/python3.6/site-packages/py4j/protocol.py", line 328, 

### Class Center Sentences

Find out the class center and its 10 closest neighbours for positive and negative class respectively. We define class center as the point that has the smallest average distance to other points in the class. Again in this case point refers to the sentence encoding and pair-wise distance are measured by Cosine distance.
The result should show the text of the center sentence, the review id it belongs to and its 10 closest neighbouring sentences text and their respective review id.

In [13]:
reload(helper)

<module 'customHelpers' from '/Users/nagibshah/dev/COMP5349_AmazonProductReviewAnalysis/notebooks/customHelpers.py'>

In [14]:
from pyspark.ml.feature import Normalizer

normalizer = Normalizer(inputCol="vectors", outputCol="normFeatures") # default uses L2 norm 
l2NegNormData = normalizer.transform(dfNegSentsVectorised)
print("Normalized using L2 norm")
l2NegNormData.show(5)


Normalized using L2 norm
+--------------+--------------------+--------------------+
|     review_id|             vectors|        normFeatures|
+--------------+--------------------+--------------------+
|R2P2KVK3GRJBHP|[-0.0506365746259...|[-0.0506365814959...|
|R2P2KVK3GRJBHP|[-0.0580368600785...|[-0.0580368628959...|
|R2P2KVK3GRJBHP|[0.03794876486063...|[0.03794876963988...|
|R2P2KVK3GRJBHP|[0.03115259110927...|[0.03115259288005...|
|R1HY9W9AU5S4WB|[0.01509211771190...|[0.01509211794457...|
+--------------+--------------------+--------------------+
only showing top 5 rows



In [15]:
masterNegMapData = l2NegNormData.select("review_id","normFeatures").rdd.zipWithIndex().cache()
print(masterNegMapData.take(1))

[(Row(review_id='R2P2KVK3GRJBHP', normFeatures=DenseVector([-0.0506, 0.002, -0.0256, -0.0092, -0.0636, 0.0052, -0.0069, 0.0087, 0.0357, 0.0315, 0.0764, -0.0637, 0.0233, 0.0507, 0.0457, 0.0611, -0.0274, -0.0113, -0.0015, 0.0041, 0.0306, -0.0607, 0.0022, -0.0587, 0.014, 0.0033, -0.0296, -0.0767, -0.0062, -0.0664, 0.0265, -0.0523, 0.028, -0.0001, 0.0277, -0.0082, -0.0735, 0.02, -0.016, 0.0119, 0.0343, 0.0356, 0.0755, -0.0602, -0.0555, 0.0448, 0.0425, -0.0005, 0.0235, 0.0455, 0.0136, 0.0656, -0.0124, 0.0063, 0.0662, 0.0729, 0.0339, 0.0283, -0.0391, 0.0359, 0.0375, -0.0849, -0.0092, -0.0265, -0.0098, -0.0041, 0.0551, -0.0489, 0.0791, 0.0737, -0.009, -0.0613, 0.0225, -0.01, 0.0285, 0.0366, 0.0845, -0.0057, -0.0537, 0.0481, 0.0798, -0.0531, 0.0883, -0.0268, 0.0542, -0.0147, -0.0166, -0.0352, 0.0137, -0.076, 0.0692, -0.0102, -0.0055, 0.0447, 0.0428, -0.0311, -0.0179, -0.0239, 0.0334, 0.0237, -0.0476, -0.0364, -0.0103, 0.0401, 0.0612, 0.0294, -0.0075, 0.0102, -0.0591, -0.0244, 0.0584, 0.029, -0

In [18]:
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
#mat = IndexedRowMatrix(masterMapData.map(lambda row: IndexedRow(row.review_id, row.normFeatures.toArray()))) \
#            .toBlockMatrix()

def calc_distance(row):
    rowid = row.index
    distances = 1-row.vector
    # now get the sums 
    totalDistance = distances.toArray().sum()
    length = len(distances.toArray())
    avg = totalDistance / length
    return (rowid, avg)

matNeg = IndexedRowMatrix(masterNegMapData.map(lambda row: IndexedRow(row[1], (row[0][1]).toArray()))).toBlockMatrix()
dotNeg = matNeg.multiply(matNeg.transpose())
negCosines = dotNeg.toIndexedRowMatrix()

In [19]:
# test = dot.toIndexedRowMatrix()
# calculate the avg negative distances 
avgNegDistances = negCosines.rows.map(lambda row: calc_distance(row))

In [20]:
# print a few rows 
print(avgNegDistances.take(5))

#test2 = test.rows.map(lambda row: IndexedRow(row.index, (1-row.vector)))
#test2 = test.rows.map(lambda row: calc_distance(row))

#row.vector.toArray().sum()

[(147, 0.6603121269072003), (19, 0.7631380686764193), (39, 0.6497401823067195), (297, 0.7448456777501764), (71, 0.7863506432706412)]
