# Predicting Product Ratings using Customer Reviews

In [0]:
# start the timer
import time
start_time = time.time()

## Environment Setup

**`Classroom-Setup`**

In [0]:
%run ./Includes/Classroom-Setup

### Loading the DataFrame

In [0]:
textDF = spark.read.csv("/mnt/training/reviews/reviews.csv", inferSchema = True, header=True, escape='"')
display(textDF.limit(10))

Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo""."
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all","This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis' ""The Lion, The Witch, and The Wardrobe"" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch."
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient in Robitussin I believe I have found it. I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda. The flavor is very medicinal.
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,"Great taffy at a great price. There was a wide assortment of yummy taffy. Delivery was very quick. If your a taffy lover, this is a deal."
6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,"I got a wild hair for taffy and ordered this five pound bag. The taffy was all very enjoyable with many flavors: watermelon, root beer, melon, peppermint, grape, etc. My only complaint is there was a bit too much red/black licorice-flavored pieces (just not my particular favorites). Between me, my kids, and my husband, this lasted only two weeks! I would recommend this brand of taffy -- it was a delightful treat."
7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,"This saltwater taffy had great flavors and was very soft and chewy. Each candy was individually wrapped well. None of the candies were stuck together, which did happen in the expensive version, Fralinger's. Would highly recommend this candy! I served it at a beach-themed party and everyone loved it!"
8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and chewy. The flavors are amazing. I would definitely recommend you buying it. Very satisfying!!
9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my cats can eat the grass. They love it. I rotate it around with Wheatgrass and Rye too
10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.


In [0]:
# check numbers of rows and columns

print((textDF.count(), len(textDF.columns)))

(568454, 10)


In [0]:
# sample 10% data for debugging
textDF = textDF.sample(False, 0.1)

### Selecting Columns

In [0]:
textDF = textDF.select("Id", "ProductId", "Score", "Summary", "Text")

textDF.cache().count()

Out[43]: 57055

In [0]:
textDF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [0]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

textDF = textDF.withColumn('Score', col('Score').cast(IntegerType()))

In [0]:
textDF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



## LDA

In [0]:
import sparknlp

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from nltk.corpus import stopwords

from pyspark.ml import Pipeline


In [0]:
## text cleaning annotators & Sentiment Analysis stages
documentAssembler = DocumentAssembler()\
    .setInputCol("Text")\
    .setOutputCol("document")
    
senEmbedder = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

sentimentModel = SentimentDLModel.pretrained(name='sentimentdl_use_twitter', lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setOutputCols(["sentiment_result2"]) \
    .setOutputAsArray(False) \
    .setCleanAnnotations(True)

from pyspark.ml.feature import StringIndexer

## string indexer on sentiment results 
sentIndex = StringIndexer(inputCol='sentiment_result2',outputCol='sent_label')

from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer


## LDA pre precessing steps
tokenizer = Tokenizer(inputCol="Text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')


from pyspark.ml.clustering import LDA

# Trains a LDA model.
lda = LDA(k=10, maxIter=10, featuresCol='c_vec')

## assemble features vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
assembler = VectorAssembler(inputCols=['sent_label','topicDistribution'],outputCol='features')

## create pipeline for sentiment analysis, lda, and vector assembler
from pyspark.ml import Pipeline


lda_pipe = Pipeline(stages=[documentAssembler, senEmbedder, sentimentModel, finisher, sentIndex, tokenizer, stopremove, count_vec, lda, assembler])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[ | ][OK!]


In [0]:
lda_model = lda_pipe.fit(textDF)
processed = lda_model.transform(textDF).persist()

In [0]:
display(processed.limit(10))

Id,ProductId,Score,Summary,Text,sentiment_result2,sent_label,token_text,stop_tokens,c_vec,topicDistribution,features
8,B006K2ZZ7K,5,"Wonderful, tasty taffy",This taffy is so good. It is very soft and chewy. The flavors are amazing. I would definitely recommend you buying it. Very satisfying!!,positive,0.0,"List(this, taffy, is, so, good., , it, is, very, soft, and, chewy., , the, flavors, are, amazing., , i, would, definitely, recommend, you, buying, it., , very, satisfying!!)","List(taffy, good., , soft, chewy., , flavors, amazing., , definitely, recommend, buying, it., , satisfying!!)","Map(vectorType -> sparse, length -> 140052, indices -> List(0, 18, 55, 98, 100, 135, 138, 366, 1189, 3358, 5704, 122795), values -> List(4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.005566218170119425, 0.005611343533239611, 0.005551693192480247, 0.0056048831503632025, 0.005552638951381288, 0.005562603179898767, 0.9498516681333362, 0.005568093404497909, 0.005562015428270905, 0.005568842856412426))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.005566218170119425, 0.005611343533239611, 0.005551693192480247, 0.0056048831503632025, 0.005552638951381288, 0.005562603179898767, 0.9498516681333362, 0.005568093404497909, 0.005562015428270905, 0.005568842856412426))"
13,B0009XLVG0,1,My Cats Are Not Fans of the New Food,"My cats have been happily eating Felidae Platinum for more than two years. I just got a new bag and the shape of the food is different. They tried the new food when I first put it in their bowls and now the bowls sit full and the kitties will not touch the food. I've noticed similar reviews related to formula changes in the past. Unfortunately, I now need to find a new food that my cats will eat.",negative,1.0,"List(my, cats, have, been, happily, eating, felidae, platinum, for, more, than, two, years., i, just, got, a, new, bag, and, the, shape, of, the, food, is, different., they, tried, the, new, food, when, i, first, put, it, in, their, bowls, and, now, the, bowls, sit, full, and, the, kitties, will, not, touch, the, food., i've, noticed, similar, reviews, related, to, formula, changes, in, the, past., unfortunately,, i, now, need, to, find, a, new, food, that, my, cats, will, eat.)","List(cats, happily, eating, felidae, platinum, two, years., got, new, bag, shape, food, different., tried, new, food, first, put, bowls, bowls, sit, full, kitties, touch, food., noticed, similar, reviews, related, formula, changes, past., unfortunately,, need, find, new, food, cats, eat.)","Map(vectorType -> sparse, length -> 140052, indices -> List(14, 23, 24, 33, 42, 50, 54, 83, 89, 107, 136, 168, 188, 202, 277, 429, 432, 549, 568, 787, 886, 1043, 1066, 1611, 1913, 2623, 2636, 2697, 2759, 2922, 4280, 10000, 15228), values -> List(3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.002220026979203955, 0.0022380010864382818, 0.002214219590192951, 0.0022354236094553376, 0.0022146039780849835, 0.0022185836542123736, 0.979998963139558, 0.0022207640924727223, 0.002218339916477742, 0.0022210739539034424))","Map(vectorType -> dense, length -> 11, values -> List(1.0, 0.002220026979203955, 0.0022380010864382818, 0.002214219590192951, 0.0022354236094553376, 0.0022146039780849835, 0.0022185836542123736, 0.979998963139558, 0.0022207640924727223, 0.002218339916477742, 0.0022210739539034424))"
27,B001GVISJM,1,Nasty No flavor,"The candy is just red , No flavor . Just plan and chewy . I would never buy them again",negative,1.0,"List(the, candy, is, just, red, ,, no, flavor, ., just, , plan, and, chewy, ., , i, would, never, buy, them, again)","List(candy, red, ,, flavor, ., , plan, chewy, ., , never, buy)","Map(vectorType -> sparse, length -> 140052, indices -> List(0, 13, 22, 57, 292, 334, 517, 768, 842, 870), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.006858290025990384, 0.0069138692165904665, 0.0068403567091160945, 0.006905932830447659, 0.006841559290535272, 0.006853831148008518, 0.9382109693193177, 0.0068605579394475065, 0.006853113488650832, 0.006861520031895483))","Map(vectorType -> dense, length -> 11, values -> List(1.0, 0.006858290025990384, 0.0069138692165904665, 0.0068403567091160945, 0.006905932830447659, 0.006841559290535272, 0.006853831148008518, 0.9382109693193177, 0.0068605579394475065, 0.006853113488650832, 0.006861520031895483))"
32,B003F6UO7K,5,THIS IS MY TASTE...,"This offer is a great price and a great taste, thanks Amazon for selling this product. Staral",positive,0.0,"List(this, offer, is, a, great, price, and, a, great, taste,, thanks, amazon, for, selling, this, product.staral)","List(offer, great, price, great, taste,, thanks, amazon, selling, product.staral)","Map(vectorType -> sparse, length -> 140052, indices -> List(1, 5, 35, 37, 302, 583, 1012, 1122, 1634, 138647), values -> List(1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0074338296269999046, 0.00749414045135798, 0.007414430459873014, 0.007485638382866978, 0.007415810626918955, 0.007429081812077447, 0.9330250699241449, 0.0074363990962649715, 0.007428129143152479, 0.007437470476343399))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.0074338296269999046, 0.00749414045135798, 0.007414430459873014, 0.007485638382866978, 0.007415810626918955, 0.007429081812077447, 0.9330250699241449, 0.0074363990962649715, 0.007428129143152479, 0.007437470476343399))"
37,B001EO5QW8,5,Love Gluten Free Oatmeal!!!,"For those of us with celiac disease this product is a lifesaver and what could be better than getting it at almost half the price of the grocery or health food store! I love McCann's instant oatmeal - all flavors!!! Thanks, Abby",neutral,2.0,"List(for, those, of, us, with, celiac, disease, this, product, is, a, lifesaver, and, what, could, be, better, than, getting, it, at, almost, half, the, price, of, the, grocery, or, health, food, store!, , i, love, mccann's, instant, oatmeal, -, all, flavors!!!thanks,abby)","List(us, celiac, disease, product, lifesaver, better, getting, almost, half, price, grocery, health, food, store!, , love, mccann's, instant, oatmeal, -, flavors!!!thanks,abby)","Map(vectorType -> sparse, length -> 140052, indices -> List(0, 1, 7, 10, 14, 19, 32, 37, 137, 176, 178, 187, 248, 273, 561, 587, 2263, 3006, 5591, 6060, 9266, 15633, 93156, 102746), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0035563278478382998, 0.0035851788658388885, 0.0035470609172359017, 0.00358105645855103, 0.0035476530324701347, 0.003554050511791025, 0.967959491323229, 0.003557536315526835, 0.003553652390020925, 0.003557992337497972))","Map(vectorType -> dense, length -> 11, values -> List(2.0, 0.0035563278478382998, 0.0035851788658388885, 0.0035470609172359017, 0.00358105645855103, 0.0035476530324701347, 0.003554050511791025, 0.967959491323229, 0.003557536315526835, 0.003553652390020925, 0.003557992337497972))"
39,B001EO5QW8,4,GOOD WAY TO START THE DAY....,"I WAS VISITING MY FRIEND NATE THE OTHER MORNING FOR COFFEE , HE CAME OUT OF HIS STORAGE ROOM WITH ( A PACKET OF McCANNS INSTANT IRISH OATMEAL .) HE SUGGESTED THAT I TRY IT FOR MY OWN USE ,IN MY STASH . SOMETIMES NATE DOSE NOT GIVE YOU A CHANCE TO SAY NO , SO I ENDED UP TRYING THE APPLE AND CINN . FOUND IT TO BE VERY TASTEFULL WHEN MADE WITH WATER OR POWDERED MILK . IT GOES GOOD WITH O.J. AND COFFEE AND A SLICE OF TOAST AND YOUR READY TO TAKE ON THE WORLD...OR THE DAY AT LEAST.. JERRY REITH...",positive,0.0,"List(i, was, visiting, my, friend, nate, the, other, morning, for, coffee, ,, he, came, out, of, his, storage, room, with, (, a, packet, of, mccanns, instant, irish, oatmeal, .), he, suggested, that, i, try, it, for, my, own, use, ,in, my, stash, ., sometimes, nate, dose, not, give, you, a, chance, to, say, no, ,, so, i, ended, up, trying, the, apple, and, cinn, ., found, it, to, be, very, tastefull, when, made, with, water, or, powdered, milk, ., it, goes, good, with, o.j., and, coffee, and, a, slice, of, toast, and, your, ready, to, take, on, the, world...or, the, day, at, least.., , jerry, reith...)","List(visiting, friend, nate, morning, coffee, ,, came, storage, room, (, packet, mccanns, instant, irish, oatmeal, .), suggested, try, use, ,in, stash, ., sometimes, nate, dose, give, chance, say, ,, ended, trying, apple, cinn, ., found, tastefull, made, water, powdered, milk, ., goes, good, o.j., coffee, slice, toast, ready, take, world...or, day, least.., , jerry, reith...)","Map(vectorType -> sparse, length -> 140052, indices -> List(0, 4, 8, 17, 29, 36, 40, 48, 51, 90, 118, 129, 145, 162, 173, 285, 350, 373, 517, 552, 561, 585, 587, 685, 780, 839, 870, 940, 1039, 1153, 1231, 1442, 1792, 2020, 2406, 2506, 2733, 2858, 3346, 26236, 31675, 38276, 38755, 40748, 45057, 47885, 63347, 92585, 122990, 138066), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0015849085647570945, 0.0015977358711001019, 0.0015807608786107157, 0.0015958958755557457, 0.0015810419767963986, 0.0015838714709439814, 0.9857208736250537, 0.00158542262426886, 0.0015838419286062818, 0.0015856471843070643))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.0015849085647570945, 0.0015977358711001019, 0.0015807608786107157, 0.0015958958755557457, 0.0015810419767963986, 0.0015838714709439814, 0.9857208736250537, 0.00158542262426886, 0.0015838419286062818, 0.0015856471843070643))"
41,B001EO5QW8,5,Why wouldn't you buy oatmeal from Mcanns? Tastes great!,"The variety packs taste great! I have them every morning. At $0.30 cents per meal, I don't understand why everyone on earth isn't buying this stuff up. Maple and brown sugar is terrific, followed by apples and cinnamon, followed by regular. You don't get tired of the same ole thing, and they taste great. I just boil water from a small pot, empty the packet or 2 in a bowl, pour in boiling water, and watch it expand to 2x its size! Taste really good and takes minutes to prepare. Not sure why everyone on earth isn't this. Convenient, healthy, very quick, excellent quality, and extremely cheap...",negative,1.0,"List(the, variety, packs, taste, great!i, have, them, every, morning., at, $0.30, cents, per, meal,, i, don't, understand, why, everyone, on, earth, isn't, buying, this, stuff, up.maple, and, brown, sugar, is, terrific,, followed, by, apples, and, cinnamon,, followed, by, regular., you, don't, get, tired, of, the, same, ole, thing,, and, they, taste, great.i, just, boil, water, from, a, small, pot,, empty, the, packet, or, 2, in, a, bowl,, pour, in, boiling, water,, and, watch, it, expand, to, 2x, its, size!taste, really, good, and, takes, minutes, to, prepare.not, sure, why, everyone, on, earth, isn't, this., convenient,, healthy,, very, quick,, excellent, quality,, and, extremely, cheap...)","List(variety, packs, taste, great!i, every, morning., $0.30, cents, per, meal,, understand, everyone, earth, buying, stuff, up.maple, brown, sugar, terrific,, followed, apples, cinnamon,, followed, regular., get, tired, ole, thing,, taste, great.i, boil, water, small, pot,, empty, packet, 2, bowl,, pour, boiling, water,, watch, expand, 2x, size!taste, really, good, takes, minutes, prepare.not, sure, everyone, earth, this., convenient,, healthy,, quick,, excellent, quality,, extremely, cheap...)","Map(vectorType -> sparse, length -> 140052, indices -> List(1, 4, 6, 9, 12, 31, 51, 60, 70, 77, 87, 100, 101, 140, 147, 207, 274, 324, 388, 421, 430, 449, 485, 491, 629, 685, 803, 997, 1068, 1101, 1132, 1158, 1408, 1446, 1454, 1470, 1650, 1770, 1886, 1907, 2025, 2294, 2715, 2938, 3072, 3101, 3260, 3381, 3394, 3740, 5342, 6052, 7000, 7042, 8743, 9319, 10624, 26696, 30567, 32062, 38104, 137414), values -> List(5.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0012323266644889514, 0.001242313804396048, 0.001229106498273743, 0.0012408743783171655, 0.0012293147605256342, 0.0012315201666468047, 0.9888975103138901, 0.0012327300933465007, 0.0012313958597548677, 0.0012329074603604385))","Map(vectorType -> dense, length -> 11, values -> List(1.0, 0.0012323266644889514, 0.001242313804396048, 0.001229106498273743, 0.0012408743783171655, 0.0012293147605256342, 0.0012315201666468047, 0.9888975103138901, 0.0012327300933465007, 0.0012313958597548677, 0.0012329074603604385))"
43,B001EO5QW8,5,Food-Great,I have McCann's Oatmeal every morning and by ordering it from Amazon I am able to save almost $3.00 per box. It is a great product. Tastes great and very healthy,positive,0.0,"List(i, have, mccann's, oatmeal, every, morning, and, by, ordering, it, from, amazon, i, am, able, to, save, almost, $3.00, per, box.it, is, a, great, product., tastes, great, and, very, healthy)","List(mccann's, oatmeal, every, morning, ordering, amazon, able, save, almost, $3.00, per, box.it, great, product., tastes, great, healthy)","Map(vectorType -> sparse, length -> 140052, indices -> List(5, 35, 43, 60, 105, 124, 137, 147, 206, 285, 299, 308, 523, 561, 4192, 6060, 10860), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.004683735133434455, 0.004721674262558895, 0.004671509595059264, 0.004716266161473063, 0.004672288456731633, 0.004680675424606294, 0.9578024710712074, 0.004685273996681718, 0.0046801780384107435, 0.0046859278598365365))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.004683735133434455, 0.004721674262558895, 0.004671509595059264, 0.004716266161473063, 0.004672288456731633, 0.004680675424606294, 0.9578024710712074, 0.004685273996681718, 0.0046801780384107435, 0.0046859278598365365))"
47,B001EO5QW8,5,good,Good oatmeal. I like the apple cinnamon the best. Though I wouldn't follow the directions on the package since it always comes out too soupy for my taste. That could just be me since I like my oatmeal really thick to add some milk on top of.,positive,0.0,"List(good, oatmeal., , i, like, the, apple, cinnamon, the, best., , though, i, wouldn't, follow, the, directions, on, the, package, since, it, always, comes, out, too, soupy, for, my, taste., , that, could, just, be, me, since, i, like, my, oatmeal, really, thick, to, add, some, milk, on, top, of.)","List(good, oatmeal., , like, apple, cinnamon, best., , though, follow, directions, package, since, always, comes, soupy, taste., , since, like, oatmeal, really, thick, add, milk, top, of.)","Map(vectorType -> sparse, length -> 140052, indices -> List(0, 2, 4, 12, 47, 66, 67, 145, 150, 193, 212, 215, 279, 418, 552, 561, 628, 757, 1128, 1714, 2635, 2987, 14865), values -> List(3.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.003174084684202073, 0.0031997923519918895, 0.003165779565866592, 0.003196107262509054, 0.0031663327387300507, 0.0031720112703257364, 0.9714035033625242, 0.003175129913737266, 0.003171684916983103, 0.003175573933129953))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.003174084684202073, 0.0031997923519918895, 0.003165779565866592, 0.003196107262509054, 0.0031663327387300507, 0.0031720112703257364, 0.9714035033625242, 0.003175129913737266, 0.003171684916983103, 0.003175573933129953))"
59,B004N5KULM,5,Awsome - Kids in neighborhood loved us!,"Great product, nice combination of chocolates and perfect size! The bags had plenty, and they were shipped promptly. The kids in the neighborhood liked our candies!",positive,0.0,"List(great, product,, nice, combination, of, chocolates, and, perfect, size!, , the, bags, had, plenty,, and, they, were, shipped, promptly., , the, kids, in, the, neighborhood, liked, our, candies!)","List(great, product,, nice, combination, chocolates, perfect, size!, , bags, plenty,, shipped, promptly., , kids, neighborhood, liked, candies!)","Map(vectorType -> sparse, length -> 140052, indices -> List(0, 5, 63, 114, 155, 263, 284, 321, 589, 909, 1625, 5080, 6241, 12716, 26290, 36502), values -> List(2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.004945186493506853, 0.004985238502044831, 0.004932268366668754, 0.004979562408511787, 0.0049331507124178615, 0.0049420157910921685, 0.9554467339374283, 0.004946860854188972, 0.004941439002172584, 0.004947543931967994))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.004945186493506853, 0.004985238502044831, 0.004932268366668754, 0.004979562408511787, 0.0049331507124178615, 0.0049420157910921685, 0.9554467339374283, 0.004946860854188972, 0.004941439002172584, 0.004947543931967994))"


In [0]:
(training,testing) = processed.randomSplit([0.8,0.2], seed=100)

In [0]:
from pyspark.ml.classification import RandomForestClassifier
nb = RandomForestClassifier(numTrees=3, maxDepth=2, featuresCol='features', labelCol="Score", seed=100)
fitted = nb.fit(training)
amazon_prediction = fitted.transform(testing)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="Score", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(amazon_prediction)
print("Accuracy of model at predicting user rating was: {}".format(accuracy))

Accuracy of model at predicting user rating was: 0.6345581802274716
