# Predicting Product Ratings using Customer Reviews

In [0]:
# start the timer
import time
start_time = time.time()

## Environment Setup

**`Classroom-Setup`**

In [0]:
%run ./Includes/Classroom-Setup

### Loading the DataFrame

In [0]:
textDF = spark.read.csv("/mnt/training/reviews/reviews.csv", inferSchema = True, header=True, escape='"')
display(textDF.limit(1000))

Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most.
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo""."
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all","This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis' ""The Lion, The Witch, and The Wardrobe"" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch."
4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient in Robitussin I believe I have found it. I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda. The flavor is very medicinal.
5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,"Great taffy at a great price. There was a wide assortment of yummy taffy. Delivery was very quick. If your a taffy lover, this is a deal."
6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,"I got a wild hair for taffy and ordered this five pound bag. The taffy was all very enjoyable with many flavors: watermelon, root beer, melon, peppermint, grape, etc. My only complaint is there was a bit too much red/black licorice-flavored pieces (just not my particular favorites). Between me, my kids, and my husband, this lasted only two weeks! I would recommend this brand of taffy -- it was a delightful treat."
7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,"This saltwater taffy had great flavors and was very soft and chewy. Each candy was individually wrapped well. None of the candies were stuck together, which did happen in the expensive version, Fralinger's. Would highly recommend this candy! I served it at a beach-themed party and everyone loved it!"
8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and chewy. The flavors are amazing. I would definitely recommend you buying it. Very satisfying!!
9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my cats can eat the grass. They love it. I rotate it around with Wheatgrass and Rye too
10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.


In [0]:
# check numbers of rows and columns

print((textDF.count(), len(textDF.columns)))

(568454, 10)


In [0]:
# sample 10% data for debugging
textDF = textDF.sample(False, 0.1)

### Selecting Columns

In [0]:
textDF = textDF.select("Id", "ProductId", "Score", "Summary", "Text")

textDF.cache().count()

Out[14]: 57119

In [0]:
textDF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [0]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col

textDF = textDF.withColumn('Score', col('Score').cast(IntegerType()))

In [0]:
textDF.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



## LDA

In [0]:
import sparknlp

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from nltk.corpus import stopwords

from pyspark.ml import Pipeline


In [0]:
## text cleaning annotators & Sentiment Analysis stages
documentAssembler = DocumentAssembler()\
    .setInputCol("Text")\
    .setOutputCol("document")
    
senEmbedder = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

sentimentModel = SentimentDLModel.pretrained(name='sentimentdl_use_twitter', lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setOutputCols(["sentiment_result2"]) \
    .setOutputAsArray(False) \
    .setCleanAnnotations(True)

from pyspark.ml.feature import StringIndexer

## string indexer on sentiment results 
sentIndex = StringIndexer(inputCol='sentiment_result2',outputCol='sent_label')

from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer


## LDA pre precessing steps
tokenizer = Tokenizer(inputCol="Text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')


from pyspark.ml.clustering import LDA

# Trains a LDA model.
lda = LDA(k=10, maxIter=10, featuresCol='c_vec')

## assemble features vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
assembler = VectorAssembler(inputCols=['sent_label','topicDistribution'],outputCol='features')

## create pipeline for sentiment analysis, lda, and vector assembler
from pyspark.ml import Pipeline


lda_pipe = Pipeline(stages=[documentAssembler, senEmbedder, sentimentModel, finisher, sentIndex, tokenizer, stopremove, count_vec, lda, assembler])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[ | ][ / ][ — ][OK!]


In [0]:
lda_model = lda_pipe.fit(textDF)
processed = lda_model.transform(textDF).persist()

In [0]:
display(processed)

Id,ProductId,Score,Summary,Text,sentiment_result2,sent_label,token_text,stop_tokens,c_vec,topicDistribution,features
5,B006K2ZZ7K,5,Great taffy,"Great taffy at a great price. There was a wide assortment of yummy taffy. Delivery was very quick. If your a taffy lover, this is a deal.",positive,0.0,"List(great, taffy, at, a, great, price., , there, was, a, wide, assortment, of, yummy, taffy., , delivery, was, very, quick., , if, your, a, taffy, lover,, this, is, a, deal.)","List(great, taffy, great, price., , wide, assortment, yummy, taffy., , delivery, quick., , taffy, lover,, deal.)","Map(vectorType -> sparse, length -> 138549, indices -> List(0, 5, 285, 698, 820, 1253, 2323, 2773, 3925, 4409, 5339, 19283), values -> List(3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.005236686550598841, 0.005224697304993408, 0.00522621758721869, 0.005229692881361002, 0.005224523817218892, 0.005244311001291487, 0.005233420393775082, 0.005249792319068091, 0.9528979943942993, 0.005232663750175271))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.005236686550598841, 0.005224697304993408, 0.00522621758721869, 0.005229692881361002, 0.005224523817218892, 0.005244311001291487, 0.005233420393775082, 0.005249792319068091, 0.9528979943942993, 0.005232663750175271))"
17,B001GVISJM,2,poor taste,I love eating them and they are good for watching TV and looking at movies! It is not too sweet. I like to transfer them to a zip lock baggie so they stay fresh so I can take my time eating them.,positive,0.0,"List(i, love, eating, them, and, they, are, good, for, watching, tv, and, looking, at, movies!, it, is, not, too, sweet., i, like, to, transfer, them, to, a, zip, lock, baggie, so, they, stay, fresh, so, i, can, take, my, time, eating, them.)","List(love, eating, good, watching, tv, looking, movies!, sweet., like, transfer, zip, lock, baggie, stay, fresh, take, time, eating, them.)","Map(vectorType -> sparse, length -> 138549, indices -> List(2, 4, 7, 32, 74, 88, 119, 120, 156, 591, 728, 1455, 3469, 4306, 5026, 6266, 8137, 36291), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.004448161108790056, 0.004437972971644085, 0.004439283652490741, 0.00444222746998604, 0.00443784311581882, 0.004454759506764526, 0.004445396661996366, 0.004459320533416299, 0.959990285193029, 0.004444749786064036))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.004448161108790056, 0.004437972971644085, 0.004439283652490741, 0.00444222746998604, 0.00443784311581882, 0.004454759506764526, 0.004445396661996366, 0.004459320533416299, 0.959990285193029, 0.004444749786064036))"
23,B001GVISJM,5,Delicious product!,I can remember buying this candy as a kid and the quality hasn't dropped in all these years. Still a superb product you won't be disappointed with.,negative,1.0,"List(i, can, remember, buying, this, candy, as, a, kid, and, the, quality, hasn't, dropped, in, all, these, years., still, a, superb, product, you, won't, be, disappointed, with.)","List(remember, buying, candy, kid, quality, dropped, years., still, superb, product, disappointed, with.)","Map(vectorType -> sparse, length -> 138549, indices -> List(10, 47, 97, 111, 268, 401, 531, 762, 1463, 1667, 2972, 3853), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0068575941950072515, 0.006841856587054536, 0.0068438490610422354, 0.006848416651910043, 0.006841622393480426, 0.0068675501158173705, 0.0068532686488196395, 0.006874717043881701, 0.9383188176985096, 0.006852307604477306))","Map(vectorType -> dense, length -> 11, values -> List(1.0, 0.0068575941950072515, 0.006841856587054536, 0.0068438490610422354, 0.006848416651910043, 0.006841622393480426, 0.0068675501158173705, 0.0068532686488196395, 0.006874717043881701, 0.9383188176985096, 0.006852307604477306))"
26,B001GVISJM,5,Twizzlers - Strawberry,"Product received is as advertised. Twizzlers, Strawberry, 16-Ounce Bags (Pack of 6)",neutral,2.0,"List(product, received, is, as, advertised.twizzlers,, strawberry,, 16-ounce, bags, (pack, of, 6))","List(product, received, advertised.twizzlers,, strawberry,, 16-ounce, bags, (pack, 6))","Map(vectorType -> sparse, length -> 138549, indices -> List(1, 10, 153, 201, 476, 1591, 3421, 4989, 6809, 22250, 130395), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.007433276685292916, 0.007416106950796431, 0.007418319642480226, 0.007423255779819619, 0.007415873603134902, 0.007444007433666162, 0.007428609606855451, 0.007451769827574132, 0.9331411780277029, 0.0074276024426772605))","Map(vectorType -> dense, length -> 11, values -> List(2.0, 0.007433276685292916, 0.007416106950796431, 0.007418319642480226, 0.007423255779819619, 0.007415873603134902, 0.007444007433666162, 0.007428609606855451, 0.007451769827574132, 0.9331411780277029, 0.0074276024426772605))"
31,B003F6UO7K,5,Great machine!,"I have never been a huge coffee fan. However, my mother purchased this little machine and talked me into trying the Latte Macciato. No Coffee Shop has a better one and I like most of the other products, too (as a usually non-coffee drinker!). The little Dolche Guesto Machine is super easy to use and prepares a really good Coffee/Latte/Cappuccino/etc in less than a minute (if water is heated up). I would recommend the Dolce Gusto to anyone. Too good for the price and I'am getting one myself! :)",positive,0.0,"List(i, have, never, been, a, huge, coffee, fan., however,, my, mother, purchased, this, little, machine, and, talked, me, into, trying, the, latte, macciato., no, coffee, shop, has, a, better, one, and, i, like, most, of, the, other, products,, too, (as, a, usually, non-coffee, drinker!).the, little, dolche, guesto, machine, is, super, easy, to, use, and, prepares, a, really, good, coffee/latte/cappuccino/etc, in, less, than, a, minute, (if, water, is, heated, up)., i, would, recommend, the, dolce, gusto, to, anyone., too, good, for, the, price, and, i'am, getting, one, myself!, :))","List(never, huge, coffee, fan., however,, mother, purchased, little, machine, talked, trying, latte, macciato., coffee, shop, better, one, like, products,, (as, usually, non-coffee, drinker!).the, little, dolche, guesto, machine, super, easy, use, prepares, really, good, coffee/latte/cappuccino/etc, less, minute, (if, water, heated, up)., recommend, dolce, gusto, anyone., good, price, i'am, getting, one, myself!, :))","Map(vectorType -> sparse, length -> 138549, indices -> List(2, 3, 4, 8, 12, 15, 18, 30, 35, 51, 55, 57, 76, 103, 109, 144, 149, 168, 169, 191, 379, 478, 520, 603, 887, 1054, 1262, 1413, 1542, 1747, 1829, 2594, 2966, 3351, 3929, 5506, 9001, 10172, 11299, 15336, 19246, 26264, 94317, 95649, 115864, 127097, 131439), values -> List(1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0016746016409127716, 0.0016707700079147648, 0.001671251067587489, 0.0016723677162075853, 0.0016707195241840809, 0.001677059022750817, 0.0016735493175676199, 0.0016788067304249923, 0.9849375417187216, 0.0016733332537283018))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.0016746016409127716, 0.0016707700079147648, 0.001671251067587489, 0.0016723677162075853, 0.0016707195241840809, 0.001677059022750817, 0.0016735493175676199, 0.0016788067304249923, 0.9849375417187216, 0.0016733332537283018))"
37,B001EO5QW8,5,Love Gluten Free Oatmeal!!!,"For those of us with celiac disease this product is a lifesaver and what could be better than getting it at almost half the price of the grocery or health food store! I love McCann's instant oatmeal - all flavors!!! Thanks, Abby",neutral,2.0,"List(for, those, of, us, with, celiac, disease, this, product, is, a, lifesaver, and, what, could, be, better, than, getting, it, at, almost, half, the, price, of, the, grocery, or, health, food, store!, , i, love, mccann's, instant, oatmeal, -, all, flavors!!!thanks,abby)","List(us, celiac, disease, product, lifesaver, better, getting, almost, half, price, grocery, health, food, store!, , love, mccann's, instant, oatmeal, -, flavors!!!thanks,abby)","Map(vectorType -> sparse, length -> 138549, indices -> List(0, 1, 7, 10, 14, 19, 30, 35, 135, 168, 179, 190, 247, 284, 555, 589, 1959, 3340, 5418, 6308, 11576, 18941, 72201, 92254), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0035558843633338604, 0.0035476338766518914, 0.003548676911433345, 0.0035510439371499846, 0.003547519818102123, 0.003560955023175933, 0.0035535465431012743, 0.0035646785776806164, 0.9680169957027245, 0.0035530652466466174))","Map(vectorType -> dense, length -> 11, values -> List(2.0, 0.0035558843633338604, 0.0035476338766518914, 0.003548676911433345, 0.0035510439371499846, 0.003547519818102123, 0.003560955023175933, 0.0035535465431012743, 0.0035646785776806164, 0.9680169957027245, 0.0035530652466466174))"
41,B001EO5QW8,5,Why wouldn't you buy oatmeal from Mcanns? Tastes great!,"The variety packs taste great! I have them every morning. At $0.30 cents per meal, I don't understand why everyone on earth isn't buying this stuff up. Maple and brown sugar is terrific, followed by apples and cinnamon, followed by regular. You don't get tired of the same ole thing, and they taste great. I just boil water from a small pot, empty the packet or 2 in a bowl, pour in boiling water, and watch it expand to 2x its size! Taste really good and takes minutes to prepare. Not sure why everyone on earth isn't this. Convenient, healthy, very quick, excellent quality, and extremely cheap...",negative,1.0,"List(the, variety, packs, taste, great!i, have, them, every, morning., at, $0.30, cents, per, meal,, i, don't, understand, why, everyone, on, earth, isn't, buying, this, stuff, up.maple, and, brown, sugar, is, terrific,, followed, by, apples, and, cinnamon,, followed, by, regular., you, don't, get, tired, of, the, same, ole, thing,, and, they, taste, great.i, just, boil, water, from, a, small, pot,, empty, the, packet, or, 2, in, a, bowl,, pour, in, boiling, water,, and, watch, it, expand, to, 2x, its, size!taste, really, good, and, takes, minutes, to, prepare.not, sure, why, everyone, on, earth, isn't, this., convenient,, healthy,, very, quick,, excellent, quality,, and, extremely, cheap...)","List(variety, packs, taste, great!i, every, morning., $0.30, cents, per, meal,, understand, everyone, earth, buying, stuff, up.maple, brown, sugar, terrific,, followed, apples, cinnamon,, followed, regular., get, tired, ole, thing,, taste, great.i, boil, water, small, pot,, empty, packet, 2, bowl,, pour, boiling, water,, watch, expand, 2x, size!taste, really, good, takes, minutes, prepare.not, sure, everyone, earth, this., convenient,, healthy,, quick,, excellent, quality,, extremely, cheap...)","Map(vectorType -> sparse, length -> 138549, indices -> List(1, 4, 6, 9, 12, 31, 51, 56, 71, 75, 89, 95, 97, 141, 143, 210, 291, 316, 415, 425, 426, 436, 484, 509, 608, 706, 883, 1023, 1101, 1103, 1208, 1319, 1420, 1431, 1472, 1518, 1658, 1791, 1842, 2152, 2444, 2504, 2665, 2827, 2922, 2942, 3091, 3511, 3941, 4017, 5757, 6049, 6509, 7058, 8360, 9005, 11164, 29712, 37966, 44962, 54432, 116852), values -> List(5.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0012321739157347047, 0.0012293486016686126, 0.0012297007333865976, 0.0012305245958652942, 0.0012293040360825372, 0.001233966147722107, 0.0012314002665761048, 0.001235248773784209, 0.9889171084849121, 0.001231224444267767))","Map(vectorType -> dense, length -> 11, values -> List(1.0, 0.0012321739157347047, 0.0012293486016686126, 0.0012297007333865976, 0.0012305245958652942, 0.0012293040360825372, 0.001233966147722107, 0.0012314002665761048, 0.001235248773784209, 0.9889171084849121, 0.001231224444267767))"
49,B001EO5QW8,4,Very good but next time I won't order the Variety Pack,"I really like the Maple and Brown Sugar flavor. The regular is fine with brown sugar added. The Apples and Cinnamon flavor is OK. This is a very quick, easy and satisfying breakfast and I'll order this brand again, but not the variety. I'll get all Maple and Brown Sugar.",positive,0.0,"List(i, really, like, the, maple, and, brown, sugar, flavor., the, regular, is, fine, with, brown, sugar, added., the, apples, and, cinnamon, flavor, is, ok., this, is, a, very, quick,, easy, and, satisfying, breakfast, and, i'll, order, this, brand, again,, but, not, the, variety., i'll, get, all, maple, and, brown, sugar.)","List(really, like, maple, brown, sugar, flavor., regular, fine, brown, sugar, added., apples, cinnamon, flavor, ok., quick,, easy, satisfying, breakfast, order, brand, again,, variety., get, maple, brown, sugar.)","Map(vectorType -> sparse, length -> 138549, indices -> List(2, 9, 12, 13, 59, 71, 102, 107, 109, 116, 362, 407, 426, 468, 564, 620, 1007, 1167, 1726, 1981, 2788, 2827, 3091), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0031737050920643867, 0.003166424412764953, 0.0031673494034961725, 0.0031694576730358494, 0.0031663225942597365, 0.003178307700970626, 0.003171710319112728, 0.0031816265546310814, 0.9714538348385695, 0.003171261411094901))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.0031737050920643867, 0.003166424412764953, 0.0031673494034961725, 0.0031694576730358494, 0.0031663225942597365, 0.003178307700970626, 0.003171710319112728, 0.0031816265546310814, 0.9714538348385695, 0.003171261411094901))"
58,B004N5KULM,5,How can you go wrong!,"It is chocolate, what can I say. Great variety of everything our family loves. With a family of six it goes fast here. Perfect variety. Kit Kat, Reeses, take five and more.",positive,0.0,"List(it, is, chocolate,, what, can, i, say., , great, variety, of, everything, our, family, loves., , with, a, family, of, six, it, goes, fast, here., , perfect, variety., , kit, kat,, reeses,, take, five, and, more.)","List(chocolate,, say., , great, variety, everything, family, loves., , family, six, goes, fast, here., , perfect, variety., , kit, kat,, reeses,, take, five, more.)","Map(vectorType -> sparse, length -> 138549, indices -> List(0, 5, 118, 119, 228, 291, 360, 417, 439, 486, 566, 748, 774, 819, 1716, 1981, 3618, 6820, 34702, 99881), values -> List(4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0035559173009226718, 0.0035477299447427644, 0.0035487757599385926, 0.003551122197303528, 0.0035475664390136404, 0.0035610591473660995, 0.003553674320090201, 0.003564769432283288, 0.9680161996817892, 0.003553185776549981))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.0035559173009226718, 0.0035477299447427644, 0.0035487757599385926, 0.003551122197303528, 0.0035475664390136404, 0.0035610591473660995, 0.003553674320090201, 0.003564769432283288, 0.9680161996817892, 0.003553185776549981))"
72,B001GVISJC,5,Bigger then other brands,"Grape gummy bears are hard to find in my area. In fact pretty much anyone I talk to about grape gummy bears they think I'm lying. So I bought 10lbs... : ) These bears are a little bit bigger then the other brands and have kind of sour kick, but nothing to strong. I love grape flavored candy/soda and these are pretty good. There is another company that makes grape gummy bears that are a little bit better in my opinion, but these are well worth it for the price. I like to use the gummy bears in home made Popsicles with flavored sports drink. The salt in the sports drink makes for softer popsicles, and the gummy bears are awesome frozen. They are delicious!",positive,0.0,"List(grape, gummy, bears, are, hard, to, find, in, my, area., in, fact, pretty, much, anyone, i, talk, to, about, grape, gummy, bears, they, think, i'm, lying., so, i, bought, 10lbs..., :, ), these, bears, are, a, little, bit, bigger, then, the, other, brands, and, have, kind, of, sour, kick,, but, nothing, to, strong., i, love, grape, flavored, candy/soda, and, these, are, pretty, good., there, is, another, company, that, makes, grape, gummy, bears, that, are, a, little, bit, better, in, my, opinion,, but, these, are, well, worth, it, for, the, price., i, like, to, use, the, gummy, bears, in, home, made, popsicles, with, flavored, sports, drink., the, salt, in, the, sports, drink, makes, for, softer, popsicles,, and, the, gummy, bears, are, awesome, frozen., they, are, delicious!)","List(grape, gummy, bears, hard, find, area., fact, pretty, much, anyone, talk, grape, gummy, bears, think, lying., bought, 10lbs..., :, ), bears, little, bit, bigger, brands, kind, sour, kick,, nothing, strong., love, grape, flavored, candy/soda, pretty, good., another, company, makes, grape, gummy, bears, little, bit, better, opinion,, well, worth, price., like, use, gummy, bears, home, made, popsicles, flavored, sports, drink., salt, sports, drink, makes, softer, popsicles,, gummy, bears, awesome, frozen., delicious!)","Map(vectorType -> sparse, length -> 138549, indices -> List(2, 7, 15, 16, 18, 23, 30, 38, 39, 41, 50, 53, 54, 72, 104, 110, 140, 142, 148, 180, 184, 238, 245, 277, 283, 285, 307, 333, 336, 747, 801, 829, 850, 1212, 1484, 1495, 1787, 1823, 2251, 2343, 2627, 2927, 3242, 3643, 4619, 8109, 9878, 24994, 36640, 46587, 79346, 88734), values -> List(1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 5.0, 1.0, 1.0, 4.0, 1.0, 6.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 10, values -> List(0.0012495336592845273, 0.0012466677672086275, 0.0012470374209115165, 0.0012478667428673152, 0.0012466273576024057, 0.0012513962131784073, 0.0012487500631953035, 0.0012526586873681225, 0.9887608855513422, 0.0012485765370415622))","Map(vectorType -> dense, length -> 11, values -> List(0.0, 0.0012495336592845273, 0.0012466677672086275, 0.0012470374209115165, 0.0012478667428673152, 0.0012466273576024057, 0.0012513962131784073, 0.0012487500631953035, 0.0012526586873681225, 0.9887608855513422, 0.0012485765370415622))"


In [0]:
(training,testing) = processed.randomSplit([0.8,0.2], seed=100)

In [0]:
from pyspark.ml.classification import RandomForestClassifier
nb = RandomForestClassifier(numTrees=3, maxDepth=2, featuresCol='features', labelCol="Score", seed=100)
fitted = nb.fit(training)
amazon_prediction = fitted.transform(testing)

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="Score", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(amazon_prediction)
print("Accuracy of model at predicting user rating was: {}".format(accuracy))