In [46]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import udf
from pyspark.sql.types import *
import re

In [59]:
# Obtain the SparkSession used by the rest of the notebook
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('ml')
    .getOrCreate()
)

In [60]:
# Read Toys_and_Games_5.json and keep only the review text column.
data1 = (
    spark.read
    .json("C:\\Users\\ums_guolinye\\Desktop\\Toys_and_Games_5.json")
    .select("reviewText")
)

In [61]:
# Preview the raw reviewText column (the output below reports the top 20 rows).
data1.show()

+--------------------+
|          reviewText|
+--------------------+
|I like the item p...|
|Love the magnet e...|
|Both sides are ma...|
|Bought one a few ...|
|I have a stainles...|
|this is a nice ma...|
|This is just as I...|
|My granddaughter ...|
|This is good prod...|
|I keep this board...|
|Very nice to use ...|
|Bought this board...|
|The board had to ...|
|sturdy and perfec...|
|We had purchased ...|
|We purchased this...|
|This easel is the...|
|This is my absolu...|
|Working with Mand...|
|This book contain...|
+--------------------+
only showing top 20 rows



In [62]:
def clean_data(text):
    """Normalize a raw review string so it can be tokenized.

    The text is lower-cased, every run of characters other than ``a``-``z``
    and space is replaced by a single space, repeated whitespace is collapsed
    and leading/trailing whitespace is stripped.

    Args:
        text: The raw review text. Non-string values are converted with
            ``str()``. ``None`` (a missing review) is treated as empty text.

    Returns:
        str: The cleaned text; ``""`` when ``text`` is ``None``.
    """
    # A missing review must not be stringified into the literal word "none",
    # which would otherwise be counted as a real token downstream.
    if text is None:
        return ""
    # Convert all letters to lower case.
    content = str(text).lower()
    # Keep only words: every other run of characters becomes one space.
    content = re.sub(r'[^a-z ]+', ' ', content)
    # Collapse repeated whitespace into a single space and trim the ends.
    content = re.sub(r'\s+', ' ', content).strip()
    return content

In [63]:
# Wrap clean_data as a Spark UDF that returns a string column.
clean_data_udf = udf(clean_data, returnType=StringType())
# Preprocess the reviewText column of data1 with clean_data and store the
# result in a new column named reviewText_clean.
data1 = data1.withColumn(
    "reviewText_clean",
    clean_data_udf("reviewText"),
)

In [64]:
# Preview the cleaned text produced by clean_data_udf.
data1.select("reviewText_clean").show()

+--------------------+
|    reviewText_clean|
+--------------------+
|i like the item p...|
|love the magnet e...|
|both sides are ma...|
|bought one a few ...|
|i have a stainles...|
|this is a nice ma...|
|this is just as i...|
|my granddaughter ...|
|this is good prod...|
|i keep this board...|
|very nice to use ...|
|bought this board...|
|the board had to ...|
|sturdy and perfec...|
|we had purchased ...|
|we purchased this...|
|this easel is the...|
|this is my absolu...|
|working with mand...|
|this book contain...|
+--------------------+
only showing top 20 rows



In [65]:
# Tokenizer: turn each cleaned review into a list of words in the "words" column.
# NOTE(review): data1 is overwritten here, so this cell is presumably not safe
# to run twice without re-running the cells above - confirm.
tokenizer = Tokenizer(
    inputCol="reviewText_clean",
    outputCol="words",
)
data1 = tokenizer.transform(data1)

In [66]:
# Inspect data1 after tokenization: the new "words" column sits beside the cleaned text.
data1.show()

+--------------------+--------------------+--------------------+
|          reviewText|    reviewText_clean|               words|
+--------------------+--------------------+--------------------+
|I like the item p...|i like the item p...|[i, like, the, it...|
|Love the magnet e...|love the magnet e...|[love, the, magne...|
|Both sides are ma...|both sides are ma...|[both, sides, are...|
|Bought one a few ...|bought one a few ...|[bought, one, a, ...|
|I have a stainles...|i have a stainles...|[i, have, a, stai...|
|this is a nice ma...|this is a nice ma...|[this, is, a, nic...|
|This is just as I...|this is just as i...|[this, is, just, ...|
|My granddaughter ...|my granddaughter ...|[my, granddaughte...|
|This is good prod...|this is good prod...|[this, is, good, ...|
|I keep this board...|i keep this board...|[i, keep, this, b...|
|Very nice to use ...|very nice to use ...|[very, nice, to, ...|
|Bought this board...|bought this board...|[bought, this, bo...|
|The board had to ...|the

In [67]:
# Drop stop words from the token lists; the filtered lists go to "words_sw_removed".
swr = StopWordsRemover(
    inputCol='words',
    outputCol='words_sw_removed',
)
data1 = swr.transform(data1)

In [68]:
# Inspect data1 after stop-word removal: "words_sw_removed" is the newest column.
data1.show()

+--------------------+--------------------+--------------------+--------------------+
|          reviewText|    reviewText_clean|               words|    words_sw_removed|
+--------------------+--------------------+--------------------+--------------------+
|I like the item p...|i like the item p...|[i, like, the, it...|[like, item, pric...|
|Love the magnet e...|love the magnet e...|[love, the, magne...|[love, magnet, ea...|
|Both sides are ma...|both sides are ma...|[both, sides, are...|[sides, magnetic,...|
|Bought one a few ...|bought one a few ...|[bought, one, a, ...|[bought, one, yea...|
|I have a stainles...|i have a stainles...|[i, have, a, stai...|[stainless, steel...|
|this is a nice ma...|this is a nice ma...|[this, is, a, nic...|[nice, magnetic, ...|
|This is just as I...|this is just as i...|[this, is, just, ...|[expected, sturdy...|
|My granddaughter ...|my granddaughter ...|[my, granddaughte...|[granddaughter, r...|
|This is good prod...|this is good prod...|[this, is, 

In [70]:
# Show only the stop-word-filtered token lists.
data1.select("words_sw_removed").show()

+--------------------+
|    words_sw_removed|
+--------------------+
|[like, item, pric...|
|[love, magnet, ea...|
|[sides, magnetic,...|
|[bought, one, yea...|
|[stainless, steel...|
|[nice, magnetic, ...|
|[expected, sturdy...|
|[granddaughter, r...|
|[good, product, u...|
|[keep, board, top...|
|[nice, use, magne...|
|[bought, board, a...|
|[board, put, away...|
|[sturdy, perfect,...|
|[purchased, melis...|
|[purchased, go, m...|
|[easel, perfect, ...|
|[absolute, favori...|
|[working, mandala...|
|[book, contains, ...|
+--------------------+
only showing top 20 rows



In [72]:
# 20 most frequent words in data1 after stop-word removal (RDD word count).
(
    data1.select("words_sw_removed")
    .rdd
    .flatMap(lambda row: row[0])                    # one element per token
    .map(lambda token: (token, 1))                  # pair every token with a count of 1
    .reduceByKey(lambda left, right: left + right)  # sum the counts per token
    .sortBy(lambda pair: pair[1], ascending=False)  # highest count first
    .take(20)
)

[('one', 78092),
 ('game', 69250),
 ('old', 65264),
 ('toy', 63570),
 ('like', 61600),
 ('great', 59844),
 ('play', 57771),
 ('little', 55993),
 ('fun', 54362),
 ('year', 50977),
 ('get', 46928),
 ('kids', 45757),
 ('really', 45385),
 ('set', 43058),
 ('well', 42785),
 ('loves', 40324),
 ('love', 38779),
 ('time', 37771),
 ('son', 36033),
 ('also', 35690)]

In [73]:
#####################################################################

In [74]:
# Read Sports_and_Outdoors_5.json and keep only the review text column.
data2 = (
    spark.read
    .json("C:\\Users\\ums_guolinye\\Desktop\\Sports_and_Outdoors_5.json")
    .select("reviewText")
)

In [75]:
# Apply the same clean_data_udf preprocessing used for data1.
data2 = data2.withColumn(
    "reviewText_clean",
    clean_data_udf("reviewText"),
)

In [76]:
# Reuse the Tokenizer configured above (reviewText_clean -> words).
data2 = tokenizer.transform(data2)

In [77]:
# Reuse the StopWordsRemover configured above (words -> words_sw_removed).
data2 = swr.transform(data2)

In [78]:
# 20 most frequent words in data2 after stop-word removal (RDD word count).
(
    data2.select("words_sw_removed")
    .rdd
    .flatMap(lambda row: row[0])                    # one element per token
    .map(lambda token: (token, 1))                  # pair every token with a count of 1
    .reduceByKey(lambda left, right: left + right)  # sum the counts per token
    .sortBy(lambda pair: pair[1], ascending=False)  # highest count first
    .take(20)
)

[('one', 114907),
 ('great', 102257),
 ('use', 96282),
 ('like', 94927),
 ('well', 94530),
 ('good', 93356),
 ('get', 72470),
 ('knife', 63705),
 ('little', 53132),
 ('really', 49579),
 ('also', 49513),
 ('easy', 47885),
 ('much', 47461),
 ('time', 46635),
 ('price', 46305),
 ('used', 46286),
 ('fit', 46045),
 ('m', 45556),
 ('product', 43594),
 ('made', 42420)]

In [None]:
# Read VideoGames.json and keep only the review text column.
data3 = (
    spark.read
    .json("C:\\Users\\ums_guolinye\\Desktop\\VideoGames.json")
    .select("reviewText")
)

In [None]:
# Apply the same clean_data_udf preprocessing used for the other datasets.
data3 = data3.withColumn(
    "reviewText_clean",
    clean_data_udf("reviewText"),
)

In [None]:
# Reuse the Tokenizer configured above (reviewText_clean -> words).
data3 = tokenizer.transform(data3)

In [None]:
# Reuse the StopWordsRemover configured above (words -> words_sw_removed).
data3 = swr.transform(data3)

In [None]:
# 20 most frequent words in data3 after stop-word removal (RDD word count).
(
    data3.select("words_sw_removed")
    .rdd
    .flatMap(lambda row: row[0])                    # one element per token
    .map(lambda token: (token, 1))                  # pair every token with a count of 1
    .reduceByKey(lambda left, right: left + right)  # sum the counts per token
    .sortBy(lambda pair: pair[1], ascending=False)  # highest count first
    .take(20)
)