In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('nlp_musical').getOrCreate()

In [4]:
data = data = spark.read.json("Musical_Instruments_5.json")

In [5]:
data.show(5)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [6]:
from pyspark.sql.functions import *

In [7]:
data = data.withColumn('class', when(data.overall >=4, "like")
                               .when(data.overall <= 2, "not_like")
                               .otherwise("neutral"))


In [8]:
data = data.select("reviewText", "overall", "class")

### Clean and Prepare the Data

** Create a new length feature: **


In [9]:
from pyspark.sql.functions import length

In [10]:
data = data.withColumn('length',length(data['reviewText']))

In [11]:
data.show(10)

+--------------------+-------+-------+------+
|          reviewText|overall|  class|length|
+--------------------+-------+-------+------+
|Not much to write...|    5.0|   like|   268|
|The product does ...|    5.0|   like|   544|
|The primary job o...|    5.0|   like|   436|
|Nice windscreen p...|    5.0|   like|   206|
|This pop filter i...|    5.0|   like|   159|
|So good that I bo...|    5.0|   like|   234|
|I have used monst...|    5.0|   like|   191|
|I now use this ca...|    3.0|neutral|   845|
|Perfect for my Ep...|    5.0|   like|   201|
|Monster makes the...|    5.0|   like|   217|
+--------------------+-------+-------+------+
only showing top 10 rows



In [12]:
# Pretty Clear Difference
data.groupby('class').mean().show()

+--------+------------------+-----------------+
|   class|      avg(overall)|      avg(length)|
+--------+------------------+-----------------+
|not_like|1.5353319057815846|579.2055674518201|
| neutral|               3.0|579.2111398963731|
|    like|4.7690090888938155|473.1188206606074|
+--------+------------------+-----------------+



In [13]:
data.groupby('class').count().show()

+--------+-----+
|   class|count|
+--------+-----+
|not_like|  467|
| neutral|  772|
|    like| 9022|
+--------+-----+



### Feature Transformations

In [14]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer,IDF,StringIndexer
tokenizer = Tokenizer(inputCol="reviewText", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
class_to_num = StringIndexer(inputCol='class',outputCol='label')

In [15]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [16]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'],outputCol='features')

### The Model

We'll use Naive Bayes, but feel free to play around with this choice!


In [17]:
from pyspark.ml.classification import NaiveBayes

In [18]:
# Use defaults
nb = NaiveBayes()

### Pipeline

In [19]:
from pyspark.ml import Pipeline

In [21]:
data_prep_pipe = Pipeline(stages=[class_to_num,
                                  tokenizer,
                                  stopremove,
                                  count_vec,
                                  idf,
                                  clean_up])

In [22]:
cleaner = data_prep_pipe.fit(data)

In [23]:
clean_data = cleaner.transform(data)

### Training and Evaluation!

In [24]:
clean_data = clean_data.select(['label','features'])

In [25]:
clean_data.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[3,12,14,3...|
|  0.0|(51949,[2,3,12,16...|
|  0.0|(51949,[11,19,44,...|
|  0.0|(51949,[18,37,57,...|
|  0.0|(51949,[2,122,132...|
|  0.0|(51949,[0,5,15,21...|
|  0.0|(51949,[5,16,29,1...|
|  1.0|(51949,[1,3,4,8,1...|
|  0.0|(51949,[0,3,12,33...|
|  0.0|(51949,[1,6,15,52...|
+-----+--------------------+
only showing top 10 rows



In [26]:
(training,testing) = clean_data.randomSplit([0.7,0.3])

In [27]:
spam_predictor = nb.fit(training)

In [28]:
data.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- class: string (nullable = false)
 |-- length: integer (nullable = true)



In [29]:
test_results = spam_predictor.transform(testing)

In [30]:
test_results.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(51949,[0],[1.025...|[-6.4364458791922...|[0.88254556731249...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-9464.3199117707...|[5.41364644463281...|       1.0|
|  0.0|(51949,[0,1,2,3,4...|[-37790.085841850...|[9.61392136634175...|       1.0|
|  0.0|(51949,[0,1,2,3,4...|[-22194.003180576...|[1.49645589416267...|       1.0|
|  0.0|(51949,[0,1,2,3,4...|[-3891.9440090324...|[1.0,2.6511348389...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-20912.362050739...|[1.0,1.0446542006...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-9464.4172761728...|[1.0,1.9741604603...|       0.0|
|  0.0|(51949,[0,1,2,3,4...|[-4102.3344908907...|[6.99359242030503...|       1.0|
|  0.0|(51949,[0,1,2,3,5...|[-11856.051529633...|[1.0,1.8591330129...|       0.0|
|  0.0|(51949,[0

In [31]:
test_results.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   65|
|  1.0|       1.0|   75|
|  0.0|       1.0|  514|
|  1.0|       0.0|  157|
|  2.0|       2.0|   42|
|  2.0|       1.0|   39|
|  1.0|       2.0|   24|
|  0.0|       0.0| 2050|
|  0.0|       2.0|  213|
+-----+----------+-----+



In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [33]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))

Accuracy of model at predicting spam was: 0.7320968816341021


In [34]:
## Higher accuracy but not better result!!!

### Use LogisticRegression/Random Forest

#### Logistic Regression

In [37]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

In [38]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [39]:
predictor_1 = lg.fit(training)

In [40]:
test_results_1 = predictor_1.transform(testing)

In [42]:
# Create a confusion matrix
test_results_1.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  141|
|  0.0|       1.0|    7|
|  1.0|       0.0|  251|
|  0.0|       0.0| 2766|
|  0.0|       2.0|    4|
|  1.0|       1.0|    2|
|  2.0|       2.0|    5|
|  1.0|       2.0|    3|
+-----+----------+-----+



In [43]:
acc_eval = MulticlassClassificationEvaluator()
acc_1 = acc_eval.evaluate(test_results_1)
print("Accuracy of model at predicting: {}".format(acc_1))

Accuracy of model at predicting: 0.81835169568789


In [44]:
## Higher accuracy but not better result!!!

### Random forest

In [45]:
rf = RandomForestClassifier(labelCol="label", 
                            featuresCol="features", 
                            numTrees = 500, 
                            maxDepth = 5, 
                            maxBins = 64)

In [46]:
predictor_2 = rf.fit(training)

In [47]:
test_results_2 = predictor_2.transform(testing)

In [48]:
# Create a confusion matrix
test_results_2.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  146|
|  1.0|       0.0|  256|
|  0.0|       0.0| 2777|
+-----+----------+-----+



In [49]:
test_results_2.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 3179|
+----------+-----+



In [50]:
acc_eval = MulticlassClassificationEvaluator()
acc_2 = acc_eval.evaluate(test_results_2)
print("Accuracy of model at predicting: {}".format(acc_2))

Accuracy of model at predicting: 0.8145852430247102


In [51]:
## Higher accuracy but too bad result!!!

### Need to resample data

In [52]:
like_df = training.filter(col("label") == 0)
neutral_df = training.filter(col("label") == 1)
not_like_df = training.filter(col("label") == 2)
ratio_1 = int(like_df.count()/neutral_df.count())
ratio_2 = int(like_df.count()/not_like_df.count())
print("ratio like/neutral: {}".format(ratio_1))
print("ratio like/not_like: {}".format(ratio_2))

ratio like/neutral: 12
ratio like/not_like: 19


In [61]:
# resample neutral
a1 = range(ratio_1)
# duplicate the minority rows
oversampled_neutral_df = neutral_df.withColumn("dummy",
explode(array([lit(x) for x in a1]))).drop('dummy')
# combine both oversampled minority rows and previous majority rows 
combined_df = like_df.unionAll(oversampled_neutral_df)
combined_df.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(51949,[0],[1.025...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
|  0.0|(51949,[0,1,2,3,4...|
+-----+--------------------+
only showing top 10 rows



In [None]:
combined_df.groupBy("label").count().show()

In [None]:
# resample not_like
a2 = range(ratio_2)
# duplicate the minority rows
oversampled_notlike_df = not_like_df.withColumn("dummy", 
 explode(array([lit(x) for x in a2
 .drop('dummy')
# combine both oversampled minority rows and previous majority rows 
combined_df = combined_df.unionAll(oversampled_notlike_df)
combined_df.show(10)

In [None]:
combined_df.groupBy("label").count().show()

### Naive Bayer

In [None]:
predictor_4 = nb.fit(combined_df)

In [None]:
test_results_4 = predictor_4.transform(testing)

In [None]:
test_results_4.groupBy('label', 'prediction').count().show()

In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_4 = acc_eval.evaluate(test_results_4)
print("Accuracy of model at predicting: {}".format(acc_4))

### Logistic Regression

In [None]:
predictor_5 = lg.fit(combined_df)

In [None]:
test_results_5 = predictor_5.transform(testing)

In [None]:
test_results_5.groupBy('label', 'prediction').count().show()

In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_5 = acc_eval.evaluate(test_results_5)
print("Accuracy of model at predicting: {}".format(acc_5))

### Random Forest

In [None]:
predictor_3 = rf.fit(combined_df)

In [None]:
test_results_3 = predictor_3.transform(testing)

In [None]:
test_results_3.groupBy('label').count().show()

In [None]:
# Create a confusion matrix
test_results_3.groupBy('label', 'prediction').count().show()

In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_3 = acc_eval.evaluate(test_results_3)
print("Accuracy of model at predicting: {}".format(acc_3))

In [None]:
## Higher accuracy and better result. But not very good!