In [88]:
#importing modules
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [89]:
#starting a pyspark session
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [90]:
comments_train = spark.read.options(delimiter=';').csv('train data product reviews.csv', inferSchema=True, header=True)
comments_train.show(truncate=True, n=5)


+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0|"Reference Yes, L...|
|    0|NO!!!!!!I will gi...|
|    0|Neat Features/Unc...|
|    1|"Progressive-Unde...|
|    0|"The theif who st...|
+-----+--------------------+
only showing top 5 rows



In [110]:
comments_train.count(), comments_train.select('label').distinct().count()

(51979, 2)

In 'label' column we have 0's and 1's only. Let's rearrange this data frame as *df_train*.

In [107]:
df_train = comments_train.select('text', 'label')
df_train =df_train.withColumn('label', df_train.label.cast(IntegerType()))
df_train.show(truncate=True, n=20)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|"Reference Yes, L...|    0|
|NO!!!!!!I will gi...|    0|
|Neat Features/Unc...|    0|
|"Progressive-Unde...|    1|
|"The theif who st...|    0|
|This movie was te...|    0|
|I love wood: I ha...|    1|
|Lets me use my ow...|    1|
|Good for study pu...|    0|
|Great for Beadwea...|    1|
|gifts for people:...|    0|
|Very good read!: ...|    1|
|Truly Wonderful: ...|    1|
|"Africa de mi cor...|    1|
|good, but i need ...|    1|
|Turntable with Au...|    0|
|"Very poorly writ...|    0|
|Tired, immature, ...|    0|
|Husband Gerald Br...|    0|
|more of a clash t...|    0|
+--------------------+-----+
only showing top 20 rows



Now we are going to maintain a *df_test* similar to *df_train*.

In [92]:
comments_test = spark.read.options(delimiter=';').csv('test data product reviews.csv', inferSchema=True, header=True)
comments_test.show(truncate=True, n=5)

+--------------------+
|          label,text|
+--------------------+
|0,Not worth the m...|
|"0,""I changed my...|
|"0,""How quickly ...|
|0,DOA Did Not Pow...|
|"0,""support: I o...|
+--------------------+
only showing top 5 rows



In [93]:
comments_test.count()

11703

We are going to use *regex* to describe patters to obtain a clean data frame with columns text and label.

In [108]:
regex_pattern = r'"*([01])(.+)'
comments_test = comments_test.withColumn('text', regexp_extract(col('label,text'), regex_pattern, 2))\
                 .withColumn('label', regexp_extract(col('label,text'), regex_pattern, 1))
df_test = comments_test.select('text', 'label')
df_test =df_test.withColumn('label', df_test.label.cast(IntegerType()))
df_test.show(truncate=True, n=20)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|,Not worth the mo...|    0|
|,""I changed my m...|    0|
|,""How quickly we...|    0|
|,DOA Did Not Powe...|    0|
|,""support: I ord...|    0|
|,""Rewriting this...|    1|
|,""Canon CLI-8 4-...|    1|
|,needs parts: My ...|    0|
|,""Awesome: Does ...|    1|
|,Yeah for Dairy F...|    1|
|,""Good book if y...|    0|
|,""Good way to ke...|    1|
|,""The Best Red S...|    1|
|,""Piece of Crap:...|    0|
|,""SO EASY!!!!: T...|    1|
|,Very Useful Info...|    1|
|,""great product!...|    1|
|,""Breathtakingly...|    0|
|,""I am thankful ...|    1|
|,Very good: Great...|    1|
+--------------------+-----+
only showing top 20 rows



In [109]:
df_test.count(), df_test.select('label').distinct().count()

(11703, 2)

Now that we have both *df_train* and *df_test* in our targetted composition, we can progress with the **sentiment analysis**.

**Data Preparation (Training Data)**

**Tokenizer**

In [112]:
tokenizer = Tokenizer(inputCol='text', outputCol='sentiment_words')
tokenized_train = tokenizer.transform(df_train)
tokenized_train.show(truncate=True, n=10)

+--------------------+-----+--------------------+
|                text|label|     sentiment_words|
+--------------------+-----+--------------------+
|"Reference Yes, L...|    0|["reference, yes,...|
|NO!!!!!!I will gi...|    0|[no!!!!!!i, will,...|
|Neat Features/Unc...|    0|[neat, features/u...|
|"Progressive-Unde...|    1|["progressive-und...|
|"The theif who st...|    0|["the, theif, who...|
|This movie was te...|    0|[this, movie, was...|
|I love wood: I ha...|    1|[i, love, wood:, ...|
|Lets me use my ow...|    1|[lets, me, use, m...|
|Good for study pu...|    0|[good, for, study...|
|Great for Beadwea...|    1|[great, for, bead...|
+--------------------+-----+--------------------+
only showing top 10 rows



**Removing Stop Words**

In [116]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='decisive_words')
swr_free_train = swr.transform(tokenized_train)
swr_free_train.show(truncate=True, n=10)

+--------------------+-----+--------------------+--------------------+
|                text|label|     sentiment_words|      decisive_words|
+--------------------+-----+--------------------+--------------------+
|"Reference Yes, L...|    0|["reference, yes,...|["reference, yes,...|
|NO!!!!!!I will gi...|    0|[no!!!!!!i, will,...|[no!!!!!!i, give,...|
|Neat Features/Unc...|    0|[neat, features/u...|[neat, features/u...|
|"Progressive-Unde...|    1|["progressive-und...|["progressive-und...|
|"The theif who st...|    0|["the, theif, who...|["the, theif, sto...|
|This movie was te...|    0|[this, movie, was...|[movie, terrible!...|
|I love wood: I ha...|    1|[i, love, wood:, ...|[love, wood:, in-...|
|Lets me use my ow...|    1|[lets, me, use, m...|[lets, use, coffe...|
|Good for study pu...|    0|[good, for, study...|[good, study, pur...|
|Great for Beadwea...|    1|[great, for, bead...|[great, beadweavi...|
+--------------------+-----+--------------------+--------------------+
only s