In [163]:
#importing modules
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [164]:
#starting a pyspark session
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:52868)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/py4j/java_gateway.py", line 1115, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:52868)

In [90]:
comments_train = spark.read.options(delimiter=';').csv('train data product reviews.csv', inferSchema=True, header=True)
comments_train.show(truncate=True, n=5)


+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0|"Reference Yes, L...|
|    0|NO!!!!!!I will gi...|
|    0|Neat Features/Unc...|
|    1|"Progressive-Unde...|
|    0|"The theif who st...|
+-----+--------------------+
only showing top 5 rows



In [110]:
comments_train.count(), comments_train.select('label').distinct().count()

(51979, 2)

In 'label' column we have 0's and 1's only. Let's rearrange this data frame as *df_train*.

In [107]:
df_train = comments_train.select('text', 'label')
df_train =df_train.withColumn('label', df_train.label.cast(IntegerType()))
df_train.show(truncate=True, n=20)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|"Reference Yes, L...|    0|
|NO!!!!!!I will gi...|    0|
|Neat Features/Unc...|    0|
|"Progressive-Unde...|    1|
|"The theif who st...|    0|
|This movie was te...|    0|
|I love wood: I ha...|    1|
|Lets me use my ow...|    1|
|Good for study pu...|    0|
|Great for Beadwea...|    1|
|gifts for people:...|    0|
|Very good read!: ...|    1|
|Truly Wonderful: ...|    1|
|"Africa de mi cor...|    1|
|good, but i need ...|    1|
|Turntable with Au...|    0|
|"Very poorly writ...|    0|
|Tired, immature, ...|    0|
|Husband Gerald Br...|    0|
|more of a clash t...|    0|
+--------------------+-----+
only showing top 20 rows



Now we are going to maintain a *df_test* similar to *df_train*.

In [92]:
comments_test = spark.read.options(delimiter=';').csv('test data product reviews.csv', inferSchema=True, header=True)
comments_test.show(truncate=True, n=5)

+--------------------+
|          label,text|
+--------------------+
|0,Not worth the m...|
|"0,""I changed my...|
|"0,""How quickly ...|
|0,DOA Did Not Pow...|
|"0,""support: I o...|
+--------------------+
only showing top 5 rows



In [93]:
comments_test.count()

11703

We are going to use *regex* to describe patters to obtain a clean data frame with columns text and label.

In [108]:
regex_pattern = r'"*([01])(.+)'
comments_test = comments_test.withColumn('text', regexp_extract(col('label,text'), regex_pattern, 2))\
                 .withColumn('label', regexp_extract(col('label,text'), regex_pattern, 1))
df_test = comments_test.select('text', 'label')
df_test =df_test.withColumn('label', df_test.label.cast(IntegerType()))
df_test.show(truncate=True, n=20)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|,Not worth the mo...|    0|
|,""I changed my m...|    0|
|,""How quickly we...|    0|
|,DOA Did Not Powe...|    0|
|,""support: I ord...|    0|
|,""Rewriting this...|    1|
|,""Canon CLI-8 4-...|    1|
|,needs parts: My ...|    0|
|,""Awesome: Does ...|    1|
|,Yeah for Dairy F...|    1|
|,""Good book if y...|    0|
|,""Good way to ke...|    1|
|,""The Best Red S...|    1|
|,""Piece of Crap:...|    0|
|,""SO EASY!!!!: T...|    1|
|,Very Useful Info...|    1|
|,""great product!...|    1|
|,""Breathtakingly...|    0|
|,""I am thankful ...|    1|
|,Very good: Great...|    1|
+--------------------+-----+
only showing top 20 rows



In [109]:
df_test.count(), df_test.select('label').distinct().count()

(11703, 2)

Now that we have both *df_train* and *df_test* in our targetted composition, we can progress with the **sentiment analysis**.

**Data Pre-processing (Training Data)**

**Tokenizer**

In [112]:
tokenizer = Tokenizer(inputCol='text', outputCol='sentiment_words')
tokenized_train = tokenizer.transform(df_train)
tokenized_train.show(truncate=True, n=10)

+--------------------+-----+--------------------+
|                text|label|     sentiment_words|
+--------------------+-----+--------------------+
|"Reference Yes, L...|    0|["reference, yes,...|
|NO!!!!!!I will gi...|    0|[no!!!!!!i, will,...|
|Neat Features/Unc...|    0|[neat, features/u...|
|"Progressive-Unde...|    1|["progressive-und...|
|"The theif who st...|    0|["the, theif, who...|
|This movie was te...|    0|[this, movie, was...|
|I love wood: I ha...|    1|[i, love, wood:, ...|
|Lets me use my ow...|    1|[lets, me, use, m...|
|Good for study pu...|    0|[good, for, study...|
|Great for Beadwea...|    1|[great, for, bead...|
+--------------------+-----+--------------------+
only showing top 10 rows



**Removing Stop Words**

In [116]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='decisive_words')
swr_free_train = swr.transform(tokenized_train)
swr_free_train.show(truncate=True, n=10)

+--------------------+-----+--------------------+--------------------+
|                text|label|     sentiment_words|      decisive_words|
+--------------------+-----+--------------------+--------------------+
|"Reference Yes, L...|    0|["reference, yes,...|["reference, yes,...|
|NO!!!!!!I will gi...|    0|[no!!!!!!i, will,...|[no!!!!!!i, give,...|
|Neat Features/Unc...|    0|[neat, features/u...|[neat, features/u...|
|"Progressive-Unde...|    1|["progressive-und...|["progressive-und...|
|"The theif who st...|    0|["the, theif, who...|["the, theif, sto...|
|This movie was te...|    0|[this, movie, was...|[movie, terrible!...|
|I love wood: I ha...|    1|[i, love, wood:, ...|[love, wood:, in-...|
|Lets me use my ow...|    1|[lets, me, use, m...|[lets, use, coffe...|
|Good for study pu...|    0|[good, for, study...|[good, study, pur...|
|Great for Beadwea...|    1|[great, for, bead...|[great, beadweavi...|
+--------------------+-----+--------------------+--------------------+
only s

**Hashing**

In [119]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="raw_features")
numeric_train = hashingTF.transform(swr_free_train).select('decisive_words','raw_features', 'label')
numeric_train.show(truncate=True, n=10)


+--------------------+--------------------+-----+
|      decisive_words|        raw_features|label|
+--------------------+--------------------+-----+
|["reference, yes,...|(262144,[11946,14...|    0|
|[no!!!!!!i, give,...|(262144,[6102,137...|    0|
|[neat, features/u...|(262144,[2306,278...|    0|
|["progressive-und...|(262144,[1148,576...|    1|
|["the, theif, sto...|(262144,[1546,188...|    0|
|[movie, terrible!...|(262144,[1696,424...|    0|
|[love, wood:, in-...|(262144,[12524,41...|    1|
|[lets, use, coffe...|(262144,[1354,538...|    1|
|[good, study, pur...|(262144,[4578,757...|    0|
|[great, beadweavi...|(262144,[2574,649...|    1|
+--------------------+--------------------+-----+
only showing top 10 rows



# Logistic Regression**

In [120]:
logreg = LogisticRegression(labelCol='label', featuresCol='raw_features', maxIter = 10, regParam=.01)
model_lr = logreg.fit(numeric_train)
print('model_lr is trained')

model_lr is trained


In [121]:
tokenized_test = tokenizer.transform(df_test)
swr_free_test = swr.transform(tokenized_test)
numeric_test = hashingTF.transform(swr_free_test).select('decisive_words','raw_features', 'label')
numeric_test.show(truncate=True, n=10)

+--------------------+--------------------+-----+
|      decisive_words|        raw_features|label|
+--------------------+--------------------+-----+
|[,not, worth, mon...|(262144,[7777,132...|    0|
|[,""i, changed, m...|(262144,[1578,230...|    0|
|[,""how, quickly,...|(262144,[2448,861...|    0|
|[,doa, power, box...|(262144,[35590,38...|    0|
|[,""support:, ord...|(262144,[4907,997...|    0|
|[,""rewriting, ma...|(262144,[2325,748...|    1|
|[,""canon, cli-8,...|(262144,[77073,84...|    1|
|[,needs, parts:, ...|(262144,[1546,538...|    0|
|[,""awesome:, sup...|(262144,[11422,44...|    1|
|[,yeah, dairy, fr...|(262144,[5729,762...|    1|
+--------------------+--------------------+-----+
only showing top 10 rows



In [134]:
predict_logreg = model_lr.transform(numeric_test)
predicted_logreg_df = predict_logreg.select(
    "decisive_words", "prediction", "label")
predicted_logreg_df = predicted_logreg_df.withColumn('prediction', predicted_logreg_df.prediction.cast(IntegerType()))
predicted_logreg_df.show(truncate = True, n=10)


+--------------------+----------+-----+
|      decisive_words|prediction|label|
+--------------------+----------+-----+
|[,not, worth, mon...|         0|    0|
|[,""i, changed, m...|         0|    0|
|[,""how, quickly,...|         0|    0|
|[,doa, power, box...|         0|    0|
|[,""support:, ord...|         0|    0|
|[,""rewriting, ma...|         1|    1|
|[,""canon, cli-8,...|         1|    1|
|[,needs, parts:, ...|         0|    0|
|[,""awesome:, sup...|         1|    1|
|[,yeah, dairy, fr...|         0|    1|
+--------------------+----------+-----+
only showing top 10 rows



In [148]:
def confusion_matrix(df,prediction,label):
    """
    Generates a manual confusion matrix in a pyspark data frame, which is assembled according to the classification prediction. 
    df = Data Frame with prediction and label values
    prediction = string, column name of the prediction values
    label = string, column name of the label values
    """
    correctly_predicted = df.filter(df.prediction == df.label).count()
    false_positive = df.filter((df.prediction == 1) & (df.label == 0)).count()
    false_negative = df.filter((df.prediction == 0) & (df.label == 1)).count()
    true_positive = df.filter((df.prediction == 1) & (df.label == 1)).count()
    true_negative = df.filter((df.prediction == 0) & (df.label == 0)).count()
    
    accuracy = correctly_predicted/df.count()
    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    f1_score = 2 * ((precision * recall)/(precision + recall))

    
    print(f'Correctly Predicted (True Positive): {correctly_predicted} which is %{correctly_predicted/df.count()}')
    print(f'Type-I Error (False Positive): {false_positive} which is %{false_positive/df.count()}')
    print(f'Type-II Error (False Negative): {false_negative} which is %{false_negative_logreg/predicted_logreg_df.count()}')
    print(f'Accuracy: %{accuracy}')
    print(f'Precision: %{precision}')
    print(f'Sensitivity(Recall): %{recall}')
    print(f'F1 Score: %{f1_score}')



In [149]:
confusion_matrix(predicted_logreg_df, 'prediction','label')

Correctly Predicted (True Positive): 8827 which is %0.7542510467401521
Type-I Error (False Positive): 1941 which is %0.1658549089976929
Type-II Error (False Negative): 935 which is %0.079894044262155
Accuracy: %0.7542510467401521
Precision: %0.7156043956043956
Sensitivity(Recall): %0.8393194706994329
F1 Score: %0.7725403353369187


# Random Forest

In [159]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [160]:
rf = RandomForestClassifier(featuresCol = 'raw_features', labelCol = 'label')
model_rf = rf.fit(numeric_train)


ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:52868)
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/py4j/java_gateway.py", line 1115, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:52868)

In [157]:
predict_rf = model_rf.transform(numeric_test)
predicted_rf_df = predict_rf.select(
    "decisive_words", "prediction", "label")
predicted_rf_df = predicted_rf_df.withColumn('prediction', predicted_rf_df.prediction.cast(IntegerType()))
predicted_rf_df.show(truncate = True, n=10)


NameError: name 'model_rf' is not defined

In [None]:
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")