# **Import Libraries**

In [None]:
import pandas as pd

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover

# **Creating Spark Application**

In [None]:
# Build the SparkSession used by every later cell; getOrCreate() hands back an
# existing session when one is already available.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option -- confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("Sentimen Analisis di Spark")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Bare expression: renders the SparkSession object as this cell's output.
spark

# **Read Dataset**

In [None]:
# Load the tweet dataset from CSV: the first line supplies the column names
# and Spark infers each column's type from the data.
sentimen = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('tweets.csv')
)

# Preview the first 50 rows with full (untruncated) cell contents.
sentimen.show(n=50, truncate=False)

+------+---------+---------------+-----------------------------------------------------------+
|ItemID|Sentiment|SentimentSource|SentimentText                                              |
+------+---------+---------------+-----------------------------------------------------------+
|1038  |1        |Sentiment140   |that film is fantastic #brilliant                          |
|1804  |0        |Sentiment140   |this music is really bad #myband                           |
|1693  |0        |Sentiment140   |winter is terrible #thumbs-down                            |
|1477  |0        |Sentiment140   |this game is awful #nightmare                              |
|45    |1        |Sentiment140   |I love jam #loveit                                         |
|246   |0        |Sentiment140   |I dislike skiing #rubbish                                  |
|776   |1        |Sentiment140   |I like pop music #toptastic                                |
|1666  |1        |Sentiment140   |this game is awf

In [None]:
# Keep only the tweet text and the sentiment, cast to an integer column named
# "label".
# NOTE: this rebinds `sentimen`; re-running this cell alone fails because the
# projected frame no longer has a 'Sentiment' column -- re-run the load cell first.
label_column = col('Sentiment').cast('Int').alias('label')
sentimen = sentimen.select('SentimentText', label_column)

# Preview the first 5 rows without truncating the text.
sentimen.show(n=5, truncate=False)

+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |0    |
|winter is terrible #thumbs-down  |0    |
|this game is awful #nightmare    |0    |
|I love jam #loveit               |1    |
+---------------------------------+-----+
only showing top 5 rows



# **Separating Training and Testing Data**

In [None]:
# Randomly split the data into ~70% training and ~30% testing rows.
# A fixed seed is passed so the split -- and with it the row counts and the
# accuracy computed later -- is repeatable across kernel restarts.
# (Repeatability also assumes the source data is read in the same order and
# partitioning each run -- TODO confirm for the deployment environment.)
dataTerpisah = sentimen.randomSplit([0.7, 0.3], seed=42)
train = dataTerpisah[0]

# In the testing data, rename the label column from "label" to "trueLabel".
test = dataTerpisah[1].withColumnRenamed("label", "trueLabel")

# Count rows in each split (each count() triggers a Spark job).
train_rows = train.count()
test_row = test.count()

print("Jumlah Baris Data Training:",train_rows,"dan Jumlah Baris Data Testing:",test_row)

Jumlah Baris Data Training: 1372 dan Jumlah Baris Data Testing: 560


# **Tokenizer**

In [None]:
# Tokenizer turns each tweet in 'SentimentText' into a list of words stored in
# 'SentimentWords'.
tokenizer = Tokenizer(
    outputCol='SentimentWords',
    inputCol='SentimentText',
)

# Apply it to the training split and preview 10 rows.
tokenizedTrain = tokenizer.transform(train)
tokenizedTrain.show(n=10, truncate=False)

+----------------------------------+-----+----------------------------------------+
|SentimentText                     |label|SentimentWords                          |
+----------------------------------+-----+----------------------------------------+
|I adore cheese #bestever          |1    |[i, adore, cheese, #bestever]           |
|I adore cheese #brilliant         |1    |[i, adore, cheese, #brilliant]          |
|I adore cheese #favorite          |1    |[i, adore, cheese, #favorite]           |
|I adore cheese #loveit            |1    |[i, adore, cheese, #loveit]             |
|I adore cheese #thumbs-up         |1    |[i, adore, cheese, #thumbs-up]          |
|I adore classical music #favorite |1    |[i, adore, classical, music, #favorite] |
|I adore classical music #thumbs-up|1    |[i, adore, classical, music, #thumbs-up]|
|I adore classical music #toptastic|1    |[i, adore, classical, music, #toptastic]|
|I adore coffee #bestever          |1    |[i, adore, coffee, #bestever]     

# **Removing Unimportant Words**

In [None]:
# Stop-word removal reads the tokenizer's output column and writes the
# filtered word list to 'MeaningfulWords'.
token_column = tokenizer.getOutputCol()
swr = StopWordsRemover(inputCol=token_column, outputCol='MeaningfulWords')

# Apply it to the tokenized training data and preview 5 rows.
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(n=5, truncate=False)

+-------------------------+-----+------------------------------+---------------------------+
|SentimentText            |label|SentimentWords                |MeaningfulWords            |
+-------------------------+-----+------------------------------+---------------------------+
|I adore cheese #bestever |1    |[i, adore, cheese, #bestever] |[adore, cheese, #bestever] |
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|[adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |[adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |[adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|[adore, cheese, #thumbs-up]|
+-------------------------+-----+------------------------------+---------------------------+
only showing top 5 rows



# **Vectorization**

In [None]:
# Hash the remaining words of each tweet into a term-frequency vector stored
# in the 'features' column.
hashTF = HashingTF(
    inputCol=swr.getOutputCol(),
    outputCol='features',
)

# Vectorize the training data, keeping only the columns used for training
# and inspection.
hashedTrain = hashTF.transform(SwRemovedTrain)
numericTrain = hashedTrain.select('label', 'MeaningfulWords', 'features')

numericTrain.show(n=3, truncate=False)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[1689,91011,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #brilliant]|(262144,[1689,45361,100089],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[1689,100089,108624],[1.0,1.0,1.0])|
+-----+---------------------------+-------------------------------------------+
only showing top 3 rows



# **Training**

In [None]:
# Logistic-regression classifier: at most 10 iterations, regularisation
# parameter 0.01, trained on the 'features' vectors against 'label'.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    regParam=0.01,
    maxIter=10,
)

# Fit on the vectorized training data.
model = lr.fit(numericTrain)

print('Training Selesai!')

Training Selesai!


# **Data Testing**

In [None]:
# Run the test split through the same tokenizer, stop-word remover and hashing
# stages that were applied to the training data.
tokenizedTest = tokenizer.transform(test)
SwRemovedTest = swr.transform(tokenizedTest)
hashedTest = hashTF.transform(SwRemovedTest)

# Keep the true label, the cleaned words and the feature vector.
numericTest = hashedTest.select('trueLabel', 'MeaningfulWords', 'features')

# Show up to 500 rows without truncation.
numericTest.show(n=500, truncate=False)

+---------+-------------------------------------------+------------------------------------------------------------------+
|trueLabel|MeaningfulWords                            |features                                                          |
+---------+-------------------------------------------+------------------------------------------------------------------+
|1        |[adore, cheese, #toptastic]                |(262144,[1689,42010,100089],[1.0,1.0,1.0])                        |
|1        |[adore, classical, music, #bestever]       |(262144,[91011,100089,102383,131250],[1.0,1.0,1.0,1.0])           |
|1        |[adore, classical, music, #brilliant]      |(262144,[45361,100089,102383,131250],[1.0,1.0,1.0,1.0])           |
|1        |[adore, classical, music, #loveit]         |(262144,[100089,102383,131250,254974],[1.0,1.0,1.0,1.0])          |
|1        |[adore, coffee, #brilliant]                |(262144,[45361,100089,159212],[1.0,1.0,1.0])                      |
|1        |[ador

# **Predicting and Calculating Model Accuracy**

In [None]:
# Score the vectorized test data with the trained model.
prediksimentah = model.transform(numericTest)

# Keep only the columns needed to compare each prediction with the true label.
kolom_hasil = ['MeaningfulWords', 'prediction', 'trueLabel']
prediksifinal = prediksimentah.select(*kolom_hasil)

prediksifinal.show(n=50, truncate=False)

+-------------------------------------+----------+---------+
|MeaningfulWords                      |prediction|trueLabel|
+-------------------------------------+----------+---------+
|[adore, cheese, #toptastic]          |1.0       |1        |
|[adore, classical, music, #bestever] |1.0       |1        |
|[adore, classical, music, #brilliant]|1.0       |1        |
|[adore, classical, music, #loveit]   |1.0       |1        |
|[adore, coffee, #brilliant]          |1.0       |1        |
|[adore, coffee, #toptastic]          |1.0       |1        |
|[adore, jam, #brilliant]             |1.0       |1        |
|[adore, jam, #favorite]              |1.0       |1        |
|[adore, pop, music, #bestever]       |1.0       |1        |
|[adore, pop, music, #loveit]         |1.0       |1        |
|[adore, rock, music, #bestever]      |1.0       |1        |
|[adore, skiing, #brilliant]          |1.0       |1        |
|[adore, skiing, #favorite]           |1.0       |1        |
|[adore, skiing, #loveit

In [None]:
# Count the test rows whose predicted class equals the true label, and the
# total number of scored rows.
prediksibenar = prediksifinal.filter(prediksifinal['prediction'] == prediksifinal['trueLabel']).count()
totaldata = prediksifinal.count()

# Accuracy = correct / total. Guard the division: a random split can leave the
# test set empty, which would otherwise raise ZeroDivisionError.
akurasi = prediksibenar / totaldata if totaldata else float('nan')

print('Prediksi benar: ',prediksibenar,', Total Data: ',totaldata,', Akurasi: ',akurasi)

Prediksi benar:  552 , Total Data:  560 , Akurasi:  0.9857142857142858
