In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/29 12:08:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = spark.read.format("csv")\
    .option("header", True)\
    .option("inferSchema", True)\
    .load("/Users/d.c.deh./Documents/Visual Studio/Data science 2/Csv files/IMDB Dataset.csv")

In [3]:
# There are comma's in the reviews, so I have to clean this up a bit because currently it puts part of the reviews in the sentiment column
df.show(5)

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
+--------------------+--------------------+
only showing top 5 rows



In [4]:
# I take out the rows where the review is also partly in the sentiment column. There is probably a better way to do this without dropping these rows.
filtered_df = df.filter((df.sentiment == "positive") | (df.sentiment == "negative"))

In [5]:
filtered_df.show(5)

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|One of the other ...| positive|
|Basically there's...| negative|
|I sure would like...| positive|
|This show was an ...| negative|
|Encouraged by the...| negative|
+--------------------+---------+
only showing top 5 rows



# Reworking the review column

In [6]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

# Defining a tokenizer
tokenizer = Tokenizer(inputCol = "review", outputCol = "words")

In [7]:
# Applying the tokenizer to the dataframe
splitted_review = tokenizer.transform(filtered_df)

In [8]:
from pyspark.ml.feature import StopWordsRemover

# Removing the stop words
remover = StopWordsRemover(inputCol = "words", outputCol = "filtered review")
no_stop_words_df = remover.transform(splitted_review)

In [9]:
no_stop_words_df.show(5)

+--------------------+---------+--------------------+--------------------+
|              review|sentiment|               words|     filtered review|
+--------------------+---------+--------------------+--------------------+
|One of the other ...| positive|[one, of, the, ot...|[one, reviewers, ...|
|Basically there's...| negative|[basically, there...|[basically, famil...|
|I sure would like...| positive|[i, sure, would, ...|[sure, like, see,...|
|This show was an ...| negative|[this, show, was,...|[show, amazing,, ...|
|Encouraged by the...| negative|[encouraged, by, ...|[encouraged, posi...|
+--------------------+---------+--------------------+--------------------+
only showing top 5 rows



In [10]:
from pyspark.ml.feature import HashingTF

# Changing the text to numbers
hashing = HashingTF(inputCol = "filtered review", outputCol = "hashed review")
transformed_df = hashing.transform(no_stop_words_df)

In [11]:
transformed_df.show(5)

+--------------------+---------+--------------------+--------------------+--------------------+
|              review|sentiment|               words|     filtered review|       hashed review|
+--------------------+---------+--------------------+--------------------+--------------------+
|One of the other ...| positive|[one, of, the, ot...|[one, reviewers, ...|(262144,[3280,436...|
|Basically there's...| negative|[basically, there...|[basically, famil...|(262144,[6512,853...|
|I sure would like...| positive|[i, sure, would, ...|[sure, like, see,...|(262144,[1889,545...|
|This show was an ...| negative|[this, show, was,...|[show, amazing,, ...|(262144,[2437,825...|
|Encouraged by the...| negative|[encouraged, by, ...|[encouraged, posi...|(262144,[8538,149...|
+--------------------+---------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
from pyspark.sql.functions import when

# Changing the sentiment to binary values
binary_df = transformed_df.withColumn("rating", when (transformed_df.sentiment == "positive", 1).otherwise(0))

In [13]:
binary_df.show(5)

+--------------------+---------+--------------------+--------------------+--------------------+------+
|              review|sentiment|               words|     filtered review|       hashed review|rating|
+--------------------+---------+--------------------+--------------------+--------------------+------+
|One of the other ...| positive|[one, of, the, ot...|[one, reviewers, ...|(262144,[3280,436...|     1|
|Basically there's...| negative|[basically, there...|[basically, famil...|(262144,[6512,853...|     0|
|I sure would like...| positive|[i, sure, would, ...|[sure, like, see,...|(262144,[1889,545...|     1|
|This show was an ...| negative|[this, show, was,...|[show, amazing,, ...|(262144,[2437,825...|     0|
|Encouraged by the...| negative|[encouraged, by, ...|[encouraged, posi...|(262144,[8538,149...|     0|
+--------------------+---------+--------------------+--------------------+--------------------+------+
only showing top 5 rows



In [14]:
# Selecting the relevant columns
prepared_df = binary_df["rating", "hashed review"]

In [15]:
prepared_df.show(5)

+------+--------------------+
|rating|       hashed review|
+------+--------------------+
|     1|(262144,[3280,436...|
|     0|(262144,[6512,853...|
|     1|(262144,[1889,545...|
|     0|(262144,[2437,825...|
|     0|(262144,[8538,149...|
+------+--------------------+
only showing top 5 rows



# Creating the model

In [16]:
train, test = prepared_df.randomSplit([0.7, 0.3])

In [17]:
from pyspark.ml.classification import LogisticRegression

# Defining the model
lr = LogisticRegression(featuresCol = "hashed review", labelCol = "rating", maxIter=10)

lrModel = lr.fit(train)

24/05/29 12:08:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/29 12:08:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

24/05/29 12:08:54 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [18]:
predictions = lrModel.transform(test)
predictions.take(1)

24/05/29 12:08:59 WARN DAGScheduler: Broadcasting large task binary with size 1602.9 KiB


[Row(rating=0, hashed review=SparseVector(262144, {19: 1.0, 3926: 1.0, 5297: 1.0, 7629: 1.0, 15775: 1.0, 15965: 1.0, 17252: 1.0, 17291: 1.0, 20236: 1.0, 21534: 1.0, 25629: 2.0, 26844: 1.0, 34343: 1.0, 41095: 1.0, 43890: 1.0, 51007: 3.0, 51471: 1.0, 51678: 1.0, 52366: 1.0, 52471: 1.0, 53651: 1.0, 58074: 1.0, 58227: 1.0, 68821: 2.0, 70152: 1.0, 81916: 1.0, 84025: 1.0, 84738: 1.0, 87405: 1.0, 87419: 1.0, 93757: 1.0, 93803: 1.0, 95513: 1.0, 95685: 1.0, 96984: 1.0, 98431: 1.0, 100694: 1.0, 100869: 1.0, 105448: 1.0, 113432: 1.0, 115002: 1.0, 120228: 1.0, 123874: 1.0, 124786: 1.0, 133243: 1.0, 134032: 1.0, 134685: 1.0, 137733: 1.0, 141037: 1.0, 144803: 1.0, 146027: 1.0, 148880: 1.0, 150892: 1.0, 153078: 1.0, 153169: 1.0, 158102: 1.0, 158661: 1.0, 164964: 1.0, 166377: 1.0, 171368: 1.0, 175786: 1.0, 185450: 1.0, 186635: 1.0, 186925: 1.0, 188835: 1.0, 191174: 1.0, 191458: 1.0, 191515: 1.0, 192450: 1.0, 197438: 1.0, 202721: 1.0, 203895: 1.0, 207842: 1.0, 211154: 1.0, 214676: 1.0, 219879: 1.0, 221

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Defining an evaluator for the model
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="rating", metricName="accuracy")

# Using the evaluator to compute the accuracy
accuracy = evaluator.evaluate(predictions)

print("Accuracy:", accuracy)

24/05/29 12:09:22 WARN DAGScheduler: Broadcasting large task binary with size 1601.4 KiB


Accuracy: 0.8494936561517867


                                                                                

#### Other models won't work, ALS requires 3 columns and the decision tree or random forest gives an error because the calculations are too large for the processor