<a href="https://colab.research.google.com/github/nickname8888/pyspark-prac/blob/main/udfs_prac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark nltk



In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
spark = SparkSession.builder.appName("Analyzer").getOrCreate()

In [72]:
# using the sentiment analysis dataset

df = spark.read.csv("/content/IMDB Dataset.csv", header=True, inferSchema=True)

In [73]:
df.show(10)

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
|"Probably my all-...| but that only ma...|
|I sure would like...|            positive|
|This show was an ...|            negative|
|Encouraged by the...|            negative|
|If you like origi...|            positive|
+--------------------+--------------------+
only showing top 10 rows



In [74]:
df.printSchema()

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [75]:
from pyspark.sql.functions import col, sum

def check_nulls(df):
  null_values = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
  null_values.show()

In [25]:
check_nulls(df)

+------+---------+
|review|sentiment|
+------+---------+
|     0|        7|
+------+---------+



In [76]:
df.filter(col("sentiment").isNull()).show()

+--------------------+---------+
|              review|sentiment|
+--------------------+---------+
|".... may seem fa...|     NULL|
|"And that comes f...|     NULL|
|"Sorry everyone,,...|     NULL|
|"With a special t...|     NULL|
|"I've seen a lot ...|     NULL|
|"I happened to se...|     NULL|
|"seriously what t...|     NULL|
+--------------------+---------+



In [77]:
df_clean = df.dropna()
df_clean.show(5)

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
+--------------------+--------------------+
only showing top 5 rows



In [78]:
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove special characters
    text = text.strip()  # remove extra spaces
    return text

clean_text_udf = udf(clean_text, StringType())

In [79]:
df = df.withColumn("cleaned_review", clean_text_udf(col("review")))
df.show(5)

+--------------------+--------------------+--------------------+
|              review|           sentiment|      cleaned_review|
+--------------------+--------------------+--------------------+
|One of the other ...|            positive|one of the other ...|
|"A wonderful litt...| not only is it w...|a wonderful littl...|
|"I thought this w...| but spirited you...|i thought this wa...|
|Basically there's...|            negative|basically theres ...|
|"Petter Mattei's ...| power and succes...|petter matteis lo...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [80]:
# count the number of occurences of sentiments: positive, negative, something else

positive_count = df.filter(col("sentiment") == "positive").count()
negative_count = df.filter(col("sentiment") == "negative").count()
other_count = df.filter((col("sentiment") != "positive") & (col("sentiment") != "negative")).count()

print("Positive count:", positive_count)
print("Negative count:", negative_count)
print("Other count:", other_count)

Positive count: 14897
Negative count: 13792
Other count: 21304


In [81]:
# dropping all other counts

df = df.filter(col("sentiment").isin(["positive", "negative"]))
df.show(5)

+--------------------+---------+--------------------+
|              review|sentiment|      cleaned_review|
+--------------------+---------+--------------------+
|One of the other ...| positive|one of the other ...|
|Basically there's...| negative|basically theres ...|
|I sure would like...| positive|i sure would like...|
|This show was an ...| negative|this show was an ...|
|Encouraged by the...| negative|encouraged by the...|
+--------------------+---------+--------------------+
only showing top 5 rows



In [82]:
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [83]:
# sentiment score function

def get_sentiment(text):
    score = sia.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

In [84]:
# convert to udf

get_sentiment_udf = udf(get_sentiment, StringType())

df = df.withColumn("predicted_sentiment", get_sentiment_udf(col("cleaned_review")))

df.select("review", "sentiment", "predicted_sentiment").show(1, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [85]:
df.printSchema()

root
 |-- review: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- cleaned_review: string (nullable = true)
 |-- predicted_sentiment: string (nullable = true)



In [86]:
df.show(5)

+--------------------+---------+--------------------+-------------------+
|              review|sentiment|      cleaned_review|predicted_sentiment|
+--------------------+---------+--------------------+-------------------+
|One of the other ...| positive|one of the other ...|           negative|
|Basically there's...| negative|basically theres ...|           negative|
|I sure would like...| positive|i sure would like...|           positive|
|This show was an ...| negative|this show was an ...|           positive|
|Encouraged by the...| negative|encouraged by the...|           positive|
+--------------------+---------+--------------------+-------------------+
only showing top 5 rows



In [87]:
# replacing string values for numeric values

df = df.replace({"positive":"1", "negative":"0"}, subset=["sentiment"]).withColumn("sentiment", col("sentiment").cast("int"))
df = df.replace({"positive":"1", "negative":"0"}, subset=["predicted_sentiment"]).withColumn("predicted_sentiment", col("predicted_sentiment").cast("int"))

In [88]:
df.show(5)

+--------------------+---------+--------------------+-------------------+
|              review|sentiment|      cleaned_review|predicted_sentiment|
+--------------------+---------+--------------------+-------------------+
|One of the other ...|        1|one of the other ...|                  0|
|Basically there's...|        0|basically theres ...|                  0|
|I sure would like...|        1|i sure would like...|                  1|
|This show was an ...|        0|this show was an ...|                  1|
|Encouraged by the...|        0|encouraged by the...|                  1|
+--------------------+---------+--------------------+-------------------+
only showing top 5 rows



In [89]:
# getting vader model accuracy

from pyspark.sql.functions import col, avg

accuracy = df.withColumn("correct", (col("sentiment") == col("predicted_sentiment")).cast("int")) \
  .select(avg("correct")).collect()[0][0]


Vader Model Accuracy: 0.7130214026303727


In [90]:
print("Vader Model Accuracy:", accuracy * 100)

Vader Model Accuracy: 71.30214026303727
