In [13]:
from pyspark.sql import SparkSession

# Start Spark session
spark = SparkSession.builder.appName("NaiveBayes_Example").getOrCreate()

workspace_default_storage_account = "projectgstoragedfb938a3e"
workspace_default_container = "azureml-blobstore-becc8696-e562-432e-af12-8a5e3e1f9b0f"
workspace_wasbs_base_url = f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/"

output_path = f"{workspace_wasbs_base_url}nlp_sentiment_sample_submissions.parquet"
# Read the Parquet file back into a dataframe
df_read_back = spark.read.parquet(output_path)

# Show first 5 rows
#df_read_back.show(5)
#df_read_back.printSchema()

from pyspark.sql import functions as F
df_flat = df_read_back.withColumn("sentiment_result", F.explode(F.col("sentiment"))) \
            .select("text", "sentiment_result.result")

# Show the results
#df_flat.show(truncate=False)

# Create a 100% predictable dataset
df = spark.createDataFrame([
    ("I feel great", "pos", "cancer"),
    ("This is amazing", "pos", "non-cancer"),
    ("Not good at all", "neg", "cancer"),
    ("Terrible experience", "neg", "non-cancer")
], ["text", "sentiment", "cancer"])

print(df.show())

#df = df_flat
df = df.withColumnRenamed("result", "sentiment")
df.show()



StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 36, 18, Finished, Available, Finished)

+-------------------+---------+----------+
|               text|sentiment|    cancer|
+-------------------+---------+----------+
|       I feel great|      pos|    cancer|
|    This is amazing|      pos|non-cancer|
|    Not good at all|      neg|    cancer|
|Terrible experience|      neg|non-cancer|
+-------------------+---------+----------+

None
+-------------------+---------+----------+
|               text|sentiment|    cancer|
+-------------------+---------+----------+
|       I feel great|      pos|    cancer|
|    This is amazing|      pos|non-cancer|
|    Not good at all|      neg|    cancer|
|Terrible experience|      neg|non-cancer|
+-------------------+---------+----------+



In [14]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Tokenizer and stop words removal
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Vectorization and TF-IDF
count_vectorizer = CountVectorizer(inputCol="filtered_tokens", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")

# Label encoding
indexer = StringIndexer(inputCol="sentiment", outputCol="label")

# Build pipeline
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, count_vectorizer, idf, indexer])

# Fit and transform
processed_df = pipeline.fit(df).transform(df)

# Split the data (use all data for training for simplicity)
train_data, test_data = processed_df.randomSplit([0.8, 0.2], seed=42)

# Train Naive Bayes model
nb = NaiveBayes(featuresCol="features", labelCol="label", modelType="multinomial")
nb_model = nb.fit(train_data)

# Predict
predictions = nb_model.transform(test_data)

# Evaluate
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Test Accuracy: {accuracy:.2f}")

StatementMeta(ba5e360d-f184-47a0-9859-76b5031b79e3, 36, 19, Finished, Available, Finished)

Test Accuracy: 1.00
