In [None]:
# prompt: generate code for spark navie bays with /content/NB.csv data set
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("NaiveBayesExample")
    .getOrCreate()
)

In [None]:
# Read the CSV (first row is the header, column types are inferred)
# and print its rows
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/NB.csv")
)
data.show()

+--------+-----------+--------+-----+----+
| Outlook|Temperature|Humidity|Windy|Play|
+--------+-----------+--------+-----+----+
|   Sunny|        Hot|    High|   No|  No|
|   Sunny|        Hot|    High|  Yes|  No|
|Overcast|        Hot|    High|   No| Yes|
|   Rainy|       Mild|    High|   No| Yes|
|   Rainy|       Cool|  Normal|   No| Yes|
|   Rainy|       Cool|  Normal|  Yes|  No|
|Overcast|       Cool|  Normal|  Yes| Yes|
|   Sunny|       Mild|    High|   No|  No|
|   Sunny|       Cool|  Normal|   No| Yes|
|   Rainy|       Mild|  Normal|   No| Yes|
|   Sunny|       Mild|  Normal|  Yes| Yes|
|Overcast|       Mild|    High|  Yes| Yes|
|Overcast|        Hot|  Normal|   No| Yes|
|   Rainy|       Mild|    High|  Yes|  No|
+--------+-----------+--------+-----+----+



In [None]:
# Prepare the data: encode the four categorical feature columns as numeric
# indices and pack them into a single "features" vector column.
# The target column is "Play"; its indexer is only constructed in this cell
# and is fitted/applied later.

# Categorical feature columns to be indexed, and the names of their outputs
categoricalCols = ["Outlook", "Temperature", "Humidity", "Windy"]
indexedCols = [name + "_index" for name in categoricalCols]

# Make the cell safe to re-run: `data` is overwritten below, and StringIndexer
# refuses to write to an output column that already exists. drop() silently
# ignores names that are not present, so this is a no-op on the first run.
data = data.drop(*indexedCols, "features")

# Create and apply a StringIndexer for each categorical column
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in categoricalCols
]
for indexer in indexers:
    data = indexer.fit(data).transform(data)

# Assemble the indexed columns into one feature vector
assembler = VectorAssembler(inputCols=indexedCols, outputCol="features")
data = assembler.transform(data)

# Indexer that turns the string label "Play" ("Yes"/"No") into the numeric
# column "Play_index". It is left unfitted here.
indexer = StringIndexer(inputCol="Play", outputCol="Play_index")

data.show()

+--------+-----------+--------+-----+----+-------------+-----------------+--------------+-----------+-----------------+
| Outlook|Temperature|Humidity|Windy|Play|Outlook_index|Temperature_index|Humidity_index|Windy_index|         features|
+--------+-----------+--------+-----+----+-------------+-----------------+--------------+-----------+-----------------+
|   Sunny|        Hot|    High|   No|  No|          1.0|              2.0|           0.0|        0.0|[1.0,2.0,0.0,0.0]|
|   Sunny|        Hot|    High|  Yes|  No|          1.0|              2.0|           0.0|        1.0|[1.0,2.0,0.0,1.0]|
|Overcast|        Hot|    High|   No| Yes|          2.0|              2.0|           0.0|        0.0|[2.0,2.0,0.0,0.0]|
|   Rainy|       Mild|    High|   No| Yes|          0.0|              0.0|           0.0|        0.0|        (4,[],[])|
|   Rainy|       Cool|  Normal|   No| Yes|          0.0|              1.0|           1.0|        0.0|[0.0,1.0,1.0,0.0]|
|   Rainy|       Cool|  Normal|  Yes|  N

In [None]:
# Split the data into training (70%) and testing (30%) sets.
# A fixed seed keeps the split -- and therefore the reported accuracy --
# repeatable across reruns of the notebook.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Fit the label indexer ONCE and reuse the fitted model for both splits.
# StringIndexer assigns indices by label frequency, so fitting it separately
# on each split can map "Yes"/"No" to different indices in train and test,
# which silently corrupts the evaluation.
# It is fitted on the full `data` frame because, with so few rows, a random
# split is not guaranteed to contain every label value.
labelIndexerModel = indexer.fit(data)
trainingData = labelIndexerModel.transform(trainingData)
testData = labelIndexerModel.transform(testData)

In [None]:
# Configure the Naive Bayes estimator: it reads the assembled "features"
# vector and learns to predict the indexed label "Play_index".
# NOTE(review): the multinomial model interprets feature values as counts,
# whereas these features are category indices -- confirm this is intended
# (one-hot encoding the indexed columns may be more appropriate).
nb = NaiveBayes(
    modelType="multinomial",
    smoothing=1.0,
    labelCol="Play_index",
    featuresCol="features",
)

In [None]:
# Fit the Naive Bayes estimator on the training split
model = nb.fit(dataset=trainingData)

In [None]:
# Score the held-out test split with the fitted model
predictions = model.transform(dataset=testData)

In [None]:
# Measure accuracy by comparing the "prediction" column against the
# indexed label column "Play_index"
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Play_index",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.5
