In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import numpy as np

# Initialize Spark Session
spark = SparkSession.builder.appName("ClassifierExamples").getOrCreate()

# Create the DataFrame
# data = spark.createDataFrame([
#     (0, "red", "SUV", 12, 20.0, 60, 5, "yes"),
#     (1, "blue", "sedan", 9, 30.0, 70, 10, "no"),
#     (2, "green", "truck", 15, 25.0, 80, 3, "yes"),
#     (3, "yellow", "SUV", 20, 22.0, 65, 6, "no"),
#     (4, "white", "sedan", 5, 35.0, 75, 12, "yes"),
#     (5, "black", "truck", 10, 28.0, 85, 7, "no"),
#     # (6, None, "sedan", 8, None, 70, 9, "yes"),  # Example with null values
#     # (7, "blue", None, 5, 30.0, None, 4, "yes")
# ], ["id", "color", "type", "hour", "milesperhour", "age", "label"])

## ============================================================================
## color=label のデータ
## ============================================================================
data = spark.createDataFrame([
    (0, "red", "SUV", 12, 20.0, 60, 5, "yes"),
    (1, "blue", "sedan", 9, 30.0, 70, 10, "no"),
    (2, "red", "truck", 15, 25.0, 80, 3, "yes"),
    (3, "blue", "SUV", 20, 22.0, 65, 6, "no"),
    (4, "red", "sedan", 5, 35.0, 75, 12, "yes"),
    (5, "blue", "truck", 10, 28.0, 85, 7, "no"),
    # (6, None, "sedan", 8, None, 70, 9, "yes"),  # Example with null values
    # (7, "blue", None, 5, 30.0, None, 4, "yes")
], ["id", "color", "type", "hour", "milesperhour", "age", "label"])


# Drop rows with any null values
data = data.dropna()

# Debugging: Print schema and check for nulls in specific columns
data.printSchema()
data.select("color", "type", "hour").show()

# Fill nulls in categorical columns with 'missing'
data = data.withColumn("color", coalesce(data["color"], lit("missing")))
data = data.withColumn("type", coalesce(data["type"], lit("missing")))

# Convert string labels to numeric
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
data = label_indexer.fit(data).transform(data)

# Split the data into training and test sets (80% training, 20% test)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# String Indexing for features
indexers = [
    StringIndexer(inputCol="color", outputCol="color_index"),
    StringIndexer(inputCol="type", outputCol="type_index"),
    StringIndexer(inputCol="hour", outputCol="hour_index")
]

# One-Hot Encoding for Logistic Regression (not needed for tree-based models)
encoder = OneHotEncoder(
    inputCols=["color_index", "type_index", "hour_index"],
    outputCols=["color_vec", "type_vec", "hour_vec"]
)

# Assembling Features for Logistic Regression and Naive Bayes
assembler_lr_nb = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec", "milesperhour", "age"],
    outputCol="features"
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age"],
    outputCol="features"
)

# Create and Fit the Pipeline for Logistic Regression and Naive Bayes
pipeline_lr_nb = Pipeline(stages=indexers + [encoder, assembler_lr_nb])
model_lr_nb = pipeline_lr_nb.fit(train_data)
transformed_train_data_lr_nb = model_lr_nb.transform(train_data)
transformed_test_data_lr_nb = model_lr_nb.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

# # Training the Logistic Regression Model
# lr = LogisticRegression(featuresCol="features", labelCol="label_index")
# lr_model = lr.fit(transformed_train_data_lr_nb)

# # Get model coefficients and intercept for Logistic Regression
# coefficients = lr_model.coefficients
# intercept = lr_model.intercept
# print(f"Coefficients: {coefficients}")
# print(f"Intercept: {intercept}")

# # Get model summary to extract training metrics for Logistic Regression
# training_summary = lr_model.summary
# print(f"Training Accuracy: {training_summary.accuracy}")
# print(f"Training Precision: {training_summary.precisionByLabel}")
# print(f"Training Recall: {training_summary.recallByLabel}")
# print(f"Training F1 Score: {training_summary.fMeasureByLabel()}")

# Training the Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label_index")
rf_model = rf.fit(transformed_train_data_tree)

# # Training the GBT Classifier
# gbt = GBTClassifier(featuresCol="features", labelCol="label_index")
# gbt_model = gbt.fit(transformed_train_data_tree)

# # Training the Naive Bayes Classifier
# nb = NaiveBayes(featuresCol="features", labelCol="label_index")
# nb_model = nb.fit(transformed_train_data_lr_nb)

# Initialize evaluators for all models
evaluator_accuracy = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="f1")

# # Evaluating the Logistic Regression Model
# lr_predictions = lr_model.transform(transformed_test_data_lr_nb)
# lr_accuracy = evaluator_accuracy.evaluate(lr_predictions)
# lr_precision = evaluator_precision.evaluate(lr_predictions)
# lr_recall = evaluator_recall.evaluate(lr_predictions)
# lr_f1 = evaluator_f1.evaluate(lr_predictions)
# print(f"Logistic Regression Accuracy: {lr_accuracy}")
# print(f"Logistic Regression Precision: {lr_precision}")
# print(f"Logistic Regression Recall: {lr_recall}")
# print(f"Logistic Regression F1 Score: {lr_f1}")

# Evaluating the Random Forest Classifier
rf_predictions = rf_model.transform(transformed_test_data_tree)
rf_accuracy = evaluator_accuracy.evaluate(rf_predictions)
rf_precision = evaluator_precision.evaluate(rf_predictions)
rf_recall = evaluator_recall.evaluate(rf_predictions)
rf_f1 = evaluator_f1.evaluate(rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")
print(f"Random Forest Precision: {rf_precision}")
print(f"Random Forest Recall: {rf_recall}")
print(f"Random Forest F1 Score: {rf_f1}")

# Get feature importances for Random Forest
rf_feature_importances = rf_model.featureImportances.toArray()
features_importances_rf = [(assembler_tree.getInputCols()[i], float(rf_feature_importances[i])) for i in range(len(rf_feature_importances))]
importances_df_rf = pd.DataFrame(features_importances_rf, columns=["Feature", "Importance"]).sort_values(by='Importance', ascending=False)
print(importances_df_rf)

# # Evaluating the GBT Classifier
# gbt_predictions = gbt_model.transform(transformed_test_data_tree)
# gbt_accuracy = evaluator_accuracy.evaluate(gbt_predictions)
# gbt_precision = evaluator_precision.evaluate(gbt_predictions)
# gbt_recall = evaluator_recall.evaluate(gbt_predictions)
# gbt_f1 = evaluator_f1.evaluate(gbt_predictions)
# print(f"GBT Classifier Accuracy: {gbt_accuracy}")
# print(f"GBT Classifier Precision: {gbt_precision}")
# print(f"GBT Classifier Recall: {gbt_recall}")
# print(f"GBT Classifier F1 Score: {gbt_f1}")

# # Get feature importances for GBT
# gbt_feature_importances = gbt_model.featureImportances.toArray()
# features_importances_gbt = [(assembler_tree.getInputCols()[i], float(gbt_feature_importances[i])) for i in range(len(gbt_feature_importances))]
# importances_df_gbt = pd.DataFrame(features_importances_gbt, columns=["Feature", "Importance"]).sort_values(by='Importance', ascending=False)
# print(importances_df_gbt)

# # Evaluating the Naive Bayes Classifier
# nb_predictions = nb_model.transform(transformed_test_data_lr_nb)
# nb_accuracy = evaluator_accuracy.evaluate(nb_predictions)
# nb_precision = evaluator_precision.evaluate(nb_predictions)
# nb_recall = evaluator_recall.evaluate(nb_predictions)
# nb_f1 = evaluator_f1.evaluate(nb_predictions)
# print(f"Naive Bayes Accuracy: {nb_accuracy}")
# print(f"Naive Bayes Precision: {nb_precision}")
# print(f"Naive Bayes Recall: {nb_recall}")
# print(f"Naive Bayes F1 Score: {nb_f1}")

# # Naive Bayes Model Parameters
# print(f"Naive Bayes Model Parameters: {nb_model.explainParams()}")

# # Extract and print class prior probabilities and conditional probabilities for Naive Bayes
# class_prior_probs = np.exp(nb_model.pi.toArray())
# conditional_probs = np.exp(nb_model.theta.toArray())

# # Get the number of classes and features
# num_classes, num

24/07/17 00:02:13 WARN Utils: Your hostname, thamaMBP.local resolves to a loopback address: 127.0.0.1; using 172.20.10.2 instead (on interface en0)
24/07/17 00:02:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/17 00:02:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/17 00:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/07/17 00:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/07/17 00:02:14 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


root
 |-- id: long (nullable = true)
 |-- color: string (nullable = true)
 |-- type: string (nullable = true)
 |-- hour: long (nullable = true)
 |-- milesperhour: double (nullable = true)
 |-- age: long (nullable = true)
 |-- label: long (nullable = true)
 |-- _8: string (nullable = true)



                                                                                

+-----+-----+----+
|color| type|hour|
+-----+-----+----+
|  red|  SUV|  12|
| blue|sedan|   9|
|  red|truck|  15|
| blue|  SUV|  20|
|  red|sedan|   5|
| blue|truck|  10|
+-----+-----+----+



24/07/17 00:02:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Py4JJavaError: An error occurred while calling o688.coefficients.
: org.apache.spark.SparkException: Multinomial models contain a matrix of coefficients, use coefficientMatrix instead.
	at org.apache.spark.ml.classification.LogisticRegressionModel.coefficients(LogisticRegression.scala:1085)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
