In [None]:
import pandas as pd
import numpy as np

from pyspark.sql.functions import coalesce, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

In [None]:
# Hadoop 3.4.0 artifacts did not work in this environment (kept for reference):
# packages = [
#     'org.apache.hadoop:hadoop-aws:3.4.0',
#     'org.apache.hadoop:hadoop-client-api:3.4.0',
#     'org.apache.hadoop:hadoop-client-runtime:3.4.0',
# ]

# Hadoop 3.3.4 artifacts work; they are pulled in through spark.jars.packages below.
packages = [
    'org.apache.hadoop:hadoop-aws:3.3.4',
    'org.apache.hadoop:hadoop-client-api:3.3.4',
    'org.apache.hadoop:hadoop-client-runtime:3.3.4',
]

conf = (
    SparkConf()
    .setAppName("MyApp")
    .set("spark.driver.memory", "8g")
    .set("spark.executor.memory", "8g")
    .set('spark.jars.packages', ','.join(packages))
)

# getOrCreate() reuses a session that is already running, so re-executing this
# cell on a live kernel no longer fails the way a second SparkContext(conf=conf)
# does. On a fresh kernel the session is created from `conf` exactly as before.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Hadoop-level settings needed to read/write s3a:// paths.
hadoop_config = sc._jsc.hadoopConfiguration()

hadoop_config.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
hadoop_config.set('com.amazonaws.services.s3.enableV4', 'true')

# Optional: explicit temporary credentials. Read the values from the environment
# (or a secrets manager) rather than hard-coding them in the notebook.
# hadoop_config.set('fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider')
# hadoop_config.set('fs.s3a.access.key', AWS_ACCESS_KEY_ID)
# hadoop_config.set('fs.s3a.secret.key', AWS_SECRET_ACCESS_KEY)
# hadoop_config.set('fs.s3a.session.token', AWS_SESSION_TOKEN)

In [None]:
# Create the raw demo DataFrame (kept under its own name so the indexed frame
# below does not overwrite its own input).
raw_data = spark.createDataFrame(
    [
        (0, "red", "SUV", 12, 20.0, 60, 1, "2024-07-01", "u4pruydqqvj"),
        (1, "red", "sedan", 9, 30.0, 70, 2, "2024-07-02", "u4pruydqqvk"),
        (2, "red", "truck", 15, 25.0, 80, 3, "2024-07-01", "u4pruydqqvj"),
        (3, "blue", "SUV", 20, 22.0, 65, 1, "2024-07-02", "u4pruydqqvk"),
        (4, "blue", "sedan", 5, 35.0, 75, 1, "2024-07-01", "u4pruydqqvj"),
        (5, "blue", "truck", 10, 28.0, 85, 3, "2024-07-02", "u4pruydqqvk"),
    ],
    ["id", "color", "type", "hour", "milesperhour", "age", "label", "date", "geohash"],
)

# Map each distinct label value to a numeric class index (label_index).
# This is fitted on the full data set so every class gets an index before the split.
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
data = label_indexer.fit(raw_data).transform(raw_data)

# Split the data into training and test sets (80% training, 20% test)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Columns treated as categorical, and the numeric columns passed through as-is.
categorical_cols = ["color", "type", "hour"]
numeric_cols = ["milesperhour", "age"]
index_cols = [f"{name}_index" for name in categorical_cols]
vec_cols = [f"{name}_vec" for name in categorical_cols]

# String Indexing for features.
# handleInvalid="keep" sends categories that were not seen while fitting on
# train_data to an extra index instead of raising when test_data is evaluated
# (every "hour" value in this demo data is unique, so test rows always contain
# an unseen hour).
indexers = [
    StringIndexer(inputCol=name, outputCol=index_name, handleInvalid="keep")
    for name, index_name in zip(categorical_cols, index_cols)
]

# One-Hot Encoding for Logistic Regression (not needed for tree-based models)
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

# Assembling Features for Logistic Regression and Naive Bayes
assembler_lr_nb = VectorAssembler(
    inputCols=vec_cols + numeric_cols,
    outputCol="features"
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=index_cols + numeric_cols,
    outputCol="features"
)

# Create and Fit the Pipeline for Logistic Regression and Naive Bayes
pipeline_lr_nb = Pipeline(stages=indexers + [encoder, assembler_lr_nb])
model_lr_nb = pipeline_lr_nb.fit(train_data)
transformed_train_data_lr_nb = model_lr_nb.transform(train_data)
transformed_test_data_lr_nb = model_lr_nb.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

In [None]:
def get_features(df, col="features", verbose=True):
    """Return the feature names of an assembled vector column, in vector order.

    Reads the ML attribute metadata stored on the schema field of ``col``
    (``metadata['ml_attr']['attrs']``). That metadata groups the attributes by
    attribute type; the groups are flattened into one list and ordered by each
    attribute's ``idx``.

    Args:
        df: DataFrame processed by a feature pipeline, whose ``schema[col]``
            carries ``ml_attr`` metadata.
        col: Name of the vector column to inspect. Defaults to ``"features"``.
        verbose: When true (the default), print one ``idx name`` line per
            feature before returning.

    Returns:
        list[str]: Feature names sorted by their index in the vector.

    Raises:
        KeyError: If the column's metadata has no ``ml_attr`` / ``attrs`` entry.
    """
    attr_groups = df.schema[col].metadata['ml_attr']['attrs']

    # Flatten the per-type groups and sort once; the sorted list serves both
    # the optional printout and the returned names.
    ordered_attrs = sorted(
        (attr for group in attr_groups.values() for attr in group),
        key=lambda attr: attr['idx'],
    )

    if verbose:
        for attr in ordered_attrs:
            print(attr['idx'], attr['name'])

    return [attr['name'] for attr in ordered_attrs]

In [None]:
transformed_train_data_lr_nb.printSchema()

In [None]:
transformed_train_data_lr_nb.show()

In [None]:
get_features(transformed_train_data_lr_nb)

# Logistic regression

## Save and load (Data)

In [None]:
# Destination for the transformed LR/NB training data.
# Local alternative (uncomment to write to the local filesystem instead):
# data_path = "data/transformed_train_data_lr_nb/"

# S3 destination; the s3a:// scheme relies on the hadoop-aws packages and
# fs.s3a.impl setting configured when the Spark session was created.
data_path = "s3a://test-thama-misc-20210612/20240717-sparkml/data/transformed_train_data_lr_nb/"

In [None]:
# Write the transformed LR/NB training data under data_path, partitioned by
# date and geohash, replacing any existing output.
# NOTE(review): no format is given, so the session's default data source is
# used (parquet unless spark.sql.sources.default was changed) — confirm.
transformed_train_data_lr_nb.write.partitionBy("date", "geohash").mode('overwrite').save(data_path)

In [None]:
transformed_train_data_lr_nb_loaded = spark.read.load(data_path)

In [None]:
transformed_train_data_lr_nb_loaded.printSchema()

In [None]:
get_features(transformed_train_data_lr_nb_loaded)

## Save and load (Pipeline)

In [None]:
type(pipeline_lr_nb)

In [None]:
type(model_lr_nb)

In [None]:
# save to local
pipeline_model_path = "pipelines/model_lr_nb"
model_lr_nb.write().overwrite().save(pipeline_model_path)

In [None]:
# Save the fitted LR/NB pipeline model to S3, overwriting any previous copy.
# This rebinds pipeline_model_path, so a later PipelineModel.load(pipeline_model_path)
# reads the S3 copy rather than the local one when both save cells have run.
pipeline_model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/pipelines/model_lr_nb"
model_lr_nb.write().overwrite().save(pipeline_model_path)

In [None]:
loaded_model = PipelineModel.load(pipeline_model_path)

In [None]:
# Re-create the transformed training data by applying the reloaded pipeline model.
# Note: this rebinds transformed_train_data_lr_nb_loaded, which earlier held the
# data read back from data_path.
transformed_train_data_lr_nb_loaded = loaded_model.transform(train_data)

In [None]:
transformed_train_data_lr_nb_loaded.printSchema()

In [None]:
get_features(transformed_train_data_lr_nb_loaded)

## Model

In [None]:
# Fit a logistic regression on the one-hot encoded training features,
# predicting the indexed label column.
lr = LogisticRegression(
    labelCol="label_index",
    featuresCol="features",
)
lr_model = lr.fit(transformed_train_data_lr_nb)

In [None]:
# This will probably raise an error, so run the next cell instead.
# NOTE(review): presumably because the fitted model is multinomial (the label
# has three classes), for which `coefficients` / `intercept` are not available — confirm.

# # Get model coefficients and intercept for Logistic Regression
# coefficients = lr_model.coefficients
# intercept = lr_model.intercept
# print(f"Coefficients: {coefficients}")
# print(f"Intercept: {intercept}")

In [None]:
# Read the per-class coefficient matrix and intercept vector of the fitted
# logistic regression, then print both.
coefficients, intercept = lr_model.coefficientMatrix, lr_model.interceptVector
for caption, value in (("Coefficients", coefficients), ("Intercept", intercept)):
    print(f"{caption}: {value}")

In [None]:
# shape = num_classes x num_features
# toArray() already yields a NumPy array, so its shape can be read directly.
coefficients.toArray().shape

In [None]:
# One row per class, one column per feature (named in vector order).
coef_values = coefficients.toArray()
coef_df = pd.DataFrame(coef_values, columns=get_features(transformed_train_data_lr_nb))

In [None]:
coef_df

In [None]:
# Mapping between each original label and the index assigned by the label indexer
label_pairs = data.toPandas()[['label', 'label_index']]
label_pairs.drop_duplicates().head()

# Random forest

In [None]:
transformed_train_data_tree.printSchema()

In [None]:
get_features(transformed_train_data_tree)

## Save and load (Data)

In [None]:
# Destination for the transformed tree-model training data.
# Local alternative (uncomment to write to the local filesystem instead):
# data_path = "data/transformed_train_data_tree/"

# S3 destination; this rebinds data_path, which earlier pointed at the LR/NB output.
data_path = "s3a://test-thama-misc-20210612/20240717-sparkml/data/transformed_train_data_tree/"

In [None]:
# Write the transformed tree-model training data under data_path, partitioned
# by date and geohash, replacing any existing output.
# NOTE(review): no format is given, so the session's default data source is
# used (parquet unless spark.sql.sources.default was changed) — confirm.
transformed_train_data_tree.write.partitionBy("date", "geohash").mode('overwrite').save(data_path)

In [None]:
transformed_train_data_tree_loaded = spark.read.load(data_path)

In [None]:
transformed_train_data_tree_loaded.printSchema()

In [None]:
get_features(transformed_train_data_tree_loaded)

## Save and load (Pipeline)

In [None]:
# save to local
pipeline_model_path = "pipelines/model_tree"
model_tree.write().overwrite().save(pipeline_model_path)

In [None]:
# Save the fitted tree pipeline model to S3, overwriting any previous copy.
# This rebinds pipeline_model_path, so a later PipelineModel.load(pipeline_model_path)
# reads the S3 copy rather than the local one when both save cells have run.
pipeline_model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/pipelines/model_tree"
model_tree.write().overwrite().save(pipeline_model_path)

In [None]:
loaded_model = PipelineModel.load(pipeline_model_path)

In [None]:
# Re-create the transformed training data by applying the reloaded pipeline model.
# Note: this rebinds transformed_train_data_tree_loaded, which earlier held the
# data read back from data_path.
transformed_train_data_tree_loaded = loaded_model.transform(train_data)

In [None]:
transformed_train_data_tree_loaded.printSchema()

In [None]:
get_features(transformed_train_data_tree_loaded)

## Model

In [None]:

# # Get model summary to extract training metrics for Logistic Regression
# training_summary = lr_model.summary
# print(f"Training Accuracy: {training_summary.accuracy}")
# print(f"Training Precision: {training_summary.precisionByLabel}")
# print(f"Training Recall: {training_summary.recallByLabel}")
# print(f"Training F1 Score: {training_summary.fMeasureByLabel()}")

# Training the Random Forest Classifier.
# Random forests sample rows and features while growing trees, so a fixed seed
# (the same value used for the train/test split) keeps the fitted model and
# its feature importances reproducible across kernel restarts.
rf = RandomForestClassifier(featuresCol="features", labelCol="label_index", seed=1234)
rf_model = rf.fit(transformed_train_data_tree)

# # Training the GBT Classifier
# gbt = GBTClassifier(featuresCol="features", labelCol="label_index")
# gbt_model = gbt.fit(transformed_train_data_tree)

# # Training the Naive Bayes Classifier
# nb = NaiveBayes(featuresCol="features", labelCol="label_index")
# nb_model = nb.fit(transformed_train_data_lr_nb)

# # Initialize evaluators for all models
# evaluator_accuracy = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="accuracy")
# evaluator_precision = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedPrecision")
# evaluator_recall = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedRecall")
# evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="f1")

# # Evaluating the Logistic Regression Model
# lr_predictions = lr_model.transform(transformed_test_data_lr_nb)
# lr_accuracy = evaluator_accuracy.evaluate(lr_predictions)
# lr_precision = evaluator_precision.evaluate(lr_predictions)
# lr_recall = evaluator_recall.evaluate(lr_predictions)
# lr_f1 = evaluator_f1.evaluate(lr_predictions)
# print(f"Logistic Regression Accuracy: {lr_accuracy}")
# print(f"Logistic Regression Precision: {lr_precision}")
# print(f"Logistic Regression Recall: {lr_recall}")
# print(f"Logistic Regression F1 Score: {lr_f1}")

# # Evaluating the Random Forest Classifier
# rf_predictions = rf_model.transform(transformed_test_data_tree)
# rf_accuracy = evaluator_accuracy.evaluate(rf_predictions)
# rf_precision = evaluator_precision.evaluate(rf_predictions)
# rf_recall = evaluator_recall.evaluate(rf_predictions)
# rf_f1 = evaluator_f1.evaluate(rf_predictions)
# print(f"Random Forest Accuracy: {rf_accuracy}")
# print(f"Random Forest Precision: {rf_precision}")
# print(f"Random Forest Recall: {rf_recall}")
# print(f"Random Forest F1 Score: {rf_f1}")

# Pair each Random Forest importance score with the assembler input column at
# the same position, then list the features from most to least important.
rf_feature_importances = rf_model.featureImportances.toArray()
tree_input_cols = assembler_tree.getInputCols()
features_importances_rf = [
    (tree_input_cols[position], float(score))
    for position, score in enumerate(rf_feature_importances)
]
importances_df_rf = (
    pd.DataFrame(features_importances_rf, columns=["Feature", "Importance"])
    .sort_values(by='Importance', ascending=False)
)
print(importances_df_rf)

# # Evaluating the GBT Classifier
# gbt_predictions = gbt_model.transform(transformed_test_data_tree)
# gbt_accuracy = evaluator_accuracy.evaluate(gbt_predictions)
# gbt_precision = evaluator_precision.evaluate(gbt_predictions)
# gbt_recall = evaluator_recall.evaluate(gbt_predictions)
# gbt_f1 = evaluator_f1.evaluate(gbt_predictions)
# print(f"GBT Classifier Accuracy: {gbt_accuracy}")
# print(f"GBT Classifier Precision: {gbt_precision}")
# print(f"GBT Classifier Recall: {gbt_recall}")
# print(f"GBT Classifier F1 Score: {gbt_f1}")

# # Get feature importances for GBT
# gbt_feature_importances = gbt_model.featureImportances.toArray()
# features_importances_gbt = [(assembler_tree.getInputCols()[i], float(gbt_feature_importances[i])) for i in range(len(gbt_feature_importances))]
# importances_df_gbt = pd.DataFrame(features_importances_gbt, columns=["Feature", "Importance"]).sort_values(by='Importance', ascending=False)
# print(importances_df_gbt)

# # Evaluating the Naive Bayes Classifier
# nb_predictions = nb_model.transform(transformed_test_data_lr_nb)
# nb_accuracy = evaluator_accuracy.evaluate(nb_predictions)
# nb_precision = evaluator_precision.evaluate(nb_predictions)
# nb_recall = evaluator_recall.evaluate(nb_predictions)
# nb_f1 = evaluator_f1.evaluate(nb_predictions)
# print(f"Naive Bayes Accuracy: {nb_accuracy}")
# print(f"Naive Bayes Precision: {nb_precision}")
# print(f"Naive Bayes Recall: {nb_recall}")
# print(f"Naive Bayes F1 Score: {nb_f1}")

# # Naive Bayes Model Parameters
# print(f"Naive Bayes Model Parameters: {nb_model.explainParams()}")

# # Extract and print class prior probabilities and conditional probabilities for Naive Bayes
# class_prior_probs = np.exp(nb_model.pi.toArray())
# conditional_probs = np.exp(nb_model.theta.toArray())

# # Get the number of classes and features
# num_classes, num