In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pandas as pd
import numpy as np

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("ClassifierExamples")
    .getOrCreate()
)

## ============================================================================
## Dataset in which color does NOT determine the label
## ============================================================================
# Column names, in the same order as the values of each row tuple below.
vehicle_columns = ["id", "color", "type", "hour", "milesperhour", "age", "label"]

# One tuple per vehicle observation.
vehicle_rows = [
    (0, "red", "SUV", 12, 20.0, 60, 1),
    (1, "red", "sedan", 9, 30.0, 70, 2),
    (2, "red", "truck", 15, 25.0, 80, 3),
    (3, "blue", "SUV", 20, 22.0, 65, 1),
    (4, "blue", "sedan", 5, 35.0, 75, 1),
    (5, "blue", "truck", 10, 28.0, 85, 3),
]

# Build the Spark DataFrame; column types are inferred from the Python values.
data = spark.createDataFrame(vehicle_rows, vehicle_columns)

## ============================================================================
## Alternative dataset in which color determines the label (red -> 1, blue -> 2)
## ============================================================================
# Create the DataFrame
# data = spark.createDataFrame([
#     (0, "red", "SUV", 12, 20.0, 60, 1),
#     (1, "blue", "sedan", 9, 30.0, 70, 2),
#     (2, "red", "truck", 15, 25.0, 80, 1),
#     (3, "blue", "SUV", 20, 22.0, 65, 2),
#     (4, "red", "sedan", 5, 35.0, 75, 1),
#     (5, "blue", "truck", 10, 28.0, 85, 2),
# ], ["id", "color", "type", "hour", "milesperhour", "age", "label"])


# Debugging: print the schema and eyeball a few columns BEFORE any cleaning,
# so that missing values (if any) are still visible here.
data.printSchema()
data.select("color", "type", "hour").show()

# Fill nulls in the categorical columns with an explicit 'missing' level.
# This has to happen before dropna(): if rows were dropped first, no nulls
# would be left to fill and rows that only lack a category would be lost.
data = data.withColumn("color", coalesce(data["color"], lit("missing")))
data = data.withColumn("type", coalesce(data["type"], lit("missing")))

# Drop rows that still contain a null in any remaining column
# (id, numeric features, label) -- those are not imputed here.
data = data.dropna()

# Map the raw label values to consecutive numeric indices in `label_index`
# (StringIndexer's default ordering puts the most frequent label at 0.0).
label_indexer = StringIndexer(inputCol="label", outputCol="label_index")
data = label_indexer.fit(data).transform(data)

# Split the data into training and test sets (80% training, 20% test).
# The seed is fixed so the split is reproducible.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Columns that are indexed as categories. `hour` is numeric in the raw data
# but is run through StringIndexer like the string columns.
categorical_cols = ["color", "type", "hour"]
index_cols = [f"{name}_index" for name in categorical_cols]
vec_cols = [f"{name}_vec" for name in categorical_cols]

# Continuous columns that go into the feature vector unchanged.
numeric_cols = ["milesperhour", "age"]

# One StringIndexer per categorical column: <col> -> <col>_index.
# NOTE(review): handleInvalid is left at its default; categories that were not
# present when the indexer was fitted may fail at transform time -- confirm
# before evaluating on test_data.
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in zip(categorical_cols, index_cols)
]

# One-hot encoding: <col>_index -> <col>_vec.
# Used by the Logistic Regression / Naive Bayes features only; the tree-based
# features use the raw indices.
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

# Feature vector for Logistic Regression and Naive Bayes: one-hot vectors + numeric columns.
assembler_lr_nb = VectorAssembler(
    inputCols=vec_cols + numeric_cols,
    outputCol="features",
)

# Feature vector for tree-based models: category indices + numeric columns.
assembler_tree = VectorAssembler(
    inputCols=index_cols + numeric_cols,
    outputCol="features",
)

# Preprocessing for Logistic Regression / Naive Bayes:
# string indexing -> one-hot encoding -> vector assembly.
# The pipeline is fitted on the training split only and then applied to both splits.
stages_lr_nb = [*indexers, encoder, assembler_lr_nb]
pipeline_lr_nb = Pipeline(stages=stages_lr_nb)
model_lr_nb = pipeline_lr_nb.fit(train_data)
transformed_train_data_lr_nb, transformed_test_data_lr_nb = (
    model_lr_nb.transform(split) for split in (train_data, test_data)
)

# Preprocessing for tree-based models:
# string indexing -> vector assembly (no one-hot step).
stages_tree = [*indexers, assembler_tree]
pipeline_tree = Pipeline(stages=stages_tree)
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree, transformed_test_data_tree = (
    model_tree.transform(split) for split in (train_data, test_data)
)

root
 |-- id: long (nullable = true)
 |-- color: string (nullable = true)
 |-- type: string (nullable = true)
 |-- hour: long (nullable = true)
 |-- milesperhour: double (nullable = true)
 |-- age: long (nullable = true)
 |-- label: long (nullable = true)

+-----+-----+----+
|color| type|hour|
+-----+-----+----+
|  red|  SUV|  12|
|  red|sedan|   9|
|  red|truck|  15|
| blue|  SUV|  20|
| blue|sedan|   5|
| blue|truck|  10|
+-----+-----+----+



# Logistic regression

In [52]:
# Fit a logistic regression on the one-hot encoded training features,
# using the indexed label column as the target.
lr = LogisticRegression(labelCol="label_index", featuresCol="features")
lr_model = lr.fit(dataset=transformed_train_data_lr_nb)

In [53]:
# Extract the feature names, in `features` vector order, from a PySpark
# DataFrame that has been processed by the pipeline.
def get_features(df, verbose=True):
    """Return the names of the entries of the ``features`` column, ordered by index.

    The names are read from the ML attribute metadata attached to the
    ``features`` field of ``df.schema`` (``metadata['ml_attr']['attrs']``).
    That metadata groups attributes by type; every attribute carries an
    ``idx`` (its position in the vector) and a ``name``.

    Parameters
    ----------
    df : object
        Anything exposing ``df.schema['features'].metadata`` with the layout
        described above (here: a DataFrame transformed by the pipeline).
    verbose : bool, default True
        When True, print one ``<idx> <name>`` line per feature, as the
        original implementation always did. Pass False to only get the list.

    Returns
    -------
    list
        Feature names sorted by their ``idx``.

    Raises
    ------
    KeyError
        Propagated from the metadata lookups if an expected key is missing.
    """
    attr_groups = df.schema['features'].metadata['ml_attr']['attrs']

    # Flatten the per-type groups, then order once by vector position so the
    # printed listing and the returned list share the same ordering.
    ordered_attrs = sorted(
        (attr for group in attr_groups.values() for attr in group),
        key=lambda attr: attr['idx'],
    )

    if verbose:
        for attr in ordered_attrs:
            print(attr['idx'], attr['name'])

    return [attr['name'] for attr in ordered_attrs]

In [54]:
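# This will probably raise an error (`coefficients` / `intercept` are for binomial models,
# and this model has 3 classes), so run the next cell instead.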

# # Get model coefficients and intercept for Logistic Regression
# coefficients = lr_model.coefficients
# intercept = lr_model.intercept
# print(f"Coefficients: {coefficients}")
# print(f"Intercept: {intercept}")

In [55]:
# Read the fitted parameters of the logistic regression in their multiclass
# form: a coefficient matrix and an intercept vector.
coefficients, intercept = lr_model.coefficientMatrix, lr_model.interceptVector

# Print both so they can be inspected in the cell output.
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

Coefficients: DenseMatrix([[ 2.19021458e+00,  2.04738613e+00, -8.34716427e+00,
              -7.03965340e+00,  2.19577114e+00, -5.48109301e+00,
               8.75308059e-01,  3.46509751e-01, -2.59519343e-01],
             [-2.16080475e+00, -2.17598618e+00,  8.22498412e+00,
               6.98902753e+00, -2.10724153e+00,  5.34844865e+00,
              -1.15673774e+00, -3.23254873e-01,  2.60108872e-01],
             [-2.94098344e-02,  1.28600044e-01,  1.22180156e-01,
               5.06258718e-02, -8.85296105e-02,  1.32644362e-01,
               2.81429676e-01, -2.32548774e-02, -5.89529440e-04]])
Intercept: [18.255972365913973,-12.515273657784677,-5.740698708129297]


In [56]:
# shape = num_classes x num_features
# toArray() already returns a NumPy array, so no list round-trip is needed.
coefficients.toArray().shape

(3, 9)

In [57]:
# Coefficient table: one row per class, one column per feature.
# toArray() already returns a NumPy array, so it is passed to pandas directly
# instead of being converted to a list and back. The column labels come from
# the metadata of the assembled `features` column, in vector order.
coef_df = pd.DataFrame(
    coefficients.toArray(),
    columns=get_features(transformed_train_data_lr_nb)
)

0 color_vec_blue
1 type_vec_SUV
2 type_vec_truck
3 hour_vec_10
4 hour_vec_12
5 hour_vec_15
6 hour_vec_20
7 milesperhour
8 age


In [58]:
# Display the coefficient table (rows: classes, columns: feature names).
# NOTE(review): rows are presumably in label_index order -- confirm against the label / label_index mapping.
coef_df

Unnamed: 0,color_vec_blue,type_vec_SUV,type_vec_truck,hour_vec_10,hour_vec_12,hour_vec_15,hour_vec_20,milesperhour,age
0,2.190215,2.047386,-8.347164,-7.039653,2.195771,-5.481093,0.875308,0.34651,-0.259519
1,-2.160805,-2.175986,8.224984,6.989028,-2.107242,5.348449,-1.156738,-0.323255,0.260109
2,-0.02941,0.1286,0.12218,0.050626,-0.08853,0.132644,0.28143,-0.023255,-0.00059


In [67]:
# Correspondence between the original label values and their label_index
(
    data.toPandas()
    .loc[:, ['label', 'label_index']]
    .drop_duplicates()
    .head()
)

Unnamed: 0,label,label_index
0,1,0.0
1,2,2.0
2,3,1.0


# Random forest

In [59]:

# # Get model summary to extract training metrics for Logistic Regression
# training_summary = lr_model.summary
# print(f"Training Accuracy: {training_summary.accuracy}")
# print(f"Training Precision: {training_summary.precisionByLabel}")
# print(f"Training Recall: {training_summary.recallByLabel}")
# print(f"Training F1 Score: {training_summary.fMeasureByLabel()}")

# Training the Random Forest Classifier
# The forest is randomized, so the seed is pinned (same value as the
# train/test split) to keep the fitted model and its feature importances
# reproducible across runs.
rf = RandomForestClassifier(featuresCol="features", labelCol="label_index", seed=1234)
rf_model = rf.fit(transformed_train_data_tree)

# # Training the GBT Classifier
# gbt = GBTClassifier(featuresCol="features", labelCol="label_index")
# gbt_model = gbt.fit(transformed_train_data_tree)

# # Training the Naive Bayes Classifier
# nb = NaiveBayes(featuresCol="features", labelCol="label_index")
# nb_model = nb.fit(transformed_train_data_lr_nb)

# # Initialize evaluators for all models
# evaluator_accuracy = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="accuracy")
# evaluator_precision = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedPrecision")
# evaluator_recall = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedRecall")
# evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="f1")

# # Evaluating the Logistic Regression Model
# lr_predictions = lr_model.transform(transformed_test_data_lr_nb)
# lr_accuracy = evaluator_accuracy.evaluate(lr_predictions)
# lr_precision = evaluator_precision.evaluate(lr_predictions)
# lr_recall = evaluator_recall.evaluate(lr_predictions)
# lr_f1 = evaluator_f1.evaluate(lr_predictions)
# print(f"Logistic Regression Accuracy: {lr_accuracy}")
# print(f"Logistic Regression Precision: {lr_precision}")
# print(f"Logistic Regression Recall: {lr_recall}")
# print(f"Logistic Regression F1 Score: {lr_f1}")

# # Evaluating the Random Forest Classifier
# rf_predictions = rf_model.transform(transformed_test_data_tree)
# rf_accuracy = evaluator_accuracy.evaluate(rf_predictions)
# rf_precision = evaluator_precision.evaluate(rf_predictions)
# rf_recall = evaluator_recall.evaluate(rf_predictions)
# rf_f1 = evaluator_f1.evaluate(rf_predictions)
# print(f"Random Forest Accuracy: {rf_accuracy}")
# print(f"Random Forest Precision: {rf_precision}")
# print(f"Random Forest Recall: {rf_recall}")
# print(f"Random Forest F1 Score: {rf_f1}")

# Rank the Random Forest feature importances.
# Each importance score is paired with the assembler input column at the same
# position, then the pairs are sorted from most to least important.
rf_feature_importances = rf_model.featureImportances.toArray()
tree_input_cols = assembler_tree.getInputCols()
features_importances_rf = [
    (tree_input_cols[position], float(score))
    for position, score in enumerate(rf_feature_importances)
]
importances_df_rf = (
    pd.DataFrame(features_importances_rf, columns=["Feature", "Importance"])
    .sort_values(by='Importance', ascending=False)
)
print(importances_df_rf)

# # Evaluating the GBT Classifier
# gbt_predictions = gbt_model.transform(transformed_test_data_tree)
# gbt_accuracy = evaluator_accuracy.evaluate(gbt_predictions)
# gbt_precision = evaluator_precision.evaluate(gbt_predictions)
# gbt_recall = evaluator_recall.evaluate(gbt_predictions)
# gbt_f1 = evaluator_f1.evaluate(gbt_predictions)
# print(f"GBT Classifier Accuracy: {gbt_accuracy}")
# print(f"GBT Classifier Precision: {gbt_precision}")
# print(f"GBT Classifier Recall: {gbt_recall}")
# print(f"GBT Classifier F1 Score: {gbt_f1}")

# # Get feature importances for GBT
# gbt_feature_importances = gbt_model.featureImportances.toArray()
# features_importances_gbt = [(assembler_tree.getInputCols()[i], float(gbt_feature_importances[i])) for i in range(len(gbt_feature_importances))]
# importances_df_gbt = pd.DataFrame(features_importances_gbt, columns=["Feature", "Importance"]).sort_values(by='Importance', ascending=False)
# print(importances_df_gbt)

# # Evaluating the Naive Bayes Classifier
# nb_predictions = nb_model.transform(transformed_test_data_lr_nb)
# nb_accuracy = evaluator_accuracy.evaluate(nb_predictions)
# nb_precision = evaluator_precision.evaluate(nb_predictions)
# nb_recall = evaluator_recall.evaluate(nb_predictions)
# nb_f1 = evaluator_f1.evaluate(nb_predictions)
# print(f"Naive Bayes Accuracy: {nb_accuracy}")
# print(f"Naive Bayes Precision: {nb_precision}")
# print(f"Naive Bayes Recall: {nb_recall}")
# print(f"Naive Bayes F1 Score: {nb_f1}")

# # Naive Bayes Model Parameters
# print(f"Naive Bayes Model Parameters: {nb_model.explainParams()}")

# # Extract and print class prior probabilities and conditional probabilities for Naive Bayes
# class_prior_probs = np.exp(nb_model.pi.toArray())
# conditional_probs = np.exp(nb_model.theta.toArray())

# # Get the number of classes and features
# num_classes, num

24/07/17 00:37:29 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 5 (= number of training instances)


        Feature  Importance
4           age    0.574963
3  milesperhour    0.233631
1    type_index    0.128906
2    hour_index    0.062500
0   color_index    0.000000
