In [28]:
import pandas as pd
import numpy as np
import random
import string
from datetime import datetime, timedelta

from pyspark.sql.functions import coalesce, lit, rand
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegressionModel, RandomForestClassificationModel
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

In [29]:
packages = [
    'org.apache.hadoop:hadoop-aws:3.3.4',
    'org.apache.hadoop:hadoop-client-api:3.3.4',
    'org.apache.hadoop:hadoop-client-runtime:3.3.4',
]

conf = SparkConf() \
    .setAppName("MyApp") \
    .set("spark.driver.memory", "8g") \
    .set("spark.executor.memory", "8g") \
    .set('spark.jars.packages', ','.join(packages))

sc = SparkContext(conf=conf)

hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
hadoop_config.set('com.amazonaws.services.s3.enableV4', 'true')

spark = SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=MyApp, master=local[*]) created by __init__ at /var/folders/w_/hg905l611296df772h1275kr0000gn/T/ipykernel_37179/2579216408.py:13 

In [30]:
spark

In [48]:
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))

def random_geohash():
    return ''.join(random.choices(string.ascii_lowercase + string.digits, k=11))

data = []
start_date = datetime(2024, 7, 1)
end_date = datetime(2024, 7, 4)
colors = ["red", "blue", "green"]
types = ["SUV", "sedan", "truck"]

for _ in range(1000):
    id = _
    color = random.choice(colors)
    type = random.choice(types)
    hour = random.randint(0, 4)
    milesperhour = random.uniform(19.0, 35.0)
    age = random.randint(40, 85)

    if random.randint(1, 10) < 7:
        if color == "red":
            label = 1
        elif color == "blue":
            label = 2
        else:
            label = 3
    else:
        label = random.randint(1, 3)
    
    date = random_date(start_date, end_date).strftime("%Y-%m-%d")
    geohash = random_geohash()
    data.append((id, color, type, hour, milesperhour, age, label, date, geohash))

schema = ["id", "color", "type", "hour", "milesperhour", "age", "label", "date", "geohash"]
data = spark.createDataFrame(data, schema)

data.show()

+---+-----+-----+----+------------------+---+-----+----------+-----------+
| id|color| type|hour|      milesperhour|age|label|      date|    geohash|
+---+-----+-----+----+------------------+---+-----+----------+-----------+
|  0| blue|sedan|   2| 23.85204602239879| 41|    2|2024-07-02|u0jvfxosqr5|
|  1|green|sedan|   2|20.295620865550074| 81|    3|2024-07-02|7cbzbn6kgxg|
|  2|  red|truck|   2| 33.61090496536134| 70|    1|2024-07-01|3w70knmoobn|
|  3| blue|truck|   4|33.141766706752065| 65|    2|2024-07-04|lbkoduj9ff9|
|  4|green|  SUV|   2|22.608227456451203| 70|    3|2024-07-03|vqzx4onq15q|
|  5|green|truck|   2|19.094984118957022| 60|    3|2024-07-01|9dh9s0c2mf6|
|  6|green|sedan|   4| 28.02709350819081| 46|    2|2024-07-03|y15n01iyey8|
|  7|green|truck|   4| 21.76558467703689| 52|    3|2024-07-01|og9gtht2xu0|
|  8| blue|truck|   0|31.203967505819808| 61|    2|2024-07-04|d0rqn1z6dg1|
|  9|green|  SUV|   3| 27.15535647554013| 65|    3|2024-07-01|l569mbxs13p|
| 10|  red|sedan|   0| 24

In [49]:
# string label
# data = spark.createDataFrame([
#     (0, "red",   "SUV",   12, 20.0, 60, "class1", "2024-07-01", "u4pruydqqvj"),
#     (1, "red",   "sedan",  9, 30.0, 70, "class2", "2024-07-02", "u4pruydqqvk"),
#     (2, "red",   "truck", 15, 25.0, 80, "class3", "2024-07-01", "u4pruydqqvj"),
#     (3, "blue",  "SUV",   20, 22.0, 65, "class1", "2024-07-02", "u4pruydqqvk"),
#     (4, "blue",  "sedan",  5, 35.0, 75, "class1", "2024-07-01", "u4pruydqqvj"),
#     (5, "blue",  "truck", 12, 28.0, 85, "class3", "2024-07-02", "u4pruydqqvk"),
#     (6, "green", "SUV",    9, 19.0, 50, "class2", "2024-07-03", "u4pruydqqvl"),
#     (7, "green", "sedan", 15, 32.0, 60, "class3", "2024-07-03", "u4pruydqqvm"),
#     (8, "green", "truck", 20, 27.0, 40, "class1", "2024-07-04", "u4pruydqqvn"),
#     (9, "green", "SUV",    5, 21.0, 55, "class2", "2024-07-04", "u4pruydqqvo")
# ], ["id", "color", "type", "hour", "milesperhour", "age", "label", "date", "geohash"])

# Split the data into training and test sets (80% training, 20% test)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# String Indexing for features
indexers = [
    StringIndexer(inputCol="label", outputCol="label_index"),
    StringIndexer(inputCol="color", outputCol="color_index"),
    StringIndexer(inputCol="type", outputCol="type_index"),
    StringIndexer(inputCol="hour", outputCol="hour_index")
]

# One-Hot Encoding for Logistic Regression (not needed for tree-based models)
encoder = OneHotEncoder(
    inputCols=["color_index", "type_index", "hour_index"],
    outputCols=["color_vec", "type_vec", "hour_vec"]
)

# Assembling Features for Logistic Regression and Naive Bayes
assembler_lr_nb = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec", "milesperhour", "age"],
    outputCol="assembled_features"
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age"],
    outputCol="features"
)

# MinMaxScaler for Logistic Regression and Naive Bayes
scaler_lr_nb = MinMaxScaler(inputCol="assembled_features", outputCol="features")

# Create and Fit the Pipeline for Logistic Regression and Naive Bayes
pipeline_lr_nb = Pipeline(stages=indexers + [encoder, assembler_lr_nb, scaler_lr_nb])
model_lr_nb = pipeline_lr_nb.fit(train_data)
transformed_train_data_lr_nb = model_lr_nb.transform(train_data)
transformed_test_data_lr_nb = model_lr_nb.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

In [50]:
model_lr_nb.stages[0].labels

['2', '3', '1']

In [51]:
def get_features(df):
    if 'assembled_features' in df.schema.names:
        feature_attrs = df.schema['assembled_features'].metadata['ml_attr']['attrs']
    else:
        feature_attrs = df.schema['features'].metadata['ml_attr']['attrs']

    features = []
    for attr_type, attrs in feature_attrs.items():
        features += attrs

    # for each in sorted(features, key=lambda x: x['idx']):
    #     print(each['idx'], each['name'])
    
    feature_names = [each['name'] for each in sorted(features, key=lambda x: x['idx'])]

    return feature_names

In [52]:
get_features(transformed_train_data_lr_nb)

['color_vec_blue',
 'color_vec_green',
 'type_vec_truck',
 'type_vec_SUV',
 'hour_vec_4',
 'hour_vec_3',
 'hour_vec_1',
 'hour_vec_2',
 'milesperhour',
 'age']

In [53]:
get_features(transformed_train_data_tree)

['color_index', 'type_index', 'hour_index', 'milesperhour', 'age']

# Logistic regression

## save and load data

In [54]:
# save to local
data_path = "data/transformed_train_data_lr_nb/"

# save to s3
# data_path = "s3a://test-thama-misc-20210612/20240717-sparkml/data/transformed_train_data_lr_nb/"

transformed_train_data_lr_nb.write.partitionBy("date", "geohash").mode('overwrite').save(data_path)

                                                                                

In [55]:
# load data
transformed_train_data_lr_nb_loaded = spark.read.load(data_path)

# print
transformed_train_data_lr_nb_loaded.toPandas().head(1)

                                                                                

Unnamed: 0,id,color,type,hour,milesperhour,age,label,label_index,color_index,type_index,hour_index,color_vec,type_vec,hour_vec,assembled_features,features,date,geohash
0,883,green,truck,4,31.798986,59,3,1.0,1.0,0.0,0.0,"(0.0, 1.0)","(1.0, 0.0)","(1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 31.79...","(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.799...",2024-07-04,c9djp8ws986


## save and load pipeline

In [56]:
# save to local
pipeline_model_path = "pipelines/model_lr_nb"

# save to s3
# pipeline_model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/pipelines/model_lr_nb"

model_lr_nb.write().overwrite().save(pipeline_model_path)

In [57]:
# load pipeline
loaded_model = PipelineModel.load(pipeline_model_path)

## create model

In [41]:
# Training the Logistic Regression Model
lr = LogisticRegression(featuresCol="features", labelCol="label_index")
lr_model = lr.fit(transformed_train_data_lr_nb)

## save and load model

In [42]:
# save to local
model_path = "models/lr_model/"

# save to s3
# model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/models/lr_model/"

lr_model.write().overwrite().save(model_path)

In [43]:
# load model
lr_model = LogisticRegressionModel.load(model_path)

## interpret model

In [44]:
# Get model coefficients and intercept for Logistic Regression
coefficients = lr_model.coefficientMatrix
intercept = lr_model.interceptVector
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

Coefficients: DenseMatrix([[ 1.95362006e+00,  1.63605288e-01, -2.76989651e-01,
              -5.16043888e-02,  3.09372788e-01,  2.10867137e-01,
               2.49153739e-01,  2.08468315e-01,  3.66401691e-01,
               6.18207235e-01],
             [-1.69201551e-01,  1.66896353e+00,  2.05872674e-01,
               3.96516014e-02, -3.53387303e-01, -3.05530031e-01,
              -1.36584129e-01, -6.24807536e-04, -1.38313359e-01,
              -3.92570818e-01],
             [-1.78441851e+00, -1.83256882e+00,  7.11169769e-02,
               1.19527874e-02,  4.40145153e-02,  9.46628932e-02,
              -1.12569610e-01, -2.07843507e-01, -2.28088332e-01,
              -2.25636417e-01]])
Intercept: [-1.3216406774434617,-0.13098727058627097,1.4526279480297326]


In [45]:
# shape = num_classes x num_features
np.array(coefficients.toArray().tolist()).shape

(3, 10)

In [46]:
coef_df = pd.DataFrame(
    np.array(coefficients.toArray().tolist()),
    columns=get_features(transformed_train_data_lr_nb)
)

# map label_index to label
coef_df['label'] = loaded_model.stages[0].labels
coef_df[['label'] + list(coef_df.columns)[:-1]]

Unnamed: 0,label,color_vec_green,color_vec_blue,type_vec_sedan,type_vec_truck,hour_vec_1,hour_vec_4,hour_vec_0,hour_vec_3,milesperhour,age
0,3,1.95362,0.163605,-0.27699,-0.051604,0.309373,0.210867,0.249154,0.208468,0.366402,0.618207
1,2,-0.169202,1.668964,0.205873,0.039652,-0.353387,-0.30553,-0.136584,-0.000625,-0.138313,-0.392571
2,1,-1.784419,-1.832569,0.071117,0.011953,0.044015,0.094663,-0.11257,-0.207844,-0.228088,-0.225636


In [47]:
coef_df2 = coef_df.reset_index().melt(id_vars=['label'], var_name='feature', value_name='coefficient')
coef_df2[['label', 'feature', 'coefficient']].sort_values(by=['label', 'feature'])

Unnamed: 0,label,feature,coefficient
32,1,age,-0.225636
8,1,color_vec_blue,-1.832569
5,1,color_vec_green,-1.784419
23,1,hour_vec_0,-0.11257
17,1,hour_vec_1,0.044015
26,1,hour_vec_3,-0.207844
20,1,hour_vec_4,0.094663
2,1,index,2.0
29,1,milesperhour,-0.228088
11,1,type_vec_sedan,0.071117


## evaluate

In [25]:
# # Initialize evaluators for all models
# evaluator_accuracy = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="accuracy")
# evaluator_precision = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedPrecision")
# evaluator_recall = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="weightedRecall")
# evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label_index", metricName="f1")

# Initialize evaluators for all models
evaluator_accuracy = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label", metricName="f1")

In [26]:
# Evaluating the Logistic Regression Model
lr_predictions = lr_model.transform(transformed_test_data_lr_nb)
lr_accuracy = evaluator_accuracy.evaluate(lr_predictions)
lr_precision = evaluator_precision.evaluate(lr_predictions)
lr_recall = evaluator_recall.evaluate(lr_predictions)
lr_f1 = evaluator_f1.evaluate(lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy}")
print(f"Logistic Regression Precision: {lr_precision}")
print(f"Logistic Regression Recall: {lr_recall}")
print(f"Logistic Regression F1 Score: {lr_f1}")

Logistic Regression Accuracy: 0.7014218009478673
Logistic Regression Precision: 0.7123299902820629
Logistic Regression Recall: 0.7014218009478672
Logistic Regression F1 Score: 0.702552651112263


# Random forest

## save and load data

In [75]:
# save to local
data_path = "data/transformed_train_data_tree/"

# save to s3
# data_path = "s3a://test-thama-misc-20210612/20240717-sparkml/data/transformed_train_data_tree/"

transformed_train_data_tree.write.partitionBy("date", "geohash").mode('overwrite').save(data_path)

In [76]:
# load
transformed_train_data_tree_loaded = spark.read.load(data_path)

## save and load pipeline

In [77]:
# save to local
pipeline_model_path = "pipelines/model_tree"

# save to s3
# pipeline_model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/pipelines/model_tree"

model_tree.write().overwrite().save(pipeline_model_path)

In [78]:
# load pipeline
loaded_model = PipelineModel.load(pipeline_model_path)

## create model

In [79]:
# Training the Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label_index")
rf_model = rf.fit(transformed_train_data_tree)

24/07/19 10:07:21 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 9 (= number of training instances)


## save and load model

In [80]:
# save to local
model_path = "models/rf_model/"

# save to s3
# model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/models/rf_model/"

rf_model.write().overwrite().save(model_path)

In [81]:
# load model
rf_model = RandomForestClassificationModel.load(model_path)

## interpret model

In [82]:
# Get feature importances for Random Forest
rf_feature_importances = rf_model.featureImportances.toArray()
features_importances_rf = [(assembler_tree.getInputCols()[i], float(rf_feature_importances[i])) for i in range(len(rf_feature_importances))]
importances_df_rf = pd.DataFrame(features_importances_rf, columns=["Feature", "Importance"]).sort_values(by='Importance', ascending=False)

In [83]:
importances_df_rf

Unnamed: 0,Feature,Importance
0,color_index,0.301543
3,milesperhour,0.298617
2,hour_index,0.258801
1,type_index,0.075918
4,age,0.065121
