In [1]:
import pandas as pd
import numpy as np

from pyspark.sql.functions import coalesce, lit
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegressionModel, RandomForestClassificationModel
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

In [2]:
packages = [
    'org.apache.hadoop:hadoop-aws:3.3.4',
    'org.apache.hadoop:hadoop-client-api:3.3.4',
    'org.apache.hadoop:hadoop-client-runtime:3.3.4',
]

conf = SparkConf() \
    .setAppName("MyApp") \
    .set("spark.driver.memory", "8g") \
    .set("spark.executor.memory", "8g") \
    .set('spark.jars.packages', ','.join(packages))

sc = SparkContext(conf=conf)

hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
hadoop_config.set('com.amazonaws.services.s3.enableV4', 'true')

spark = SparkSession(sc)

24/07/19 07:44:59 WARN Utils: Your hostname, thamaMBP.local resolves to a loopback address: 127.0.0.1; using 172.20.10.5 instead (on interface en0)
24/07/19 07:44:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/thama/ghq/github.com/mathmathpp2/synapselightgbm_trial/synapse-test/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/thama/.ivy2/cache
The jars for the packages stored in: /Users/thama/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-client-api added as a dependency
org.apache.hadoop#hadoop-client-runtime added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-17db85c5-b95c-40c4-802d-1d1e88f4dce3;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.2 in local-m2-cache
:: resolution report :: resolve 478ms :: artifacts dl 64ms
	:: modul

In [3]:
spark

In [11]:
# Create the DataFrame

# number label
# data = spark.createDataFrame([
#     (0, "red",   "SUV",   12, 20.0, 60, 1, "2024-07-01", "u4pruydqqvj"),
#     (1, "red",   "sedan",  9, 30.0, 70, 2, "2024-07-02", "u4pruydqqvk"),
#     (2, "red",   "truck", 15, 25.0, 80, 3, "2024-07-01", "u4pruydqqvj"),
#     (3, "blue",  "SUV",   20, 22.0, 65, 1, "2024-07-02", "u4pruydqqvk"),
#     (4, "blue",  "sedan",  5, 35.0, 75, 1, "2024-07-01", "u4pruydqqvj"),
#     (5, "blue",  "truck", 12, 28.0, 85, 3, "2024-07-02", "u4pruydqqvk"),
#     (6, "green", "SUV",    9, 19.0, 50, 2, "2024-07-03", "u4pruydqqvl"),
#     (7, "green", "sedan", 15, 32.0, 60, 3, "2024-07-03", "u4pruydqqvm"),
#     (8, "green", "truck", 20, 27.0, 40, 1, "2024-07-04", "u4pruydqqvn"),
#     (9, "green", "SUV",    5, 21.0, 55, 2, "2024-07-04", "u4pruydqqvo")
# ], ["id", "color", "type", "hour", "milesperhour", "age", "label", "date", "geohash"])

# string label
data = spark.createDataFrame([
    (0, "red",   "SUV",   12, 20.0, 60, "class1", "2024-07-01", "u4pruydqqvj"),
    (1, "red",   "sedan",  9, 30.0, 70, "class2", "2024-07-02", "u4pruydqqvk"),
    (2, "red",   "truck", 15, 25.0, 80, "class3", "2024-07-01", "u4pruydqqvj"),
    (3, "blue",  "SUV",   20, 22.0, 65, "class1", "2024-07-02", "u4pruydqqvk"),
    (4, "blue",  "sedan",  5, 35.0, 75, "class1", "2024-07-01", "u4pruydqqvj"),
    (5, "blue",  "truck", 12, 28.0, 85, "class3", "2024-07-02", "u4pruydqqvk"),
    (6, "green", "SUV",    9, 19.0, 50, "class2", "2024-07-03", "u4pruydqqvl"),
    (7, "green", "sedan", 15, 32.0, 60, "class3", "2024-07-03", "u4pruydqqvm"),
    (8, "green", "truck", 20, 27.0, 40, "class1", "2024-07-04", "u4pruydqqvn"),
    (9, "green", "SUV",    5, 21.0, 55, "class2", "2024-07-04", "u4pruydqqvo")
], ["id", "color", "type", "hour", "milesperhour", "age", "label", "date", "geohash"])

# Split the data into training and test sets (80% training, 20% test)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# String Indexing for features
indexers = [
    StringIndexer(inputCol="label", outputCol="label_index"),
    StringIndexer(inputCol="color", outputCol="color_index"),
    StringIndexer(inputCol="type", outputCol="type_index"),
    StringIndexer(inputCol="hour", outputCol="hour_index")
]

# One-Hot Encoding for Logistic Regression (not needed for tree-based models)
encoder = OneHotEncoder(
    inputCols=["color_index", "type_index", "hour_index"],
    outputCols=["color_vec", "type_vec", "hour_vec"]
)

# Assembling Features for Logistic Regression and Naive Bayes
assembler_lr_nb = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec", "milesperhour", "age"],
    outputCol="assembled_features" # !!!
)

# Assembling Features for Tree-Based Models
assembler_tree = VectorAssembler(
    inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age"],
    outputCol="features"
)

# MinMaxScaler for Logistic Regression and Naive Bayes
scaler_lr_nb = MinMaxScaler(inputCol="assembled_features", outputCol="features") # !!!

# Create and Fit the Pipeline for Logistic Regression and Naive Bayes
pipeline_lr_nb = Pipeline(stages=indexers + [encoder, assembler_lr_nb, scaler_lr_nb])
model_lr_nb = pipeline_lr_nb.fit(train_data)
transformed_train_data_lr_nb = model_lr_nb.transform(train_data)
transformed_test_data_lr_nb = model_lr_nb.transform(test_data)

# Create and Fit the Pipeline for Tree-Based Models
pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
model_tree = pipeline_tree.fit(train_data)
transformed_train_data_tree = model_tree.transform(train_data)
transformed_test_data_tree = model_tree.transform(test_data)

In [26]:
def get_features(df):
     # !!!
    if 'assembled_features' in df.schema.names:
        feature_attrs = df.schema['assembled_features'].metadata['ml_attr']['attrs']
    else:
        feature_attrs = df.schema['features'].metadata['ml_attr']['attrs']

    features = []
    for attr_type, attrs in feature_attrs.items():
        features += attrs

    # for each in sorted(features, key=lambda x: x['idx']):
    #     print(each['idx'], each['name'])
    
    feature_names = [each['name'] for each in sorted(features, key=lambda x: x['idx'])]

    return feature_names

In [29]:
get_features(transformed_train_data_lr_nb)

0 color_vec_green
1 color_vec_blue
2 type_vec_SUV
3 type_vec_sedan
4 hour_vec_12
5 hour_vec_20
6 hour_vec_5
7 hour_vec_9
8 milesperhour
9 age


['color_vec_green',
 'color_vec_blue',
 'type_vec_SUV',
 'type_vec_sedan',
 'hour_vec_12',
 'hour_vec_20',
 'hour_vec_5',
 'hour_vec_9',
 'milesperhour',
 'age']

In [30]:
get_features(transformed_train_data_tree)

0 color_index
1 type_index
2 hour_index
3 milesperhour
4 age


['color_index', 'type_index', 'hour_index', 'milesperhour', 'age']

# Logistic regression

## save and load data

In [33]:
# save to local
data_path = "data/transformed_train_data_lr_nb/"

# save to s3
# data_path = "s3a://test-thama-misc-20210612/20240717-sparkml/data/transformed_train_data_lr_nb/"

transformed_train_data_lr_nb.write.partitionBy("date", "geohash").mode('overwrite').save(data_path)

                                                                                

In [34]:
# load data
transformed_train_data_lr_nb_loaded = spark.read.load(data_path)

# print
transformed_train_data_lr_nb_loaded.toPandas().head(1)

Unnamed: 0,id,color,type,hour,milesperhour,age,label,label_index,color_index,type_index,hour_index,color_vec,type_vec,hour_vec,assembled_features,features,date,geohash
0,4,blue,sedan,5,35.0,75,class1,0.0,1.0,1.0,2.0,"(0.0, 1.0)","(0.0, 1.0)","(0.0, 0.0, 1.0, 0.0)","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 35.0,...","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, ...",2024-07-01,u4pruydqqvj


## save and load pipeline

In [35]:
# save to local
pipeline_model_path = "pipelines/model_lr_nb"

# save to s3
# pipeline_model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/pipelines/model_lr_nb"

model_lr_nb.write().overwrite().save(pipeline_model_path)

In [36]:
# load pipeline
loaded_model = PipelineModel.load(pipeline_model_path)

## create model

In [37]:
# Training the Logistic Regression Model
lr = LogisticRegression(featuresCol="features", labelCol="label_index")
lr_model = lr.fit(transformed_train_data_lr_nb)

24/07/19 08:00:22 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


## save and load model

In [38]:
# save to local
model_path = "models/lr_model/"

# save to s3
# model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/models/lr_model/"

lr_model.write().overwrite().save(model_path)

In [39]:
# load model
lr_model = LogisticRegressionModel.load(model_path)

## interpret model

In [40]:
# Get model coefficients and intercept for Logistic Regression
coefficients = lr_model.coefficientMatrix
intercept = lr_model.interceptVector
print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

Coefficients: DenseMatrix([[-16.50659272,   2.25680371,   2.78347211,   3.08884321,
                6.34129263,  13.67993769,   6.81600371, -11.04906031,
                4.6806029 , -22.54001381],
             [  1.95667653,  -7.68094173,   5.00340612,  -2.06113446,
              -13.11928344,  -4.42240364,  10.8390998 ,  19.37451749,
              -10.58002467,   2.6051087 ],
             [ 14.54991619,   5.42413803,  -7.78687823,  -1.02770875,
                6.77799081,  -9.25753405, -17.65510351,  -8.32545718,
                5.89942177,  19.93490511]])
Intercept: [13.415660362682623,0.9619389248875653,-14.377599287570186]


In [41]:
# shape = num_classes x num_features
np.array(coefficients.toArray().tolist()).shape

(3, 10)

In [42]:
coef_df = pd.DataFrame(
    np.array(coefficients.toArray().tolist()),
    columns=get_features(transformed_train_data_lr_nb)
)

# map label_index to label
coef_df['label'] = loaded_model.stages[0].labels
coef_df[['label'] + list(coef_df.columns)[:-1]]

0 color_vec_green
1 color_vec_blue
2 type_vec_SUV
3 type_vec_sedan
4 hour_vec_12
5 hour_vec_20
6 hour_vec_5
7 hour_vec_9
8 milesperhour
9 age


Unnamed: 0,label,color_vec_green,color_vec_blue,type_vec_SUV,type_vec_sedan,hour_vec_12,hour_vec_20,hour_vec_5,hour_vec_9,milesperhour,age
0,class1,-16.506593,2.256804,2.783472,3.088843,6.341293,13.679938,6.816004,-11.04906,4.680603,-22.540014
1,class2,1.956677,-7.680942,5.003406,-2.061134,-13.119283,-4.422404,10.8391,19.374517,-10.580025,2.605109
2,class3,14.549916,5.424138,-7.786878,-1.027709,6.777991,-9.257534,-17.655104,-8.325457,5.899422,19.934905


In [43]:
coef_df2 = coef_df.reset_index().melt(id_vars=['label'], var_name='feature', value_name='coefficient')
coef_df2[['label', 'feature', 'coefficient']].sort_values(by=['label', 'feature'])

Unnamed: 0,label,feature,coefficient
30,class1,age,-22.540014
6,class1,color_vec_blue,2.256804
3,class1,color_vec_green,-16.506593
15,class1,hour_vec_12,6.341293
18,class1,hour_vec_20,13.679938
21,class1,hour_vec_5,6.816004
24,class1,hour_vec_9,-11.04906
0,class1,index,0.0
27,class1,milesperhour,4.680603
9,class1,type_vec_SUV,2.783472


# Random forest

## save and load data

In [44]:
# save to local
data_path = "data/transformed_train_data_tree/"

# save to s3
# data_path = "s3a://test-thama-misc-20210612/20240717-sparkml/data/transformed_train_data_tree/"

transformed_train_data_tree.write.partitionBy("date", "geohash").mode('overwrite').save(data_path)

In [45]:
# load
transformed_train_data_tree_loaded = spark.read.load(data_path)

## save and load pipeline

In [46]:
# save to local
pipeline_model_path = "pipelines/model_tree"

# save to s3
# pipeline_model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/pipelines/model_tree"

model_tree.write().overwrite().save(pipeline_model_path)

In [47]:
# load pipeline
loaded_model = PipelineModel.load(pipeline_model_path)

## create model

In [48]:
# Training the Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label_index")
rf_model = rf.fit(transformed_train_data_tree)

24/07/19 08:00:30 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 9 (= number of training instances)


## save and load model

In [49]:
# save to local
model_path = "models/rf_model/"

# save to s3
# model_path = "s3a://test-thama-misc-20210612/20240717-sparkml/models/rf_model/"

rf_model.write().overwrite().save(model_path)

In [50]:
# load model
rf_model = RandomForestClassificationModel.load(model_path)



## interpret model

In [51]:
# Get feature importances for Random Forest
rf_feature_importances = rf_model.featureImportances.toArray()
features_importances_rf = [(assembler_tree.getInputCols()[i], float(rf_feature_importances[i])) for i in range(len(rf_feature_importances))]
importances_df_rf = pd.DataFrame(features_importances_rf, columns=["Feature", "Importance"]).sort_values(by='Importance', ascending=False)

In [52]:
importances_df_rf

Unnamed: 0,Feature,Importance
0,color_index,0.301543
3,milesperhour,0.298617
2,hour_index,0.258801
1,type_index,0.075918
4,age,0.065121
