In [17]:
%%time

from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("MllibExample")
    .getOrCreate()
)

CPU times: user 6.44 ms, sys: 1.77 ms, total: 8.21 ms
Wall time: 469 ms


## Test data: assemble feature vectors for prediction

In [18]:
%%time
from pyspark.ml.feature import VectorAssembler


# Column names, declared once. They are used both as the schema of the test
# DataFrame and as the VectorAssembler inputs, so the two cannot drift apart.
feature_cols = [
    'amount',
    'customer_id_avrge_amount_1day', 'customer_id_avrge_amount_1week', 'customer_id_avrge_amount_1month', 'customer_id_avrge_amount_3month',
    'customer_id_count_1day', 'customer_id_count_1week', 'customer_id_count_1month', 'customer_id_count_3month',
    'account_id_avrge_amount_1day', 'account_id_avrge_amount_1week', 'account_id_avrge_amount_1month', 'account_id_avrge_amount_3month',
    'account_id_count_1day', 'account_id_count_1week', 'account_id_count_1month', 'account_id_count_3month',
    'transaction_in_weekend', 'transaction_at_night'
]

# Name of the target column. The test rows below carry no such column, and
# nothing in this cell reads it; it is kept so other cells can rely on it.
target_col = "is_fraud"

# Two hand-written rows; values are listed in the same order as feature_cols.
test_rows = [
    (75,
     75, 80, 80, 80,
     1, 2, 2, 2,
     75, 75, 75, 75,
     1, 1, 1, 2,
     0, 0
    ),
    (7500,
     7500, 5000, 3000, 2500,
     1, 2, 3, 4,
     7000, 6000, 4500, 2775,
     2, 4, 5, 7,
     1, 1
    ),
]

# Fail fast if a row and the column list ever get out of sync.
assert all(len(row) == len(feature_cols) for row in test_rows), \
    "each test row must supply exactly one value per feature column"

data = spark.createDataFrame(test_rows, feature_cols)

# Combine the individual feature columns into a single "features" vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_data = assembler.transform(data)

# Keep only the assembled vector column and display it
prediction_test_data = df_data.select(['features'])
prediction_test_data.show()

+--------------------+
|            features|
+--------------------+
|[75.0,75.0,80.0,8...|
|[7500.0,7500.0,50...|
+--------------------+

CPU times: user 94 ms, sys: 10.7 ms, total: 105 ms
Wall time: 5.57 s


## MLlib logistic regression (LR)

In [19]:
%%time

from pyspark.ml.classification import LogisticRegressionModel

# Load the previously saved logistic-regression model from the "lr_ml" path
lr_model = LogisticRegressionModel.load(path="lr_ml")

CPU times: user 19.5 ms, sys: 8.72 ms, total: 28.2 ms
Wall time: 8.31 s


In [20]:
%%time
# Apply the loaded LR model to the assembled test vectors
lr_predictions = lr_model.transform(dataset=prediction_test_data)

CPU times: user 8.19 ms, sys: 1.19 ms, total: 9.37 ms
Wall time: 76.7 ms


In [23]:
%%time
# Print the LR output rows (defaults spelled out: up to 20 rows, long cells truncated)
lr_predictions.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[75.0,75.0,80.0,8...|[16.9614953733361...|[0.99999995697547...|       0.0|
|[7500.0,7500.0,50...|[-55.516184808450...|[7.75581214435529...|       1.0|
+--------------------+--------------------+--------------------+----------+

CPU times: user 3.04 ms, sys: 222 µs, total: 3.26 ms
Wall time: 338 ms


In [30]:
# Bring the rows back to the driver and display the feature vector of the second one
lr_predictions.select("features").collect()[1]["features"]

DenseVector([7500.0, 7500.0, 5000.0, 3000.0, 2500.0, 1.0, 2.0, 3.0, 4.0, 7000.0, 6000.0, 4500.0, 2775.0, 2.0, 4.0, 5.0, 7.0, 1.0, 1.0])

In [49]:
%%time
from pyspark.ml.linalg import Vectors

# Hand-built feature vector: same values as the second test row above, except
# the first two entries (11500.0 and 1500.0 instead of 7500.0 and 7500.0).
dense_vector = Vectors.dense(
    [
        11500.0,
        1500.0, 5000.0, 3000.0, 2500.0,
        1.0, 2.0, 3.0, 4.0,
        7000.0, 6000.0, 4500.0, 2775.0,
        2.0, 4.0, 5.0, 7.0,
        1.0, 1.0,
    ]
)

# Wrap the vector in a one-row, one-column DataFrame so the models can transform it
data_vect = spark.createDataFrame([(dense_vector,)], schema=['features'])
data_vect.show(truncate=False)

+---------------------------------------------------------------------------------------------------------+
|features                                                                                                 |
+---------------------------------------------------------------------------------------------------------+
|[11500.0,1500.0,5000.0,3000.0,2500.0,1.0,2.0,3.0,4.0,7000.0,6000.0,4500.0,2775.0,2.0,4.0,5.0,7.0,1.0,1.0]|
+---------------------------------------------------------------------------------------------------------+

CPU times: user 9.08 ms, sys: 2.35 ms, total: 11.4 ms
Wall time: 258 ms


In [51]:
%%time
# Score the single hand-built vector with the LR model and display its predicted label
(
    lr_model.transform(data_vect)
    .select('prediction')
    .collect()[0]['prediction']
)

CPU times: user 16.9 ms, sys: 3.06 ms, total: 20 ms
Wall time: 210 ms


1.0

## MLlib multilayer perceptron (MLP)

In [14]:
%%time

from pyspark.ml.classification import MultilayerPerceptronClassificationModel
# Load the previously saved multilayer-perceptron model from the "mlp_ml" path
mlp_model = MultilayerPerceptronClassificationModel.load(path="mlp_ml")

CPU times: user 72.6 ms, sys: 2.81 ms, total: 75.4 ms
Wall time: 4.53 s


In [15]:
%%time

# Apply the loaded MLP model to the assembled test vectors
mlp_predictions = mlp_model.transform(dataset=prediction_test_data)

CPU times: user 16.8 ms, sys: 5.91 ms, total: 22.7 ms
Wall time: 357 ms


In [16]:
%%time
# Print the MLP output rows (defaults spelled out: up to 20 rows, long cells truncated)
mlp_predictions.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[75.0,75.0,80.0,8...|[0.05044739035084...|[0.92734005871064...|       0.0|
|[7500.0,7500.0,50...|[-0.4487054816695...|[0.78895819884336...|       0.0|
+--------------------+--------------------+--------------------+----------+

CPU times: user 4.46 ms, sys: 2.25 ms, total: 6.71 ms
Wall time: 4.62 s


In [53]:
# Score the single hand-built vector with the MLP model and display its predicted label
(
    mlp_model.transform(data_vect)
    .select('prediction')
    .collect()[0]['prediction']
)

0.0

In [56]:
# Scratch cell: build a one-entry dict and display it
m = {'a': 1}
m

{'a': 1}