In [1]:
#111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000

In [2]:
# Create (or reuse) the Spark context and session.
# getOrCreate() keeps this cell safe to re-run: calling pyspark.SparkContext()
# a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
import pyspark
from pyspark.sql import SparkSession
spark_context = pyspark.SparkContext.getOrCreate()
# The builder attaches the session to the already-active context.
spark_session = SparkSession.builder.getOrCreate()

In [3]:
# Read creditcard.csv into a DataFrame: the first row supplies the column
# names, fields are comma-separated, and column types are inferred from the data.
df = spark_session.read.csv(
    path="creditcard.csv",
    sep=",",
    header=True,
    inferSchema=True,
)
# Peek at the first row only.
df.show(n=1)

+----+----------------+-------------------+----------------+----------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------------+------------------+-----------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+-------------------+------+-----+
|Time|              V1|                 V2|              V3|              V4|                V5|               V6|               V7|                V8|               V9|               V10|               V11|               V12|               V13|               V14|             V15|               V16|              V17|               V18|              V19|              V20|               V21|              V22|               V23|               V24|              V

In [4]:
# Print the inferred schema (column name, type and nullability of each
# column) to look up the column names used in the following cells.
df.printSchema()

root
 |-- Time: decimal(10,0) (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double

In [5]:
# Pack columns V1..V20 into a single vector column named "features", then keep
# only that vector and the "class" label column.
# NOTE(review): the schema printed above also lists V21..V28; confirm that
# leaving them out of the feature vector is intentional.
from pyspark.ml.feature import VectorAssembler
feature_columns = ["V%d" % index for index in range(1, 21)]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
vdf = vectorAssembler.transform(df).select(["features", "class"])
vdf.show(n=1)

+--------------------+-----+
|            features|class|
+--------------------+-----+
|[-1.3598071336738...|    0|
+--------------------+-----+
only showing top 1 row



In [6]:
# Randomly partition the assembled rows into a training and a test DataFrame
# using 0.8 / 0.2 weights and a fixed seed (5).
split_weights = [0.8, 0.2]
df_train, df_test = vdf.randomSplit(split_weights, seed=5)
df_train.show(n=1)

+--------------------+-----+
|            features|class|
+--------------------+-----+
|[-30.552380043581...|    1|
+--------------------+-----+
only showing top 1 row



In [7]:
# Fit a gradient-boosted-tree regressor on the training split.
# The regressor (maxIter=1) is wrapped in a one-stage Pipeline; the fitted
# pipeline is kept in `model`.
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(
    labelCol="class",
    featuresCol="features",
    maxIter=1,
)
pipeline = Pipeline(stages=[gbt])
model = pipeline.fit(df_train)

In [8]:
# Score the held-out test split and keep just the predicted value next to the
# true label.
predictions = model.transform(df_test).select("prediction", "class")
predictions.show()

+--------------------+-----+
|          prediction|class|
+--------------------+-----+
|  0.7739130434782608|    0|
|  0.7739130434782608|    1|
|  0.7739130434782608|    1|
|0.031578947368421054|    0|
|  0.7739130434782608|    1|
|  0.7739130434782608|    1|
|0.031578947368421054|    0|
|  0.7739130434782608|    1|
|  0.7739130434782608|    0|
|  0.7739130434782608|    1|
|0.002285915245296...|    0|
|8.137358613394092E-5|    0|
|  0.7739130434782608|    0|
|8.137358613394092E-5|    0|
|8.137358613394092E-5|    0|
|1.919017463058913...|    0|
|  0.9476190476190476|    1|
|1.919017463058913...|    0|
|8.137358613394092E-5|    0|
|8.137358613394092E-5|    0|
+--------------------+-----+
only showing top 20 rows



In [9]:
# Turn the regression score into a hard 0/1 label: 1 when the score is above
# 0.5, otherwise 0.
# A native Spark column expression is used instead of a Python UDF, so the
# thresholding runs inside Spark rather than sending every row through a
# Python worker.
from pyspark.sql.functions import udf, col, when  # `udf` import kept in case later cells rely on it
from pyspark.sql.types import IntegerType


def func(prediction):
    """Return an integer Column that is 1 where `prediction` > 0.5, else 0.

    `prediction` is a Spark Column (e.g. col("prediction")). Rows for which
    the comparison is not true -- including null scores -- get 0.
    """
    return when(prediction > 0.5, 1).otherwise(0).cast(IntegerType())


npredictions = predictions.withColumn('predclass', func(col("prediction")))
npredictions.show()

+--------------------+-----+---------+
|          prediction|class|predclass|
+--------------------+-----+---------+
|  0.7739130434782608|    0|        1|
|  0.7739130434782608|    1|        1|
|  0.7739130434782608|    1|        1|
|0.031578947368421054|    0|        0|
|  0.7739130434782608|    1|        1|
|  0.7739130434782608|    1|        1|
|0.031578947368421054|    0|        0|
|  0.7739130434782608|    1|        1|
|  0.7739130434782608|    0|        1|
|  0.7739130434782608|    1|        1|
|0.002285915245296...|    0|        0|
|8.137358613394092E-5|    0|        0|
|  0.7739130434782608|    0|        1|
|8.137358613394092E-5|    0|        0|
|8.137358613394092E-5|    0|        0|
|1.919017463058913...|    0|        0|
|  0.9476190476190476|    1|        1|
|1.919017463058913...|    0|        0|
|8.137358613394092E-5|    0|        0|
|8.137358613394092E-5|    0|        0|
+--------------------+-----+---------+
only showing top 20 rows



In [10]:
# Measure the test error: root-mean-squared error between the "prediction"
# column and the "class" label column.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="class",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.0276316


In [11]:
# Compute an AUC score.
# Both columns are brought to the driver in ONE collect() so every score stays
# paired with its own label. Two separate collect() calls would run the
# scoring job twice and rely on both jobs returning rows in the same order.
from sklearn import metrics
scored_rows = predictions.select("prediction", "class").collect()
score = [row["prediction"] for row in scored_rows]
ground_truth = [int(row["class"]) for row in scored_rows]
# ROC curve with class 1 as the positive label, then the area under it.
fpr, tpr, thresholds = metrics.roc_curve(ground_truth, score, pos_label=1)
print("AUC Score=", metrics.auc(fpr, tpr))

AUC Score= 0.9591327570454707


In [12]:
# Per-class hit rate for class 0 ("not fraud"): count the class-0 rows, then
# those of them whose thresholded prediction is also 0.
not_fraud_rows = npredictions.filter(npredictions["class"] == 0)
num_not_fraud = not_fraud_rows.count()
num_not_fraud_correct = not_fraud_rows.filter(not_fraud_rows["predclass"] == 0).count()
# Guard the ratio: with no class-0 rows the division would raise
# ZeroDivisionError, so report NaN instead.
not_fraud_accuracy = num_not_fraud_correct / num_not_fraud if num_not_fraud else float("nan")
print( "number of not fraud = ", num_not_fraud, ", number correctly classified", num_not_fraud_correct,
      ", accuracy = ", not_fraud_accuracy )

number of not fraud =  56916 , number correctly classified 56894 , accuracy =  0.9996134654578678


In [13]:
# Per-class hit rate for class 1 ("fraud"): count the class-1 rows, then those
# of them whose thresholded prediction is also 1.
fraud_rows = npredictions.filter(npredictions["class"] == 1)
num_fraud = fraud_rows.count()
num_fraud_correct = fraud_rows.filter(fraud_rows["predclass"] == 1).count()
# Guard the ratio: a split that happens to contain no class-1 rows would
# otherwise raise ZeroDivisionError, so report NaN instead.
fraud_accuracy = num_fraud_correct / num_fraud if num_fraud else float("nan")
print( "number of fraud = ", num_fraud, ", number correctly classified", num_fraud_correct,
      ", accuracy = ", fraud_accuracy )

number of fraud =  102 , number correctly classified 74 , accuracy =  0.7254901960784313


In [14]:
# Refit the gradient-boosted-tree regressor on the same training split, this
# time with maxIter=10. The fitted one-stage pipeline replaces `model`.
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(
    labelCol="class",
    featuresCol="features",
    maxIter=10,
)
pipeline = Pipeline(stages=[gbt])
model = pipeline.fit(df_train)

In [15]:
# Score the test split with the refitted model, keeping only the predicted
# value and the true label.
predictions = model.transform(df_test).select("prediction", "class")

In [16]:
# Measure the test error of the refitted model: root-mean-squared error
# between the "prediction" column and the "class" label column.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="class",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 0.02563


In [17]:
# Compute an AUC score for the refitted model.
# Both columns are brought to the driver in ONE collect() so every score stays
# paired with its own label. Two separate collect() calls would run the
# scoring job twice and rely on both jobs returning rows in the same order.
from sklearn import metrics
scored_rows = predictions.select("prediction", "class").collect()
score = [row["prediction"] for row in scored_rows]
ground_truth = [int(row["class"]) for row in scored_rows]
# ROC curve with class 1 as the positive label, then the area under it.
fpr, tpr, thresholds = metrics.roc_curve(ground_truth, score, pos_label=1)
print("AUC Score=", metrics.auc(fpr, tpr))

AUC Score= 0.9544809929734772


In [18]:
# Repeat the whole experiment on a different random split (seed=10):
# split -> train (maxIter=10) -> predict -> RMSE -> AUC.
# Imports are grouped at the top of the cell so a missing dependency fails
# before the training job starts rather than after it.
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor
from sklearn import metrics

# Split data into training and test dataframe
df_train, df_test = vdf.randomSplit([0.8, 0.2], seed=10)

# Create a GBT model and train it...
gbt = GBTRegressor(featuresCol="features", labelCol="class", maxIter=10)
pipeline = Pipeline(stages=[gbt])
model = pipeline.fit(df_train)

# Make predictions, keeping only the predicted value and the true label.
predictions = model.transform(df_test).select("prediction", "class")

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="class", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Compute an AUC score.
# Both columns are brought to the driver in ONE collect() so every score stays
# paired with its own label. Two separate collect() calls would run the
# scoring job twice and rely on both jobs returning rows in the same order.
scored_rows = predictions.collect()
score = [row["prediction"] for row in scored_rows]
ground_truth = [int(row["class"]) for row in scored_rows]
fpr, tpr, thresholds = metrics.roc_curve(ground_truth, score, pos_label=1)
print("AUC Score=", metrics.auc(fpr, tpr))

Root Mean Squared Error (RMSE) on test data = 0.0223985
AUC Score= 0.9639666816122471
