# Importing spark

In [5]:
# import findspark
# findspark.init()
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Python Spark").getOrCreate()
sc = spark.sparkContext

# Preparing the data

In [6]:
df_transactions = spark.read.option("header", True)\
    .option("delimiter", "|")\
    .option("delimiter", ",")\
    .option("inferSchema", "true")\
    .csv('data/train.csv')\
    .withColumnRenamed('default_payment_next_month', 'label')

In [7]:
df_transactions.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- LIMIT_BAL: double (nullable = true)
 |-- SEX: integer (nullable = true)
 |-- EDUCATION: integer (nullable = true)
 |-- MARRIAGE: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- PAY_0: integer (nullable = true)
 |-- PAY_2: integer (nullable = true)
 |-- PAY_3: integer (nullable = true)
 |-- PAY_4: integer (nullable = true)
 |-- PAY_5: integer (nullable = true)
 |-- PAY_6: integer (nullable = true)
 |-- BILL_AMT1: double (nullable = true)
 |-- BILL_AMT2: double (nullable = true)
 |-- BILL_AMT3: double (nullable = true)
 |-- BILL_AMT4: double (nullable = true)
 |-- BILL_AMT5: double (nullable = true)
 |-- BILL_AMT6: double (nullable = true)
 |-- PAY_AMT1: double (nullable = true)
 |-- PAY_AMT2: double (nullable = true)
 |-- PAY_AMT3: double (nullable = true)
 |-- PAY_AMT4: double (nullable = true)
 |-- PAY_AMT5: double (nullable = true)
 |-- PAY_AMT6: double (nullable = true)
 |-- label: integer (nullable = true)



In [8]:
train, test = df_transactions.randomSplit([0.8, 0.2])
train.show()

+---+---------+---+---------+--------+---+-----+-----+-----+-----+-----+-----+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+-----+
| ID|LIMIT_BAL|SEX|EDUCATION|MARRIAGE|AGE|PAY_0|PAY_2|PAY_3|PAY_4|PAY_5|PAY_6|BILL_AMT1|BILL_AMT2|BILL_AMT3|BILL_AMT4|BILL_AMT5|BILL_AMT6|PAY_AMT1|PAY_AMT2|PAY_AMT3|PAY_AMT4|PAY_AMT5|PAY_AMT6|label|
+---+---------+---+---------+--------+---+-----+-----+-----+-----+-----+-----+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+-----+
|  1|  20000.0|  2|        2|       1| 24|    2|    2|   -1|   -1|   -2|   -2|   3913.0|   3102.0|    689.0|      0.0|      0.0|      0.0|     0.0|   689.0|     0.0|     0.0|     0.0|     0.0|    1|
|  2| 120000.0|  2|        2|       2| 26|   -1|    2|    0|    0|    0|    2|   2682.0|   1725.0|   2682.0|   3272.0|   3455.0|   3261.0|     0.0|  1000.0|  1000.0|  1000.0|     0.0|  2000.0|    1|
|  3|

# Preparing the model

In [12]:
df_transactions.columns

['ID',
 'LIMIT_BAL',
 'SEX',
 'EDUCATION',
 'MARRIAGE',
 'AGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6',
 'label']

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
feature_list= ['LIMIT_BAL','SEX', 'EDUCATION','MARRIAGE', 'EDUCATION', 'PAY_0', 'PAY_2', 'PAY_3','BILL_AMT1','PAY_AMT1']
assembler = assembler = VectorAssembler(
    inputCols=feature_list,
    outputCol='features')

lr = LogisticRegression(maxIter=10, regParam=0.001, elasticNetParam=1.)
pipeline = Pipeline(stages=[assembler , lr ])


# Fitting the model

In [14]:
_model = pipeline.fit(train)

# Evaluation of the model

In [15]:
predictions = _model.transform(test)
predictions.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- LIMIT_BAL: double (nullable = true)
 |-- SEX: integer (nullable = true)
 |-- EDUCATION: integer (nullable = true)
 |-- MARRIAGE: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- PAY_0: integer (nullable = true)
 |-- PAY_2: integer (nullable = true)
 |-- PAY_3: integer (nullable = true)
 |-- PAY_4: integer (nullable = true)
 |-- PAY_5: integer (nullable = true)
 |-- PAY_6: integer (nullable = true)
 |-- BILL_AMT1: double (nullable = true)
 |-- BILL_AMT2: double (nullable = true)
 |-- BILL_AMT3: double (nullable = true)
 |-- BILL_AMT4: double (nullable = true)
 |-- BILL_AMT5: double (nullable = true)
 |-- BILL_AMT6: double (nullable = true)
 |-- PAY_AMT1: double (nullable = true)
 |-- PAY_AMT2: double (nullable = true)
 |-- PAY_AMT3: double (nullable = true)
 |-- PAY_AMT4: double (nullable = true)
 |-- PAY_AMT5: double (nullable = true)
 |-- PAY_AMT6: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- feature

In [16]:
predictions[['label','sms','prediction','probability','prediction']].filter('label==1').show()

AnalysisException: cannot resolve '`sms`' given input columns: [AGE, BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, BILL_AMT6, EDUCATION, ID, LIMIT_BAL, MARRIAGE, PAY_0, PAY_2, PAY_3, PAY_4, PAY_5, PAY_6, PAY_AMT1, PAY_AMT2, PAY_AMT3, PAY_AMT4, PAY_AMT5, PAY_AMT6, SEX, features, label, prediction, probability, rawPrediction];;
'Project [label#158, 'sms, prediction#1167, probability#1136, prediction#1167]
+- Project [ID#108, LIMIT_BAL#109, SEX#110, EDUCATION#111, MARRIAGE#112, AGE#113, PAY_0#114, PAY_2#115, PAY_3#116, PAY_4#117, PAY_5#118, PAY_6#119, BILL_AMT1#120, BILL_AMT2#121, BILL_AMT3#122, BILL_AMT4#123, BILL_AMT5#124, BILL_AMT6#125, PAY_AMT1#126, PAY_AMT2#127, PAY_AMT3#128, PAY_AMT4#129, PAY_AMT5#130, PAY_AMT6#131, ... 5 more fields]
   +- Project [ID#108, LIMIT_BAL#109, SEX#110, EDUCATION#111, MARRIAGE#112, AGE#113, PAY_0#114, PAY_2#115, PAY_3#116, PAY_4#117, PAY_5#118, PAY_6#119, BILL_AMT1#120, BILL_AMT2#121, BILL_AMT3#122, BILL_AMT4#123, BILL_AMT5#124, BILL_AMT6#125, PAY_AMT1#126, PAY_AMT2#127, PAY_AMT3#128, PAY_AMT4#129, PAY_AMT5#130, PAY_AMT6#131, ... 4 more fields]
      +- Project [ID#108, LIMIT_BAL#109, SEX#110, EDUCATION#111, MARRIAGE#112, AGE#113, PAY_0#114, PAY_2#115, PAY_3#116, PAY_4#117, PAY_5#118, PAY_6#119, BILL_AMT1#120, BILL_AMT2#121, BILL_AMT3#122, BILL_AMT4#123, BILL_AMT5#124, BILL_AMT6#125, PAY_AMT1#126, PAY_AMT2#127, PAY_AMT3#128, PAY_AMT4#129, PAY_AMT5#130, PAY_AMT6#131, ... 3 more fields]
         +- Project [ID#108, LIMIT_BAL#109, SEX#110, EDUCATION#111, MARRIAGE#112, AGE#113, PAY_0#114, PAY_2#115, PAY_3#116, PAY_4#117, PAY_5#118, PAY_6#119, BILL_AMT1#120, BILL_AMT2#121, BILL_AMT3#122, BILL_AMT4#123, BILL_AMT5#124, BILL_AMT6#125, PAY_AMT1#126, PAY_AMT2#127, PAY_AMT3#128, PAY_AMT4#129, PAY_AMT5#130, PAY_AMT6#131, ... 2 more fields]
            +- Sample 0.8, 1.0, false, 7573999638266801506
               +- Sort [ID#108 ASC NULLS FIRST, LIMIT_BAL#109 ASC NULLS FIRST, SEX#110 ASC NULLS FIRST, EDUCATION#111 ASC NULLS FIRST, MARRIAGE#112 ASC NULLS FIRST, AGE#113 ASC NULLS FIRST, PAY_0#114 ASC NULLS FIRST, PAY_2#115 ASC NULLS FIRST, PAY_3#116 ASC NULLS FIRST, PAY_4#117 ASC NULLS FIRST, PAY_5#118 ASC NULLS FIRST, PAY_6#119 ASC NULLS FIRST, BILL_AMT1#120 ASC NULLS FIRST, BILL_AMT2#121 ASC NULLS FIRST, BILL_AMT3#122 ASC NULLS FIRST, BILL_AMT4#123 ASC NULLS FIRST, BILL_AMT5#124 ASC NULLS FIRST, BILL_AMT6#125 ASC NULLS FIRST, PAY_AMT1#126 ASC NULLS FIRST, PAY_AMT2#127 ASC NULLS FIRST, PAY_AMT3#128 ASC NULLS FIRST, PAY_AMT4#129 ASC NULLS FIRST, PAY_AMT5#130 ASC NULLS FIRST, PAY_AMT6#131 ASC NULLS FIRST, label#158 ASC NULLS FIRST], false
                  +- Project [ID#108, LIMIT_BAL#109, SEX#110, EDUCATION#111, MARRIAGE#112, AGE#113, PAY_0#114, PAY_2#115, PAY_3#116, PAY_4#117, PAY_5#118, PAY_6#119, BILL_AMT1#120, BILL_AMT2#121, BILL_AMT3#122, BILL_AMT4#123, BILL_AMT5#124, BILL_AMT6#125, PAY_AMT1#126, PAY_AMT2#127, PAY_AMT3#128, PAY_AMT4#129, PAY_AMT5#130, PAY_AMT6#131, default_payment_next_month#132 AS label#158]
                     +- Relation[ID#108,LIMIT_BAL#109,SEX#110,EDUCATION#111,MARRIAGE#112,AGE#113,PAY_0#114,PAY_2#115,PAY_3#116,PAY_4#117,PAY_5#118,PAY_6#119,BILL_AMT1#120,BILL_AMT2#121,BILL_AMT3#122,BILL_AMT4#123,BILL_AMT5#124,BILL_AMT6#125,PAY_AMT1#126,PAY_AMT2#127,PAY_AMT3#128,PAY_AMT4#129,PAY_AMT5#130,PAY_AMT6#131,default_payment_next_month#132] csv


In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator().setLabelCol('label').setRawPredictionCol('prediction').setMetricName('areaUnderROC')
AUC = evaluator.evaluate(predictions)
print(AUC)

0.596319563710868


test lecture

In [37]:
rddFromFile = spark.read.csv("data/output/transactions_3_1599566693.csv").rdd
dfFromRDD1 = spark.createDataFrame(rddFromFile)
dfFromRDD1.printSchema()

Py4JJavaError: An error occurred while calling o4205.csv.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.base/java.lang.Thread.run(Thread.java:832)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:111)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1471)
	at org.apache.spark.sql.execution.datasources.text.TextFileFormat.buildReader(TextFileFormat.scala:106)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:130)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:121)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:170)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:398)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:389)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:133)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:316)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:434)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:420)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3625)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2695)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2902)
	at org.apache.spark.sql.execution.datasources.csv.TextInputCSVDataSource$.infer(CSVDataSource.scala:114)
	at org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:67)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:62)
	at org.apache.spark.sql.execution.datasources.DataSource.$anonfun$getOrInferFileFormatSchema$11(DataSource.scala:193)
	at scala.Option.orElse(Option.scala:447)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:190)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:401)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:279)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:268)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:268)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:705)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:564)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)


# Streaming data

The stream will be produced by ```generate_transactions.py```

In [30]:
def process(time,rdd):
    print("=============================")
    try:
        dfFromRDD1 = spark.createDataFrame(rdd)
        dfFromRDD1.printSchema()
    except Exception as e:
        print(e)

In [27]:
ssc.stop(True, True)



In [32]:
from pyspark.streaming import StreamingContext
# sparkContext , delay
ssc = StreamingContext(sc , 1)
stream = ssc.textFileStream('data/output/')
stream.foreachRDD(process)
ssc.start () # prend la main dans jupyter ...
# ssc.stop(True, True)

Py4JJavaError: An error occurred while calling None.org.apache.spark.streaming.api.java.JavaStreamingContext.
: java.lang.NullPointerException
	at org.apache.spark.streaming.api.java.JavaStreamingContext.<init>(JavaStreamingContext.scala:130)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:832)
