In [136]:
!pip install pyspark



In [137]:
from pyspark.sql import SparkSession

In [135]:
# Obtain the Spark session for this notebook (an existing one is reused if
# present), registered under the application name 'mylogreg'.
spark = (
    SparkSession.builder
    .appName('mylogreg')
    .getOrCreate()
)

In [138]:
from pyspark.ml.classification import LogisticRegression

In [140]:
# Estimator with default hyperparameters; it does not depend on the data, so
# it is created up front.
lr = LogisticRegression()

# Read the LIBSVM-formatted sample file into a DataFrame.
training = spark.read.load("sample_libsvm_data.txt", format="libsvm")

# Train on the full dataset and keep a handle on the summary produced by the fit.
lrModel = lr.fit(training)
trainingSummary = lrModel.summary

In [141]:
# Preview the first 20 rows of the training-summary predictions, with long
# cell values truncated for display.
trainingSummary.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514862...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198867...|[6.76550380001560...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678715831...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012719...|[4.62137287298722...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874697...|[1.81823629113437...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504187...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212813...|[6.97903542824686...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503566...|[3.00582577441380...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606570...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [142]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [143]:
# Evaluate the fitted model on the training DataFrame; the returned summary is
# only displayed by this cell, not stored.
lrModel.evaluate(dataset=training)

<pyspark.ml.classification.BinaryLogisticRegressionSummary at 0x7ff1faa1a1a0>

In [144]:
# Keep the evaluation summary for the training DataFrame. Despite the name,
# this is a summary object (it exposes `.predictions`), not a table of
# predictions and labels.
predictionAndLabels = lrModel.evaluate(dataset=training)

In [145]:
# Preview the first 20 rows of the evaluation summary's predictions DataFrame,
# with long cell values truncated for display.
predictionAndLabels.predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514862...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198867...|[6.76550380001560...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678715831...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012719...|[4.62137287298722...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874697...|[1.81823629113437...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504187...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212813...|[6.97903542824686...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503566...|[3.00582577441380...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606570...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

In [146]:
# Build the (label, prediction) table directly from the model instead of from
# `predictionAndLabels` itself. The previous self-reassignment turned the name
# from a summary object into a DataFrame, so running the cell a second time
# failed (a DataFrame has no `.predictions`). This form is safe to re-run.
predictionAndLabels = (
    lrModel.evaluate(training)
    .predictions
    .select('label', 'prediction')
)

In [147]:
# Preview the first 20 (label, prediction) rows.
predictionAndLabels.show(n=20)

+-----+----------+
|label|prediction|
+-----+----------+
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



In [148]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [149]:
# Binary-classification evaluator that reads the 'prediction' column as its
# score input and 'label' as the ground truth.
# NOTE(review): the following cell rebinds `evaluator`, so this instance
# appears to be unused — confirm whether it is still needed.
evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='prediction',
)

In [150]:
# Multiclass evaluator: reports accuracy of the 'prediction' column against
# the 'label' column.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [151]:
# Accuracy over `predictionAndLabels`, which was derived from the training
# DataFrame — so this is a training-set score, not a held-out estimate.
acc = evaluator.evaluate(dataset=predictionAndLabels)

In [152]:
# Display the accuracy value assigned to `acc`.
acc

1.0