# Setup Environment

In [1]:
# install Java8
!apt-get -q install openjdk-8-jdk-headless -qq > /dev/null

# download spark3.1.1
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

# unzip it
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

# install findspark 
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
#spark = SparkSession.builder.appName('lr').getOrCreate()

# Download and Read Dataset
The dataset has two columns: `label` and `features` (a sparse feature vector).

In [2]:
!wget -q https://raw.githubusercontent.com/muhammetsnts/SPARK/main/data/sample_libsvm_data.txt

In [4]:
# Configure a reader for the LIBSVM format, then load the downloaded file with it.
libsvm_reader = spark.read.format("libsvm")
data = libsvm_reader.load("sample_libsvm_data.txt")

In [5]:
# Preview the first 20 rows with long cell values truncated (the defaults, spelled out).
data.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



# Logistic Regression Model

In [3]:
from pyspark.ml.classification import LogisticRegression

In [6]:
# Build the estimator with the features, label and prediction columns spelled out.
# These are the library defaults, so this matches a bare LogisticRegression() call.
logReg = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)

In [8]:
# Fit the estimator on the full dataset to obtain the trained model.
logRegModel = logReg.fit(dataset=data)

In [9]:
# Summary object exposed by the fitted model; its `predictions` DataFrame is inspected in the next cells.
results_summary = logRegModel.summary

In [11]:
# Inspect the column layout of the predictions DataFrame attached to the summary.
summary_predictions = results_summary.predictions
summary_predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [13]:
# Eyeball whether the `label` column agrees with the `prediction` column.
summary_predictions = results_summary.predictions
summary_predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[19.8534775947478...|[0.99999999761359...|       0.0|
|  1.0|(692,[158,159,160...|[-20.377398194908...|[1.41321555111048...|       1.0|
|  1.0|(692,[124,125,126...|[-27.401459284891...|[1.25804865126974...|       1.0|
|  1.0|(692,[152,153,154...|[-18.862741612668...|[6.42710509170280...|       1.0|
|  1.0|(692,[151,152,153...|[-20.483011833009...|[1.27157209200599...|       1.0|
|  0.0|(692,[129,130,131...|[19.8506078990277...|[0.99999999760673...|       0.0|
|  1.0|(692,[158,159,160...|[-20.337256674833...|[1.47109814695573...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.595579753418...|[3.08850168102630...|       1.0|
|  0.0|(692,[154,155,156...|[19.2708803215613...|[0.99999999572670...|       0.0|
|  0.0|(692,[127

# Train-Test Split
Let's split the data into a training set and a test set.

In [14]:
# Hold out roughly 30% of the rows for testing. A fixed seed makes the split repeatable,
# so the rows shown and the metrics computed below do not change from one run to the next.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# A fresh estimator with the default column names written out, trained only on the
# training split. Note: `finalModel` is the unfitted estimator; `fit_final` is the fitted model.
finalModel = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)
fit_final = finalModel.fit(dataset=train_data)

# Evaluate Model
Let's get predictions on the test set and evaluate the model.

In [16]:
# Run the fitted model against the held-out test split.
predictions_and_labels = fit_final.evaluate(dataset=test_data)

In [18]:
# Display the first 20 test-set rows alongside the model's predictions.
test_predictions = predictions_and_labels.predictions
test_predictions.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[100,101,102...|[9.12104304127178...|[0.99989067138294...|       0.0|
|  0.0|(692,[122,123,124...|[20.1934938631452...|[0.99999999830145...|       0.0|
|  0.0|(692,[123,124,125...|[30.2458382075239...|[0.99999999999992...|       0.0|
|  0.0|(692,[124,125,126...|[30.2193846636888...|[0.99999999999992...|       0.0|
|  0.0|(692,[124,125,126...|[22.0993951736449...|[0.99999999974744...|       0.0|
|  0.0|(692,[126,127,128...|[17.1686859809770...|[0.99999996502687...|       0.0|
|  0.0|(692,[126,127,128...|[27.9774018779005...|[0.99999999999929...|       0.0|
|  0.0|(692,[126,127,128...|[20.0751654033355...|[0.99999999808809...|       0.0|
|  0.0|(692,[128,129,130...|[18.0797781412590...|[0.99999998593783...|       0.0|
|  0.0|(692,[129

## Evaluators

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [20]:
# Evaluator with its default settings written out explicitly: it reads the
# `rawPrediction` and `label` columns and reports the area under the ROC curve.
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

In [22]:
# Score the test-set predictions with the evaluator and echo the metric as the cell output.
test_predictions = predictions_and_labels.predictions
my_final_roc = my_eval.evaluate(test_predictions)
my_final_roc

1.0

So, the area under the ROC curve is 1.0. This means the model was a perfect fit on the test set and predicted everything accurately.