## Import the Libraries 

In [34]:
import pyspark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession, functions, types
from pyspark.ml.feature import StandardScaler
spark = SparkSession.builder.appName("Logistic Regression").getOrCreate()

## Download the Data

In [97]:
!wget https://raw.githubusercontent.com/mananparasher/Spark-Datasets/master/breast_cancer.csv

--2020-06-20 10:02:46--  https://raw.githubusercontent.com/mananparasher/Spark-Datasets/master/breast_cancer.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.124.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.124.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 119888 (117K) [text/plain]
Saving to: ‘breast_cancer.csv’


2020-06-20 10:02:47 (413 KB/s) - ‘breast_cancer.csv’ saved [119888/119888]



## Loading the Data in Spark DataFrame 

In [36]:
df = spark.read.format('csv').options(header='false',inferschema='true').load('breast_cancer.csv')
df.show(5)

+-----+-----+-----+------+-------+-------+------+-------+------+-------+------+------+-----+-----+--------+-------+-------+-------+-------+--------+-----+-----+-----+------+------+------+------+------+------+-------+----+
|  _c0|  _c1|  _c2|   _c3|    _c4|    _c5|   _c6|    _c7|   _c8|    _c9|  _c10|  _c11| _c12| _c13|    _c14|   _c15|   _c16|   _c17|   _c18|    _c19| _c20| _c21| _c22|  _c23|  _c24|  _c25|  _c26|  _c27|  _c28|   _c29|_c30|
+-----+-----+-----+------+-------+-------+------+-------+------+-------+------+------+-----+-----+--------+-------+-------+-------+-------+--------+-----+-----+-----+------+------+------+------+------+------+-------+----+
|17.99|10.38|122.8|1001.0| 0.1184| 0.2776|0.3001| 0.1471|0.2419|0.07871| 1.095|0.9053|8.589|153.4|0.006399|0.04904|0.05373|0.01587|0.03003|0.006193|25.38|17.33|184.6|2019.0|0.1622|0.6656|0.7119|0.2654|0.4601| 0.1189|   0|
|20.57|17.77|132.9|1326.0|0.08474|0.07864|0.0869|0.07017|0.1812|0.05667|0.5435|0.7339|3.398|74.08|0.005225|0.013

## Exploring the Data

In [21]:
df.describe

<bound method DataFrame.describe of DataFrame[_c0: double, _c1: double, _c2: double, _c3: double, _c4: double, _c5: double, _c6: double, _c7: double, _c8: double, _c9: double, _c10: double, _c11: double, _c12: double, _c13: double, _c14: double, _c15: double, _c16: double, _c17: double, _c18: double, _c19: double, _c20: double, _c21: double, _c22: double, _c23: double, _c24: double, _c25: double, _c26: double, _c27: double, _c28: double, _c29: double, _c30: int]>

## Processing the Data 

In [23]:
vectorAssembler = VectorAssembler(inputCols = ['_c0', '_c0', '_c0', '_c3', '_c4', '_c5', '_c6', '_c7', \
'_c8', '_c9', '_c10', '_c12', '_c13','_c14','_c14','_c16','_c17','_c18','_c19','_c20','_c21','_c22'\
,'_c23','_c24','_c25','_c26','_c27','_c28','_c29'], outputCol = 'features')

transformed_df = vectorAssembler.transform(df)
transformed_df.select(["features","_c30"]).show()

+--------------------+----+
|            features|_c30|
+--------------------+----+
|[17.99,17.99,17.9...|   0|
|[20.57,20.57,20.5...|   0|
|[19.69,19.69,19.6...|   0|
|[11.42,11.42,11.4...|   0|
|[20.29,20.29,20.2...|   0|
|[12.45,12.45,12.4...|   0|
|[18.25,18.25,18.2...|   0|
|[13.71,13.71,13.7...|   0|
|[13.0,13.0,13.0,5...|   0|
|[12.46,12.46,12.4...|   0|
|[16.02,16.02,16.0...|   0|
|[15.78,15.78,15.7...|   0|
|[19.17,19.17,19.1...|   0|
|[15.85,15.85,15.8...|   0|
|[13.73,13.73,13.7...|   0|
|[14.54,14.54,14.5...|   0|
|[14.68,14.68,14.6...|   0|
|[16.13,16.13,16.1...|   0|
|[19.81,19.81,19.8...|   0|
|[13.54,13.54,13.5...|   1|
+--------------------+----+
only showing top 20 rows



## Normalizing the Data 

In [30]:
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")
scaled_df=standardscaler.fit(transformed_df).transform(transformed_df)
scaled_df.select("Scaled_features","_c30").show(1)

+--------------------+----+
|     Scaled_features|_c30|
+--------------------+----+
|[5.10492359418783...|   0|
+--------------------+----+
only showing top 1 row



## Splitting the Data 

In [31]:
final_df=scaled_df.select(['Scaled_features','_c30'])
splitting_df = final_df.randomSplit([0.8, 0.2])
training_df = splitting_df[0]
testing_df = splitting_df[1]

## Implementing and Fitting the Model 

In [32]:
logisticregression = LogisticRegression\
(featuresCol = 'Scaled_features', labelCol='_c30', maxIter=20)

model = logisticregression.fit(training_df)
model

LogisticRegression_41b7a47e1e97d3dfcb26

## Prediction using the Trained Model 

In [33]:
prediction = model.transform(testing_df)
prediction.select("prediction","_c30","Scaled_features").show(10)

+----------+----+--------------------+
|prediction|_c30|     Scaled_features|
+----------+----+--------------------+
|       1.0|   1|[2.47612914301740...|
|       1.0|   1|[2.53969239399561...|
|       1.0|   1|[2.64837420258783...|
|       1.0|   1|[2.66653513143875...|
|       1.0|   1|[2.71477509869900...|
|       1.0|   1|[2.75819106923322...|
|       1.0|   1|[2.76329883047254...|
|       1.0|   1|[2.80245833330734...|
|       1.0|   1|[2.88588510021624...|
|       1.0|   1|[2.99939090553448...|
+----------+----+--------------------+
only showing top 10 rows

