In [1]:
%%bash
# Headless OpenJDK 8; -qq keeps apt quiet and stdout is discarded.
apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Fetch the Spark 3.0.0 / Hadoop 3.2 binary distribution from the Apache
# archive and unpack it into the current working directory.
wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
tar xf spark-3.0.0-bin-hadoop3.2.tgz

# Pin findspark to the release this notebook was originally run with, so a
# fresh run does not silently pick up a newer version.
pip install findspark==1.4.2

Collecting findspark
  Downloading https://files.pythonhosted.org/packages/fc/2d/2e39f9a023479ea798eed4351cd66f163ce61e00c717e03c37109f00c0f2/findspark-1.4.2-py2.py3-none-any.whl
Installing collected packages: findspark
Successfully installed findspark-1.4.2


In [2]:
import os

# Export the JDK and Spark install locations for the rest of the session.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.0-bin-hadoop3.2",
})

In [3]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the dataset is later read from
# /content/gdrive/MyDrive/... so this must succeed before loading data.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import findspark

# findspark.init() has to run before the pyspark import below: it is what
# makes the unpacked Spark distribution importable from this kernel.
findspark.init()

from pyspark.sql import SparkSession

# "local[*]" runs Spark inside this machine using all available cores.
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()

In [7]:
# Location of the liver-patient CSV on the mounted Drive.
ilpd_csv_path = '/content/gdrive/MyDrive/DataFolder/Indian_Liver_Patient_Dataset_ILPDh.csv'

# The file is semicolon-delimited with a header row; column types are
# inferred by Spark rather than declared.
df = spark.read.csv(
    ilpd_csv_path,
    sep=';',
    header=True,
    inferSchema=True,
)

In [8]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Total_Bilirubin: double (nullable = true)
 |-- Direct_Bilirubin: double (nullable = true)
 |-- Alkaline_Phosphotase: integer (nullable = true)
 |-- Alamine_Aminotransferase: integer (nullable = true)
 |-- Aspartate_Aminotransferase: integer (nullable = true)
 |-- Total_Protein: double (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Albumin_and_Globulin_Ratio: double (nullable = true)
 |-- Class_ID: integer (nullable = true)



In [9]:
# Number of rows per Class_ID value, to see how balanced the label is.
df.groupBy('Class_ID').count().show()

+--------+-----+
|Class_ID|count|
+--------+-----+
|       1|  414|
|       2|  165|
+--------+-----+



In [13]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single 'features' vector. The remaining numeric
# columns of df are not part of the model input.
feature_columns = [
    'Age',
    'Gender',
    'Total_Bilirubin',
    'Direct_Bilirubin',
    'Total_Protein',
    'Albumin',
]

df_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# transform() keeps every original column and appends 'features'.
df1 = df_assembler.transform(df)
df1.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: integer (nullable = true)
 |-- Total_Bilirubin: double (nullable = true)
 |-- Direct_Bilirubin: double (nullable = true)
 |-- Alkaline_Phosphotase: integer (nullable = true)
 |-- Alamine_Aminotransferase: integer (nullable = true)
 |-- Aspartate_Aminotransferase: integer (nullable = true)
 |-- Total_Protein: double (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Albumin_and_Globulin_Ratio: double (nullable = true)
 |-- Class_ID: integer (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
# 75/25 train/test split. The explicit seed makes the split repeatable, so
# the class counts below and every downstream metric can be reproduced on a
# fresh kernel instead of changing on each run.
df_train, df_test = df1.randomSplit([0.75, 0.25], seed=42)

# Label distribution in each split.
print('Training set info:')
df_train.groupBy('Class_ID').count().show()
print('Test set info:')
df_test.groupBy('Class_ID').count().show()


Training set info:
+--------+-----+
|Class_ID|count|
+--------+-----+
|       1|  314|
|       2|  123|
+--------+-----+

Test set info:
+--------+-----+
|Class_ID|count|
+--------+-----+
|       1|  100|
|       2|   42|
+--------+-----+



In [15]:
from pyspark.ml.classification import RandomForestClassifier

# Train a 15-tree random forest on the 'features' vector with Class_ID as the
# label. The seed fixes the forest's internal randomness so that retraining on
# the same data gives the same model.
# Note: .fit() returns the trained model, so rfClassifier holds the fitted
# model rather than the unfitted estimator.
# NOTE(review): Class_ID takes the values 1 and 2, while Spark ML classifiers
# document labels as 0-based class indices -- confirm whether the label should
# be re-indexed (e.g. with StringIndexer) before training.
rfClassifier = RandomForestClassifier(labelCol='Class_ID',
                                      featuresCol='features',
                                      numTrees=15,
                                      seed=42).fit(df_train)

In [17]:
# Score the held-out split with the fitted forest; the result carries the
# original columns plus rawPrediction, probability and prediction.
rf_predictions=rfClassifier.transform(df_test)
# Print a sample of the scored rows for a quick visual check.
rf_predictions.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+-------------+-------+--------------------------+--------+--------------------+--------------------+--------------------+----------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protein|Albumin|Albumin_and_Globulin_Ratio|Class_ID|            features|       rawPrediction|         probability|prediction|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+-------------+-------+--------------------------+--------+--------------------+--------------------+--------------------+----------+
| 10|     1|            0.8|             0.1|                 395|                      25|                        75|          7.6|    3.6|                       0.9|       1|[10.0,1.0,0.8,0.1...|[0.0,9.6996288836...|[0.0,0.6466419255...|       1

In [18]:
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# Accuracy = fraction of test rows whose prediction equals Class_ID.
# The label is imbalanced (414 vs 165 rows in the full data), so read this
# figure against the majority-class rate rather than against 50%.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol='Class_ID',
    metricName='accuracy',
)
rf_accuracy = accuracy_evaluator.evaluate(rf_predictions)
print('The accuracy of RF on test data is {0:.0%}'.format(rf_accuracy))

The accuracy of RF on test data is 66%
