## Decission Tree

In [4]:
## using pandas to check the data
import pandas as pd
import numpy as np


In [6]:
pd_data = pd.read_csv('cancer.csv')
pd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   id                 683 non-null    int64
 1   Clump Thickness    683 non-null    int64
 2   UofCSize           683 non-null    int64
 3   UofCShape          683 non-null    int64
 4   Marginal Adhesion  683 non-null    int64
 5   SECSize            683 non-null    int64
 6   Bare Nuclei        683 non-null    int64
 7   Bland Chromatin    683 non-null    int64
 8   Normal Nucleoli    683 non-null    int64
 9   Mitoses            683 non-null    int64
 10  Class              683 non-null    int64
dtypes: int64(11)
memory usage: 58.8 KB


In [7]:
## no missign values in data 

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Decission Tree').getOrCreate()

22/11/19 22:18:15 WARN Utils: Your hostname, Pardeeps-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.48 instead (on interface en0)
22/11/19 22:18:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/19 22:18:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
## reading data

data = spark.read.csv('cancer.csv', header= True, inferSchema= True)
data.columns

['id',
 'Clump Thickness',
 'UofCSize',
 'UofCShape',
 'Marginal Adhesion',
 'SECSize',
 'Bare Nuclei',
 'Bland Chromatin',
 'Normal Nucleoli',
 'Mitoses',
 'Class']

In [8]:
## encoding class as category

from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCol='Class', outputCol='ClassEncoded')




In [11]:
## using assembler to create features

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
 'Clump Thickness',
 'UofCSize',
 'UofCShape',
 'Marginal Adhesion',
 'SECSize',
 'Bare Nuclei',
 'Bland Chromatin',
 'Normal Nucleoli',
 'Mitoses'], outputCol= 'features')

In [19]:
## creating instance of decision tree

from pyspark.ml.classification import DecisionTreeClassifier

DT = DecisionTreeClassifier(labelCol='Class', featuresCol='features')

In [20]:
## creating pipelines

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[encoder, assembler, DT])

In [23]:
### splitting data for training and testing

train, test = data.randomSplit([0.7,0.3])

In [24]:
fit_model = pipeline.fit(train)

In [25]:
results = fit_model.transform(test)

In [27]:
results.columns

['id',
 'Clump Thickness',
 'UofCSize',
 'UofCShape',
 'Marginal Adhesion',
 'SECSize',
 'Bare Nuclei',
 'Bland Chromatin',
 'Normal Nucleoli',
 'Mitoses',
 'Class',
 'ClassEncoded',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [28]:
results.select(['Class', 'prediction']).show()

+-----+----------+
|Class|prediction|
+-----+----------+
|    2|       2.0|
|    4|       4.0|
|    2|       2.0|
|    4|       4.0|
|    4|       4.0|
|    4|       4.0|
|    4|       4.0|
|    2|       2.0|
|    2|       2.0|
|    2|       2.0|
|    2|       2.0|
|    2|       2.0|
|    4|       4.0|
|    4|       4.0|
|    2|       2.0|
|    4|       2.0|
|    2|       2.0|
|    2|       2.0|
|    4|       4.0|
|    2|       2.0|
+-----+----------+
only showing top 20 rows



In [30]:
## evaluating model

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Class', predictionCol='prediction', metricName='accuracy')

accuracy = evaluator.evaluate(results)
print(f"Accuracy is {accuracy}")

Accuracy is 0.9601990049751243
