In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
spark=SparkSession.builder.getOrCreate()

In [2]:
df2=spark.read.csv('C:/Manidhar/MachineLearningLab/datasets/titanic/train.csv',header=True)
df3=df2.select('Pclass','SibSp','Survived','Fare')
df3=df2.select(df3.Pclass.cast('double'),df3.SibSp.cast('double'),df3.Survived.cast('double'),df3.Fare.cast('double'))

In [3]:
df3=VectorAssembler(inputCols=['Pclass','SibSp','Fare'], outputCol="Features").transform(df3)

In [4]:
df3.printSchema()

root
 |-- Pclass: double (nullable = true)
 |-- SibSp: double (nullable = true)
 |-- Survived: double (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Features: vector (nullable = true)



In [5]:
df3.show()

+------+-----+--------+-------+-----------------+
|Pclass|SibSp|Survived|   Fare|         Features|
+------+-----+--------+-------+-----------------+
|   3.0|  1.0|     0.0|   7.25|   [3.0,1.0,7.25]|
|   1.0|  1.0|     1.0|71.2833|[1.0,1.0,71.2833]|
|   3.0|  0.0|     1.0|  7.925|  [3.0,0.0,7.925]|
|   1.0|  1.0|     1.0|   53.1|   [1.0,1.0,53.1]|
|   3.0|  0.0|     0.0|   8.05|   [3.0,0.0,8.05]|
|   3.0|  0.0|     0.0| 8.4583| [3.0,0.0,8.4583]|
|   1.0|  0.0|     0.0|51.8625|[1.0,0.0,51.8625]|
|   3.0|  3.0|     0.0| 21.075| [3.0,3.0,21.075]|
|   3.0|  0.0|     1.0|11.1333|[3.0,0.0,11.1333]|
|   2.0|  1.0|     1.0|30.0708|[2.0,1.0,30.0708]|
|   3.0|  1.0|     1.0|   16.7|   [3.0,1.0,16.7]|
|   1.0|  0.0|     1.0|  26.55|  [1.0,0.0,26.55]|
|   3.0|  0.0|     0.0|   8.05|   [3.0,0.0,8.05]|
|   3.0|  1.0|     0.0| 31.275| [3.0,1.0,31.275]|
|   3.0|  0.0|     0.0| 7.8542| [3.0,0.0,7.8542]|
|   2.0|  0.0|     1.0|   16.0|   [2.0,0.0,16.0]|
|   3.0|  4.0|     0.0| 29.125| [3.0,4.0,29.125]|


In [6]:
# dt1 is called as estimator or classifier or regresser
dt1=DecisionTreeClassifier(featuresCol='Features',labelCol='Survived',maxDepth=10,impurity='entropy')

In [7]:
model1=dt1.fit(df3)

In [8]:
model1.depth

10

In [9]:
print(model1.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_40e0b534f8c77dde1441) of depth 10 with 165 nodes
  If (feature 0 <= 2.5)
   If (feature 2 <= 15.525)
    If (feature 2 <= 7.0104)
     Predict: 0.0
    Else (feature 2 > 7.0104)
     If (feature 1 <= 1.5)
      If (feature 1 <= 0.5)
       If (feature 2 <= 14.45625)
        If (feature 2 <= 13.20835)
         If (feature 2 <= 10.50835)
          Predict: 0.0
         Else (feature 2 > 10.50835)
          If (feature 2 <= 12.9375)
           Predict: 0.0
          Else (feature 2 > 12.9375)
           Predict: 0.0
        Else (feature 2 > 13.20835)
         Predict: 0.0
       Else (feature 2 > 14.45625)
        Predict: 0.0
      Else (feature 1 > 0.5)
       If (feature 2 <= 12.9375)
        Predict: 0.0
       Else (feature 2 > 12.9375)
        Predict: 1.0
     Else (feature 1 > 1.5)
      Predict: 0.0
   Else (feature 2 > 15.525)
    If (feature 2 <= 55.22085)
     If (feature 1 <= 1.5)
      If (feature 2 <= 18.375)
    

In [10]:
model1.featureImportances

SparseVector(3, {0: 0.3178, 1: 0.1241, 2: 0.558})

In [11]:
model1.numClasses

2

In [12]:
model1.numFeatures

3

In [19]:
df2.show(truncate=False,vertical=True)

-RECORD 0--------------------------------------------------------------
 PassengerId | 1                                                       
 Survived    | 0                                                       
 Pclass      | 3                                                       
 Name        | Braund, Mr. Owen Harris                                 
 Sex         | male                                                    
 Age         | 22                                                      
 SibSp       | 1                                                       
 Parch       | 0                                                       
 Ticket      | A/5 21171                                               
 Fare        | 7.25                                                    
 Cabin       | null                                                    
 Embarked    | S                                                       
-RECORD 1-------------------------------------------------------

In [20]:
df2.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|  35|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|  35|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [23]:
df2.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [28]:
df2.describe()

DataFrame[summary: string, PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]