In [1]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one exists; otherwise start a new one
# with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Reading data from CSV files into DataFrames

In [2]:
# Load the Titanic training CSV, treating its first line as column names.
# No schema is supplied or inferred here, so every column is read as a string.
df2 = (
    spark.read
    .option('header', True)
    .csv('C:/Manidhar/MachineLearningLab/datasets/titanic/train.csv')
)
# Number of rows in the training data.
df2.count()

891

In [3]:
# Print the column names and types of the raw training data to check how the CSV was parsed.
df2.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
# Keep only the single feature column (Pclass) and the label column (Survived).
df3 = df2.select(df2['Pclass'], df2['Survived'])
# Preview the first five rows, then confirm the column types of the projection.
df3.show(5)
df3.printSchema()

+------+--------+
|Pclass|Survived|
+------+--------+
|     3|       0|
|     1|       1|
|     3|       1|
|     1|       1|
|     3|       0|
+------+--------+
only showing top 5 rows

root
 |-- Pclass: string (nullable = true)
 |-- Survived: string (nullable = true)



In [5]:
# Convert both columns to double before they are used as feature and label.
df3 = df3.select(
    *[df3[column_name].cast('double') for column_name in ('Pclass', 'Survived')]
)
# Verify that the casts took effect.
df3.printSchema()

root
 |-- Pclass: double (nullable = true)
 |-- Survived: double (nullable = true)



In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# Pack the Pclass column into a single vector column named 'Features', which is the
# column the classifier below is configured to read.
#
# df3 is reassigned from itself, so the input is first narrowed to the raw columns:
# without that, re-running this cell would fail because the 'Features' output column
# would already be present on df3.
df3 = VectorAssembler(
    inputCols=['Pclass'],
    outputCol='Features',
).transform(df3.select('Pclass', 'Survived'))
# Preview the first five assembled rows.
df3.show(n=5)

+------+--------+--------+
|Pclass|Survived|Features|
+------+--------+--------+
|   3.0|     0.0|   [3.0]|
|   1.0|     1.0|   [1.0]|
|   3.0|     1.0|   [3.0]|
|   1.0|     1.0|   [1.0]|
|   3.0|     0.0|   [3.0]|
+------+--------+--------+
only showing top 5 rows



In [8]:
from pyspark.ml.classification import DecisionTreeClassifier

In [9]:
# Configure (but do not yet train) a decision tree that reads the assembled
# 'Features' vector, predicts the 'Survived' label, uses entropy as the
# impurity measure, and is allowed to grow to at most depth 10.
dt1 = DecisionTreeClassifier(
    labelCol='Survived',
    featuresCol='Features',
    impurity='entropy',
    maxDepth=10,
)

In [10]:
# Train the configured decision tree on the assembled training DataFrame.
model12=dt1.fit(df3)

In [11]:
# Depth of the fitted tree (may be smaller than the maxDepth given to the classifier).
model12.depth

2

In [12]:
# Number of features the fitted model was trained on.
model12.numFeatures

1

In [13]:
# Print the learned decision rules; print() is used so the multi-line string renders
# with real line breaks instead of as an escaped repr.
print(model12.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4bb78b240756359818f6) of depth 2 with 5 nodes
  If (feature 0 <= 2.5)
   If (feature 0 <= 1.5)
    Predict: 1.0
   Else (feature 0 > 1.5)
    Predict: 0.0
  Else (feature 0 > 2.5)
   Predict: 0.0



In [14]:
# Load the Titanic test CSV, treating its first line as column names.
df4 = (
    spark.read
    .option('header', True)
    .csv('C:/Manidhar/MachineLearningLab/datasets/titanic/test.csv')
)
# Number of rows in the test data.
df4.count()

418

In [16]:
# Keep only the feature column used by the model; every other test column is dropped.
df5 = df4.select(df4['Pclass'])
# Confirm the column type of the projection.
df5.printSchema()

root
 |-- Pclass: string (nullable = true)



In [23]:
# Convert Pclass to double so the test feature has the same type as in training.
pclass_as_double = df5['Pclass'].cast('double')
df5 = df5.select(pclass_as_double)
# Verify that the cast took effect.
df5.printSchema()

root
 |-- Pclass: double (nullable = true)



In [24]:
# Pack the test Pclass column into a 'Features' vector column, mirroring the
# assembly applied to the training data.
#
# df5 is reassigned from itself, so the input is first narrowed to the raw column:
# without that, re-running this cell would fail because the 'Features' output column
# would already be present on df5.
df5 = VectorAssembler(
    inputCols=['Pclass'],
    outputCol='Features',
).transform(df5.select('Pclass'))
# Preview the first five assembled rows.
df5.show(n=5)

+------+--------+
|Pclass|Features|
+------+--------+
|   3.0|   [3.0]|
|   3.0|   [3.0]|
|   2.0|   [2.0]|
|   3.0|   [3.0]|
|   3.0|   [3.0]|
+------+--------+
only showing top 5 rows



In [26]:
# Score the test rows with the fitted tree; the result gains the
# rawPrediction, probability and prediction columns.
df6 = model12.transform(df5)
# Preview the first fifteen scored rows.
df6.show(15)

+------+--------+-------------+--------------------+----------+
|Pclass|Features|rawPrediction|         probability|prediction|
+------+--------+-------------+--------------------+----------+
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   2.0|   [2.0]|  [97.0,87.0]|[0.52717391304347...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   2.0|   [2.0]|  [97.0,87.0]|[0.52717391304347...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   3.0|   [3.0]|[372.0,119.0]|[0.75763747454175...|       0.0|
|   1.0|   [1.0]| [80.0,136.0]|[0.37037037037037...|       1.0|
|   1.0|   [1.0]| [80.0,136.0]|[0.370370

In [29]:
# Persist only the predicted labels as CSV.
#
# Spark's default save mode raises an error when the target path already exists, so a
# second run of this cell would fail; mode('overwrite') keeps the cell re-runnable.
# Spark writes a directory named 'test_predict1.csv' containing part file(s), not a
# lone CSV file; coalesce(1) reduces the data to one partition before writing.
# NOTE(review): the output has no header row and no passenger identifier column
# (only 'prediction' is selected) — confirm that is what the consumer expects.
df6.coalesce(1).select('prediction').write.mode('overwrite').csv('test_predict1.csv')