In [3]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Imputer

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = SparkSession.builder.appName("SparkML Train Test").getOrCreate()

# Read the CSV with header parsing disabled, so Spark assigns positional
# column names (_c0, _c1, ...); column types are inferred from the data.
pointsDF = spark.read.csv(
    "datasets/points-places.txt",
    header=False,
    inferSchema=True,
)

# Swap the positional names for meaningful ones, one column at a time.
for positional, meaningful in [('_c0', 'city'), ('_c1', 'x'), ('_c2', 'y'), ('_c3', 'label')]:
    pointsDF = pointsDF.withColumnRenamed(positional, meaningful)

pointsDF.show()

+--------+---+---+-----+
|    city|  x|  y|label|
+--------+---+---+-----+
|  Ankara|  5|  5|    0|
|Istanbul|105|  1|    1|
|Istanbul|105|106|    1|
|  Ankara|  5|  6|    0|
|   Izmir|  6|  6|    0|
|  Ankara|106|106|    1|
|Istanbul|  7|  7|    0|
|   Izmir|  7|  6|    0|
|Istanbul|107|107|    1|
|Istanbul|  6|  7|    0|
|   Izmir|  7|  8|    0|
|  Ankara|108|108|    1|
|Istanbul|  8|  6|    0|
|   Izmir|105|108|    1|
|Istanbul|  5|  8|    0|
|  Ankara|107|106|    1|
|   Izmir|  8|  8|    0|
|Istanbul|106|107|    1|
|  Ankara|107|108|    1|
|Istanbul|108|106|    1|
+--------+---+---+-----+



In [4]:
# Map each distinct city string to a numeric index in a new 'city_Indexed' column.
indx = (
    StringIndexer()
    .setInputCol('city')
    .setOutputCol('city_Indexed')
)

# Fitting learns the label list; a label's position in the list is its index.
indexModel = indx.fit(pointsDF)
print(indexModel.labels)

# Append the index column and display the result.
pointsDF = indexModel.transform(pointsDF)
pointsDF.show()

['Istanbul', 'Ankara', 'Izmir']
+--------+---+---+-----+------------+
|    city|  x|  y|label|city_Indexed|
+--------+---+---+-----+------------+
|  Ankara|  5|  5|    0|         1.0|
|Istanbul|105|  1|    1|         0.0|
|Istanbul|105|106|    1|         0.0|
|  Ankara|  5|  6|    0|         1.0|
|   Izmir|  6|  6|    0|         2.0|
|  Ankara|106|106|    1|         1.0|
|Istanbul|  7|  7|    0|         0.0|
|   Izmir|  7|  6|    0|         2.0|
|Istanbul|107|107|    1|         0.0|
|Istanbul|  6|  7|    0|         0.0|
|   Izmir|  7|  8|    0|         2.0|
|  Ankara|108|108|    1|         1.0|
|Istanbul|  8|  6|    0|         0.0|
|   Izmir|105|108|    1|         2.0|
|Istanbul|  5|  8|    0|         0.0|
|  Ankara|107|106|    1|         1.0|
|   Izmir|  8|  8|    0|         2.0|
|Istanbul|106|107|    1|         0.0|
|  Ankara|107|108|    1|         1.0|
|Istanbul|108|106|    1|         0.0|
+--------+---+---+-----+------------+



In [5]:
# One-hot encode the numeric city index into a sparse vector column
# 'city_Encoded'.
encoder = OneHotEncoder(inputCol='city_Indexed', outputCol='city_Encoded')

# Version compatibility: on Spark 3.x OneHotEncoder is an Estimator and must be
# fitted before it can transform (calling .transform on it raises
# AttributeError); on Spark 2.x it is a plain Transformer with no fit().
# Fit when possible, otherwise use the encoder itself as the transformer.
encoderModel = encoder.fit(pointsDF) if hasattr(encoder, 'fit') else encoder

pointsDF = encoderModel.transform(pointsDF)
pointsDF.show()



+--------+---+---+-----+------------+-------------+
|    city|  x|  y|label|city_Indexed| city_Encoded|
+--------+---+---+-----+------------+-------------+
|  Ankara|  5|  5|    0|         1.0|(2,[1],[1.0])|
|Istanbul|105|  1|    1|         0.0|(2,[0],[1.0])|
|Istanbul|105|106|    1|         0.0|(2,[0],[1.0])|
|  Ankara|  5|  6|    0|         1.0|(2,[1],[1.0])|
|   Izmir|  6|  6|    0|         2.0|    (2,[],[])|
|  Ankara|106|106|    1|         1.0|(2,[1],[1.0])|
|Istanbul|  7|  7|    0|         0.0|(2,[0],[1.0])|
|   Izmir|  7|  6|    0|         2.0|    (2,[],[])|
|Istanbul|107|107|    1|         0.0|(2,[0],[1.0])|
|Istanbul|  6|  7|    0|         0.0|(2,[0],[1.0])|
|   Izmir|  7|  8|    0|         2.0|    (2,[],[])|
|  Ankara|108|108|    1|         1.0|(2,[1],[1.0])|
|Istanbul|  8|  6|    0|         0.0|(2,[0],[1.0])|
|   Izmir|105|108|    1|         2.0|    (2,[],[])|
|Istanbul|  5|  8|    0|         0.0|(2,[0],[1.0])|
|  Ankara|107|106|    1|         1.0|(2,[1],[1.0])|
|   Izmir|  

In [6]:
# Pack the x/y coordinates and the one-hot city vector into a single
# 'features' vector column, then keep only the columns the classifier reads.
vec = VectorAssembler(inputCols=['x', 'y', 'city_Encoded'], outputCol='features')
pointsDF = vec.transform(pointsDF).select('features', 'label')
pointsDF.show()

# Fit a decision tree on the assembled frame ('features' and 'label' are the
# classifier's default column names, spelled out here for readability).
# NOTE: the model is fitted on the whole frame; no hold-out split is made in this cell.
classifier = DecisionTreeClassifier(featuresCol='features', labelCol='label')
model = classifier.fit(pointsDF)

# Dump the learned split rules as text.
print(model.toDebugString)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|   [5.0,5.0,0.0,1.0]|    0|
| [105.0,1.0,1.0,0.0]|    1|
|[105.0,106.0,1.0,...|    1|
|   [5.0,6.0,0.0,1.0]|    0|
|   [6.0,6.0,0.0,0.0]|    0|
|[106.0,106.0,0.0,...|    1|
|   [7.0,7.0,1.0,0.0]|    0|
|   [7.0,6.0,0.0,0.0]|    0|
|[107.0,107.0,1.0,...|    1|
|   [6.0,7.0,1.0,0.0]|    0|
|   [7.0,8.0,0.0,0.0]|    0|
|[108.0,108.0,0.0,...|    1|
|   [8.0,6.0,1.0,0.0]|    0|
|[105.0,108.0,0.0,...|    1|
|   [5.0,8.0,1.0,0.0]|    0|
|[107.0,106.0,0.0,...|    1|
|   [8.0,8.0,0.0,0.0]|    0|
|[106.0,107.0,1.0,...|    1|
|[107.0,108.0,0.0,...|    1|
|[108.0,106.0,1.0,...|    1|
+--------------------+-----+

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_14c82ba2f971) of depth 1 with 3 nodes
  If (feature 0 <= 56.5)
   Predict: 0.0
  Else (feature 0 > 56.5)
   Predict: 1.0

