In [1]:
import os
# Launcher arguments for PySpark: put both XGBoost4J jars on the JVM side.
# Set here, ahead of the SparkSession creation further down.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join((
    '--jars',
    'xgboost4j-spark-0.72.jar,xgboost4j-0.72.jar',
    'pyspark-shell',
))

In [2]:
import findspark
# Runs before the `pyspark` imports in the next cell.
# NOTE(review): findspark is expected to locate the local Spark installation
# and make `pyspark` importable - confirm against the findspark docs.
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [4]:
# Build (or reuse) a session on the "local[*]" master and register the two
# XGBoost4J jars on it.
spark = (
    SparkSession.builder
    .appName("PySpark XGBOOST")
    .master("local[*]")
    .config("spark.jars", "xgboost4j-spark-0.72.jar,xgboost4j-0.72.jar")
    .getOrCreate()
)

# Keep the driver log quiet: only ERROR and above.
spark.sparkContext.setLogLevel("ERROR")

# Ship the Python wrapper archive; this has to happen before the wrapper
# modules are imported in a later cell.
spark.sparkContext.addPyFile("sparkxgb.zip")

In [5]:
# Explicit schema for the CSV: numeric columns are read as doubles, free-text
# and categorical columns as strings. Field order is the column order used
# when the file is parsed.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("PassengerId", DoubleType),
        ("Survival", DoubleType),
        ("Pclass", DoubleType),
        ("Name", StringType),
        ("Sex", StringType),
        ("Age", DoubleType),
        ("SibSp", DoubleType),
        ("Parch", DoubleType),
        ("Ticket", StringType),
        ("Fare", DoubleType),
        ("Cabin", StringType),
        ("Embarked", StringType),
    ]
])

In [6]:
# Load the training CSV with the explicit schema; the header row is consumed
# rather than parsed as data.
df_raw = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("train.csv")
)

# Replace nulls with 0. With a numeric fill value only the numeric columns are
# affected; the string columns stay nullable (visible in the schema printed
# further down in this notebook).
df = df_raw.na.fill(0)

In [7]:
# One StringIndexer per categorical column. handleInvalid="keep" is set on
# each so that invalid values are kept rather than raising an error.
sexIndexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIndex",
    handleInvalid="keep",
)

cabinIndexer = StringIndexer(
    inputCol="Cabin",
    outputCol="CabinIndex",
    handleInvalid="keep",
)

embarkedIndexer = StringIndexer(
    inputCol="Embarked",
    outputCol="EmbarkedIndex",
    handleInvalid="keep",
)

In [8]:
# Pack the numeric columns and the three indexed categoricals into a single
# "features" vector column.
vectorAssembler = VectorAssembler(
    inputCols=[
        "Pclass",
        "SexIndex",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
        "CabinIndex",
        "EmbarkedIndex",
    ],
    outputCol="features",
)

In [9]:
# Imported here rather than in the top import cell.
# NOTE(review): `xgboost` and `pipeline` are presumably the wrapper modules
# shipped in sparkxgb.zip, which only becomes importable after the
# addPyFile() call in the session cell - confirm before moving these imports.
# XGBoostClassificationModel and XGBoostPipeline are not used in the cells
# visible here.
from xgboost import XGBoostEstimator, XGBoostClassificationModel
from pipeline import XGBoostPipeline


In [10]:
# Baseline estimator configuration. The cross-validation cells further down
# build their own estimator (`xg`) and do not reference this one.
xgboost = XGBoostEstimator(
    eta=0.1,
    max_depth=8,
    gamma=0.0,
    num_round=20,
    nworkers=3,
    featuresCol="features",
    labelCol="Survival",
    predictionCol="prediction",
)

# Scala-side reference parameters, kept as a bare string literal. It is never
# executed as code; as the cell's last expression its value is simply echoed
# in the cell output.
"""
def get_param(): mutable.HashMap[String, Any] = {
    val params = new mutable.HashMap[String, Any]()
        params += "eta" -> 0.1
        params += "max_depth" -> 8
        params += "gamma" -> 0.0
        params += "colsample_bylevel" -> 1
        params += "objective" -> "binary:logistic"
        params += "num_class" -> 2
        params += "booster" -> "gbtree"
        params += "num_rounds" -> 20
        params += "nWorkers" -> 3
    return params
}
"""

'\ndef get_param(): mutable.HashMap[String, Any] = {\n    val params = new mutable.HashMap[String, Any]()\n        params += "eta" -> 0.1\n        params += "max_depth" -> 8\n        params += "gamma" -> 0.0\n        params += "colsample_bylevel" -> 1\n        params += "objective" -> "binary:logistic"\n        params += "num_class" -> 2\n        params += "booster" -> "gbtree"\n        params += "num_rounds" -> 20\n        params += "nWorkers" -> 3\n    return params\n}\n'

In [11]:
# Score model output against the ground truth. The label is the "Survival"
# column from the input data and the scored column is the model's "prediction"
# output; the two were previously swapped, which evaluated the true label as
# if it were the prediction.
# NOTE(review): "prediction" is a hard 0/1 column, so the ROC metric is coarse.
# If the fitted model exposes a probability/score column, pointing
# rawPredictionCol at it would give a finer-grained metric - confirm the
# column name first.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Survival')

In [12]:
# Feature-preparation pipeline: index the three categorical columns, then
# assemble everything into the feature vector.
pipeline = Pipeline(stages=[
    sexIndexer,
    cabinIndexer,
    embarkedIndexer,
    vectorAssembler,
])

In [13]:
# Fit the feature-preparation stages on the full frame and apply them to it.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

# 80/20 train/test split with a fixed seed.
trainDF, testDF = dataset.randomSplit(weights=[0.8, 0.2], seed=24)

In [14]:
# Sanity check: print the columns of the training frame after feature
# preparation (indexed categoricals and the assembled "features" vector).
trainDF.printSchema()

root
 |-- PassengerId: double (nullable = false)
 |-- Survival: double (nullable = false)
 |-- Pclass: double (nullable = false)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = false)
 |-- SibSp: double (nullable = false)
 |-- Parch: double (nullable = false)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = false)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- SexIndex: double (nullable = false)
 |-- CabinIndex: double (nullable = false)
 |-- EmbarkedIndex: double (nullable = false)
 |-- features: vector (nullable = true)



In [15]:
# Estimator that is actually tuned by the cross-validation below. max_depth
# set here is overridden by each entry of the parameter grid.
xg = XGBoostEstimator(
    eta=0.1,
    subsample=0.9,
    min_child_weight=0.2,
    max_depth=8,
    gamma=0.0,
    num_round=20,
    nworkers=1,
    featuresCol="features",
    labelCol="Survival",
)

In [16]:

# Single-stage pipeline wrapping the estimator, used as the unit that the
# cross-validator fits.
pipelineXG = Pipeline(stages=[xg])


In [17]:
# Search grid: only the maximum tree depth is varied.
# Further candidates that were sketched but are not part of the grid:
#   xg.eta   in [0.2, 0.5, 0.7]     (learning rate)
#   xg.alpha in [0.01, 0.05, 0.1]   (L1 regularisation term)
paramGrid = (
    ParamGridBuilder()
    .addGrid(xg.max_depth, [1, 3, 5, 7, 10])
    .build()
)

In [18]:
# 5-fold cross-validation of the XGBoost pipeline over the parameter grid.
# The seed is fixed so that fold assignment is reproducible across runs; the
# value matches the one used for the train/test split in this notebook.
cv = CrossValidator(
    estimator=pipelineXG,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=24,
)

In [19]:
# Run the cross-validated grid search on the training split.
# NOTE(review): the saved output of this cell shows the call failing with
# XGBoostError "XGBoostModel training failed". The captured trace only covers
# the driver-side call path and does not show the underlying cause, so this
# cell needs to be re-run and diagnosed before the notebook is shared.
cvmodel = cv.fit(trainDF)

Py4JJavaError: An error occurred while calling o356.fit.
: ml.dmlc.xgboost4j.java.XGBoostError: XGBoostModel training failed
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$.ml$dmlc$xgboost4j$scala$spark$XGBoost$$postTrackerReturnProcessing(XGBoost.scala:406)
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$4.apply(XGBoost.scala:356)
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$$anonfun$trainDistributed$4.apply(XGBoost.scala:337)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.immutable.List.map(List.scala:285)
	at ml.dmlc.xgboost4j.scala.spark.XGBoost$.trainDistributed(XGBoost.scala:336)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator.train(XGBoostEstimator.scala:139)
	at ml.dmlc.xgboost4j.scala.spark.XGBoostEstimator.train(XGBoostEstimator.scala:36)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
