# Model Validation

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [2]:
# Create the local SparkContext, or reuse the one already active in this kernel.
# Calling SparkContext(...) directly a second time raises
# "Cannot run multiple SparkContexts at once", which made this cell fail on re-run.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("sqlContext"))
# SQLContext is the DataFrame entry point used by the cells below.
sqc = SQLContext(sc)

In [4]:
# Labeled training documents, one (id, text, label) tuple per row.
# Every text containing the token "spark" is labeled 1.0; the rest are 0.0.
training = sqc.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
        (4, "b spark who", 1.0),
        (5, "g d a y", 0.0),
        (6, "spark fly", 1.0),
        (7, "was mapreduce", 0.0),
        (8, "e spark program", 1.0),
        (9, "a e c l", 0.0),
        (10, "spark compile", 1.0),
        (11, "hadoop software", 0.0),
    ],
    ["id", "text", "label"],
)
# Render the DataFrame as a pandas table for display.
training.toPandas()

Unnamed: 0,id,text,label
0,0,a b c d e spark,1.0
1,1,b d,0.0
2,2,spark f g h,1.0
3,3,hadoop mapreduce,0.0
4,4,b spark who,1.0
5,5,g d a y,0.0
6,6,spark fly,1.0
7,7,was mapreduce,0.0
8,8,e spark program,1.0
9,9,a e c l,0.0


In [6]:
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This allows us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid has 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=2,  # use 3+ folds in practice
    seed=12345,  # fixed seed: fold assignment (and so the selected model) is repeatable across runs
)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = sqc.createDataFrame(
    [(4, "spark i j k"), (5, "l m n"), (6, "mapreduce spark"), (7, "apache hadoop")],
    ["id", "text"],
)

# Make predictions on test documents. cvModel.transform applies the best model
# found during cross-validation.
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.2661, 0.7339]), prediction=1.0)
Row(id=5, text='l m n', probability=DenseVector([0.9209, 0.0791]), prediction=0.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.4429, 0.5571]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.8584, 0.1416]), prediction=0.0)


In [None]:
# Shut down the SparkContext created earlier in the notebook; the
# "Train validation split" section below sets up its own context.
sc.stop()

## Train validation split

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [None]:
# Obtain a local SparkContext for this section. getOrCreate returns the active
# context if one is still running and creates a new one otherwise, so this cell
# works whether or not a previous context was stopped. A direct SparkContext(...)
# call raises "Cannot run multiple SparkContexts at once" when one is still active.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("sqlContext"))
# SQLContext is the DataFrame entry point used by the cells below.
sqc = SQLContext(sc)

In [10]:
import os

# Directory holding Spark's bundled MLlib sample data (always ends with "/").
# Resolved from the SPARK_HOME environment variable so the notebook is not tied
# to one machine; falls back to the original install location when it is unset.
# Backslashes are normalised to "/" so the later `path + "<file>"` concatenation
# yields the same style of path as before.
path = (
    os.environ.get("SPARK_HOME", "D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7")
    .replace("\\", "/")
    .rstrip("/")
    + "/data/mllib/"
)

In [11]:
# Prepare training and test data.
data = sqc.read.format("libsvm").load(path + "sample_linear_regression_data.txt")
train, test = data.randomSplit([0.9, 0.1], seed=12345)
test.toPandas()

Unnamed: 0,label,features
0,-17.026492,"(0.8367805314799452, 0.1559190443625338, 0.048..."
1,-16.719097,"(-0.24375714099465773, -0.11915875769929496, -..."
2,-15.375858,"(-0.9794952880997945, -0.9547237660069134, 0.2..."
3,-13.772442,"(-0.3697050572653644, -0.11452811582755928, -0..."
4,-13.039928,"(-0.558607026518148, -0.7356765018678253, -0.7..."
5,-9.428988,"(0.8925906426831107, -0.6771269725125597, -0.1..."
6,-9.267965,"(-0.5057250557539077, -0.41655319851679495, 0...."
7,-9.173694,"(0.4430245286298278, 0.9923116639471541, -0.56..."
8,-7.150099,"(-0.1945259148773102, -0.4089845159829022, -0...."
9,-6.930604,"(0.09198647857985232, -0.3685113649452161, -0...."


In [14]:
lr = LinearRegression(maxIter=10)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values (2 x 2 x 3 = 12 here)
# and determine the best model using the evaluator.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# 80% of the data will be used for training, 20% for validation.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    trainRatio=0.8,
    seed=12345,  # fixed seed: the train/validation split (and so the selected model) is repeatable across runs
)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

# Make predictions on test data. model is the model with the combination of
# parameters that performed best.
model.transform(test).select("features", "label", "prediction").show()

+--------------------+--------------------+--------------------+
|            features|               label|          prediction|
+--------------------+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -17.026492264209548| -1.7800622423486911|
|(10,[0,1,2,3,4,5,...|  -16.71909683360509| -0.1893325701092588|
|(10,[0,1,2,3,4,5,...| -15.375857723312297|  0.7252323736487188|
|(10,[0,1,2,3,4,5,...| -13.772441561702871|  3.2696413241677718|
|(10,[0,1,2,3,4,5,...| -13.039928064104615| 0.18817684046065764|
|(10,[0,1,2,3,4,5,...|   -9.42898793151394|  -3.449987079269568|
|(10,[0,1,2,3,4,5,...|    -9.2679651250406| -0.3310907549069632|
|(10,[0,1,2,3,4,5,...|  -9.173693798406978|-0.42727135281551937|
|(10,[0,1,2,3,4,5,...| -7.1500991588127265|   2.936884251408867|
|(10,[0,1,2,3,4,5,...|  -6.930603551528371|-0.02839768193150...|
|(10,[0,1,2,3,4,5,...|  -6.456944198081549| -0.9224776887934015|
|(10,[0,1,2,3,4,5,...| -3.2843694575334834| -1.0821208483033875|
|(10,[0,1,2,3,4,5,...|   

## Credits & Links

http://spark.apache.org/docs/latest/ml-tuning.html