In [None]:
# Index the string-valued 'type' column, then one-hot encode that index.
# NOTE(review): OneHotEncoderEstimator was renamed to OneHotEncoder in Spark 3.0 — confirm the target Spark version.
indexer = StringIndexer(inputCol='type', outputCol='type_idx')
onehot = OneHotEncoderEstimator(
    inputCols=['type_idx'],
    outputCols=['type_dummy'],
)

# Combine 'mass', 'cyl' and the dummy column into a single 'features' column.
assemble = VectorAssembler(inputCols=['mass', 'cyl', 'type_dummy'], outputCol='features')

# Linear regression that uses 'consumption' as the label column.
regression = LinearRegression(labelCol='consumption')

In [None]:
from pyspark.ml import Pipeline

# Stages are listed in the order they run: indexer -> onehot -> assemble -> regression.
pipeline = Pipeline(
    stages=[
        indexer,
        onehot,
        assemble,
        regression,
    ],
)

In [None]:
# Fit the pipeline on the training data. Note that `pipeline` is rebound to the
# object returned by fit(), so later cells see the fitted result, not the original.
pipeline = pipeline.fit(cars_train)

In [None]:
# Run the test data through `pipeline` and keep the result as `predictions`.
predictions = pipeline.transform(cars_test)

In [None]:
# The LinearRegression object is the fourth pipeline stage (index 3).
# It is printed explicitly: a notebook only echoes the last expression of a
# cell, so a bare `pipeline.stages[3]` in the middle of the cell shows nothing.
print(pipeline.stages[3])
print(pipeline.stages[3].intercept)
print(pipeline.stages[3].coefficients)


    Create an indexer to convert the 'org' column into an indexed column called 'org_idx'.
    Create a one-hot encoder to convert the 'org_idx' and 'dow' columns into dummy variable columns called 'org_dummy' and 'dow_dummy'.
    Create an assembler which will combine the 'km' column with the two dummy variable columns. The output column should be called 'features'.
    Create a linear regression object to predict flight duration.


In [None]:
# Turn the string-valued 'org' column into index values stored in 'org_idx'
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode both the indexed 'org' column and the 'dow' column
onehot = OneHotEncoderEstimator(inputCols=['org_idx', 'dow'], outputCols=['org_dummy', 'dow_dummy'])

# Gather 'km' and the two dummy columns into a single 'features' column
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy', 'dow_dummy'],
    outputCol='features',
)

# Linear regression that uses 'duration' as the label column
regression = LinearRegression(labelCol='duration')


    Import the class for creating a pipeline.
    Create a pipeline object and specify the indexer, onehot, assembler and regression stages, in this order.
    Train the pipeline on the training data.
    Make predictions on the testing data.


In [None]:
# Pipeline class
from pyspark.ml import Pipeline

# Stages are listed in the order they run: indexer -> onehot -> assembler -> regression
pipeline = Pipeline(
    stages=[
        indexer,
        onehot,
        assembler,
        regression,
    ],
)

# Fit on the training data (rebinds `pipeline` to the object returned by fit())
pipeline = pipeline.fit(flights_train)

# Generate predictions for the testing data
predictions = pipeline.transform(flights_test)


    Create an object for splitting text into tokens.
    Create an object to remove stop words. Rather than explicitly giving the input column name, use the getOutputCol() method on the previous object.
    Create objects for applying the hashing trick and transforming the data into TF-IDF. Use the getOutputCol() method again.
    Create a pipeline which wraps all of the above steps as well as an object to create a Logistic Regression model.


In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Each stage takes its input column from the previous stage's getOutputCol(),
# so every intermediate column name is spelled out only once.

# 'text' -> 'words'
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# 'words' -> 'terms' (stop words removed)
remover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='terms',
)

# 'terms' -> 'hash' (hashing trick) -> 'features' (TF-IDF)
hasher = HashingTF(
    inputCol=remover.getOutputCol(),
    outputCol="hash",
)
idf = IDF(
    inputCol=hasher.getOutputCol(),
    outputCol="features",
)

# Logistic regression is the final stage of the pipeline
logistic = LogisticRegression()
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        hasher,
        idf,
        logistic,
    ],
)


    Create an empty parameter grid.
    Create objects for building and evaluating a linear regression model. The model should predict the "duration" field.
    Create a cross-validator object. Provide values for the estimator, estimatorParamMaps and evaluator arguments. Choose 5-fold cross validation.
    Train and test the model across multiple folds of the training data.


In [None]:
# Estimator and evaluator both use 'duration' as the label column
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Empty parameter grid: build() is called without adding any grid points
params = ParamGridBuilder().build()

# 5-fold cross-validator around the regression estimator
cv = CrossValidator(
    estimator=regression,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)

# Fit across the folds of the training data (rebinds `cv` to the object returned by fit())
cv = cv.fit(flights_train)

# NOTE: Since cross-validation builds multiple models, the fit() method can take a little while to complete.


    Create a string indexer. Specify the input and output fields as org and org_idx.
    Create a one-hot encoder. Name the output field org_dummy.
    Assemble the km and org_dummy fields into a single field called features.
    Create a pipeline using the following operations: string indexer, one-hot encoder, assembler and linear regression. Use this to create a cross-validator.



In [None]:
# Index the 'org' field into 'org_idx'
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode the indexed 'org' field into 'org_dummy'
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
)

# Combine 'km' and the one-hot encoded field into 'features'
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy'],
    outputCol='features',
)

# Wrap the four stages in a pipeline and use that pipeline as the cross-validator's estimator.
# `regression`, `params` and `evaluator` are reused from earlier cells.
pipeline = Pipeline(
    stages=[
        indexer,
        onehot,
        assembler,
        regression,
    ],
)
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
)

In [None]:
from pyspark.ml.tuning import ParamGridBuilder

# Builder -> grid points for fitIntercept (True / False) -> built grid,
# written as one chained expression
params = (
    ParamGridBuilder()
    .addGrid(regression.fitIntercept, [True, False])
    .build()
)

# How many models?
print('Number of models to be tested: ', len(params))

In [None]:
# 10-fold cross-validation with seed 13, configured through constructor
# keywords rather than chained setters.
cv = CrossValidator(
    estimator=regression,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=10,
    seed=13,
)
# Fit on the training data (rebinds `cv` to the object returned by fit())
cv = cv.fit(cars_train)

In [None]:
# Show the avgMetrics attribute of `cv` (assumes `cv` has already been fitted in an earlier cell).
cv.avgMetrics

In [None]:
# Access the best model (echoed as the cell's output; assumes `cv` has already been fitted).
cv.bestModel

In [None]:
# Run the test data through `cv` directly and keep the result as `predictions`.
# NOTE(review): presumably this delegates to the best model — confirm against the CrossValidatorModel docs.
predictions = cv.transform(cars_test)

In [None]:
# Ask the best model to describe its 'fitIntercept' parameter (echoed as the cell's output).
cv.bestModel.explainParam('fitIntercept')

In [None]:
# Grid over fitIntercept, regParam and elasticNetParam.
# regParam and elasticNetParam values are written as float literals (1.0, 10.0,
# 0.0) rather than ints, matching the other grids in this notebook.
# NOTE(review): plain Python ints for these floating-point params have been
# reported to fail with an Integer-to-Double cast error on some Spark versions
# — confirm for the target version.
params = (
    ParamGridBuilder()
    .addGrid(regression.fitIntercept, [True, False])
    .addGrid(regression.regParam, [0.001, 0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
    .build()
)

In [None]:
# Report the number of entries in the built parameter grid.
print('Number of models to be tested: ', len(params))


    Create a parameter grid builder.
    Add grids for regression.regParam (values 0.01, 0.1, 1.0, and 10.0) and regression.elasticNetParam (values 0.0, 0.5, and 1.0).
    Build the grid.
    Create a cross validator, specifying five folds.


In [None]:
# Parameter grid over regParam (4 values) and elasticNetParam (3 values),
# built as one chained expression
params = (
    ParamGridBuilder()
    .addGrid(regression.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(regression.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)
print('Number of models to be tested: ', len(params))

# 5-fold cross-validator around the pipeline
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)


    Retrieve the best model.
    Look at the stages in the best model.
    Isolate the linear regression stage and extract its parameters.
    Use the best model to generate predictions on the testing data and calculate the RMSE.


In [None]:
# Get the best model from cross validation.
# NOTE(review): this assumes `cv` has already been fitted (cv = cv.fit(...));
# the preceding cell only constructs the cross-validator — confirm.
best_model = cv.bestModel

# Look at the stages in the best model
print(best_model.stages)

# Get the parameters for the LinearRegression object in the best model
# (fourth stage -> index 3). Printed explicitly: a notebook echoes only the
# last expression of a cell, so a bare expression here would be discarded.
print(best_model.stages[3].extractParamMap())

# Generate predictions on testing data using the best model, then calculate
# the evaluator's metric (RMSE) and print it so it is visible however the
# code is run.
predictions = best_model.transform(flights_test)
print(evaluator.evaluate(predictions))


    Create a parameter grid builder object.
    Add grid points for numFeatures and binary parameters to the HashingTF object, giving values 1024, 4096 and 16384, and True and False, respectively.
    Add grid points for regParam and elasticNetParam parameters to the LogisticRegression object, giving values of 0.01, 0.1, 1.0 and 10.0, and 0.0, 0.5, and 1.0 respectively.
    Build the parameter grid.


In [None]:
# Parameter grid built as one chained expression:
#   - hashing trick: numFeatures and binary
#   - logistic regression: regParam and elasticNetParam
params = (
    ParamGridBuilder()
    .addGrid(hasher.numFeatures, [1024, 4096, 16384])
    .addGrid(hasher.binary, [True, False])
    .addGrid(logistic.regParam, [0.01, 0.1, 1.0, 10.0])
    .addGrid(logistic.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
# Random forest classifier configured with numTrees=5.
forest = RandomForestClassifier(numTrees=5)

In [None]:
# Fit on the training data; `forest` is rebound to the object returned by fit().
forest = forest.fit(cars_train)

In [None]:
# Show the `trees` attribute of the fitted forest (echoed as the cell's output).
forest.trees

In [None]:
# Show the `featureImportances` attribute of the fitted forest (echoed as the cell's output).
forest.featureImportances

In [None]:
from pyspark.ml.classification import GBTClassifier
# Gradient-boosted tree classifier configured with maxIter=10.
gbt = GBTClassifier(maxIter=10)

In [None]:
# Fit on the training data; `gbt` is rebound to the object returned by fit().
gbt = gbt.fit(cars_train)


    Import the classes required to create Decision Tree and Gradient-Boosted Tree classifiers.
    Create Decision Tree and Gradient-Boosted Tree classifiers. Train on the training data.
    Create an evaluator and calculate AUC on testing data for both classifiers. Which model performs better?
    Find the number of trees and the relative importance of features in the Gradient-Boosted Tree classifier.


In [None]:
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data. Both values are printed: a notebook echoes only
# the last expression of a cell, so bare evaluate() calls here would compute
# the metric and then silently discard it.
evaluator = BinaryClassificationEvaluator()
print(evaluator.evaluate(tree.transform(flights_test)))
print(evaluator.evaluate(gbt.transform(flights_test)))

# Find the number of trees (length of the `trees` list, not the list itself)
# and the relative importance of features
print(len(gbt.trees))
print(gbt.featureImportances)


    Create a random forest classifier object.
    Create a parameter grid builder object. Add grid points for the featureSubsetStrategy and maxDepth parameters.
    Create a binary classification evaluator.
    Create a cross-validator object, specifying the estimator, parameter grid and evaluator. Choose 5-fold cross validation.


In [None]:
# Random forest classifier with default settings
forest = RandomForestClassifier()

# Binary classification evaluator
evaluator = BinaryClassificationEvaluator()

# Parameter grid over featureSubsetStrategy (4 values) and maxDepth (3 values)
params = (
    ParamGridBuilder()
    .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2'])
    .addGrid(forest.maxDepth, [2, 5, 10])
    .build()
)

# 5-fold cross-validator around the forest
cv = CrossValidator(
    estimator=forest,
    estimatorParamMaps=params,
    evaluator=evaluator,
    numFolds=5,
)


    Retrieve a list of average AUC metrics across all models in the parameter grid.
    What is the average AUC for the best model? This will be the largest AUC in the list.
    Find the value of the maxDepth and featureSubsetStrategy parameters for the best model.
    Calculate the AUC for the best model predictions on the testing data.


In [None]:
# NOTE(review): every line below assumes `cv` has already been fitted; the
# preceding cell only constructs the cross-validator — confirm.

# Metric values recorded by cross-validation for the parameter grid
avg_auc = cv.avgMetrics

# The best model's average AUC is the largest value in that list
best_model_auc = max(avg_auc)

# Descriptions of the two tuned parameters as set on the best model
opt_max_depth = cv.bestModel.explainParam('maxDepth')
opt_feat_substrat = cv.bestModel.explainParam('featureSubsetStrategy')

# AUC on the testing data
best_auc = evaluator.evaluate(
    cv.transform(flights_test)
)