In [None]:
# Imports for the feature transformers and the regression estimator used
# below, so that this cell runs on a fresh kernel.
# NOTE(review): OneHotEncoderEstimator is the Spark 2.3/2.4 class name; it was
# renamed to OneHotEncoder in Spark 3.0 - confirm the target Spark version.
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression

# Map each distinct 'type' string to a numeric index in 'type_idx'.
indexer = StringIndexer(inputCol='type', outputCol='type_idx')

# One-hot encode the indexed column into the 'type_dummy' vector column.
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])

# Concatenate the numeric predictors and the dummy vector into 'features'.
assemble = VectorAssembler(
    inputCols=['mass', 'cyl', 'type_dummy'],
    outputCol='features',
)

# Linear regression predicting 'consumption'; the features column is left at
# the estimator's default.
regression = LinearRegression(labelCol='consumption')

In [None]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer, onehot, assemble, regression])

In [None]:
pipeline = pipeline.fit(cars_train)

In [None]:
predictions = pipeline.transform(cars_test)

In [None]:
# The LinearRegression object (fourth stage -> index 3)
pipeline.stages[3]
print(pipeline.stages[3].intercept)
print(pipeline.stages[3].coefficients)


    Create an indexer to convert the 'org' column into an indexed column called 'org_idx'.
    Create a one-hot encoder to convert the 'org_idx' and 'dow' columns into dummy variable columns called 'org_dummy' and 'dow_dummy'.
    Create an assembler which will combine the 'km' column with the two dummy variable columns. The output column should be called 'features'.
    Create a linear regression object to predict flight duration.


In [None]:
# Stage 1: index the categorical 'org' strings as numeric values in 'org_idx'
indexer = StringIndexer(
    inputCol='org',
    outputCol='org_idx',
)

# Stage 2: one-hot encode both 'org_idx' and 'dow' into dummy-variable columns
onehot = OneHotEncoderEstimator(inputCols=['org_idx', 'dow'], outputCols=['org_dummy', 'dow_dummy'])

# Stage 3: gather 'km' and the two dummy columns into one 'features' column
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy', 'dow_dummy'],
    outputCol='features',
)

# Stage 4: linear regression estimator with 'duration' as the label column
regression = LinearRegression(labelCol='duration')


    Import the class for creating a pipeline.
    Create a pipeline object and specify the indexer, onehot, assembler and regression stages, in this order.
    Train the pipeline on the training data.
    Make predictions on the testing data.


In [None]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline

# Construct a pipeline; stages are applied in list order
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data. The fitted model gets its own name
# so `pipeline` keeps referring to the unfitted estimator.
pipeline_model = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline_model.transform(flights_test)


    Create an object for splitting text into tokens.
    Create an object to remove stop words. Rather than explicitly giving the input column name, use the getOutputCol() method on the previous object.
    Create objects for applying the hashing trick and for transforming the hashed term counts into TF-IDF values. Use the getOutputCol() method again to set each input column.
    Create a pipeline which wraps all of the above steps as well as an object to create a Logistic Regression model.


In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Split the 'text' column into tokens. Tokenizer lower-cases the text and
# splits on whitespace (RegexTokenizer would be needed to split on a pattern).
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words; the input column is taken from the previous stage
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick, then rescale the hashed counts to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object (default column names) and wrap every
# step in a pipeline, in execution order
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])


    Create an empty parameter grid.
    Create objects for building and evaluating a linear regression model. The model should predict the "duration" field.
    Create a cross-validator object. Provide values for the estimator, estimatorParamMaps and evaluator arguments. Choose 5-fold cross validation.
    Train and test the model across multiple folds of the training data.


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Create an empty parameter grid: no hyperparameters are varied, so the
# cross-validator only evaluates the estimator's current settings
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a 5-fold cross validator
cv = CrossValidator(estimator=regression, estimatorParamMaps=params, evaluator=evaluator, numFolds=5)

# Train and test model on multiple folds of the training data. The fitted
# result gets its own name so `cv` keeps referring to the unfitted validator.
cv_model = cv.fit(flights_train)

# NOTE: Since cross-validation builds multiple models, the fit() method can take a little while to complete.


    Create a string indexer. Specify the input and output fields as org and org_idx.
    Create a one-hot encoder. Name the output field org_dummy.
    Assemble the km and org_dummy fields into a single field called features.
    Create a pipeline using the following operations: string indexer, one-hot encoder, assembler and linear regression. Use this to create a cross-validator.



In [None]:
# Index the 'org' strings as numeric values in 'org_idx'
indexer = StringIndexer(
    inputCol='org',
    outputCol='org_idx',
)

# Create a one-hot encoder for the indexed org field
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
)

# Combine 'km' with the one-hot encoded field into a single 'features' column
assembler = VectorAssembler(
    inputCols=['km', 'org_dummy'],
    outputCol='features',
)

# Wrap all stages in a pipeline and cross-validate the pipeline as a whole.
# `regression`, `params` and `evaluator` are reused from the earlier
# cross-validation cell; numFolds is not passed, so the default applies.
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=evaluator,
)