### GRADIENT BOOSTING - ASHLIE



In [19]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

### Vectorize data 

In [20]:
# Every column of `sampled` except the target column 'label' is used as a feature.
selected_cols = [name for name in sampled.columns if name != 'label']

# Pack the selected columns into a single vector column called "features".
assembler = VectorAssembler(
    inputCols=selected_cols,
    outputCol="features",
)

# No scaling step is applied here: tree methods do not require scaling.
# Source:https://towardsdatascience.com/do-decision-trees-need-feature-scaling-97809eaa60c6#:~:text=Takeaway,the%20variance%20in%20the%20data.

# `output` is `sampled` with the assembled "features" column appended.
output = assembler.transform(sampled)

In [21]:
# Split the assembled data with the shared splitData helper (Shilpa's function).
# split_ratio and seed are not defined in this section of the notebook.
# NOTE(review): presumably returns (train, test) DataFrames - confirm against splitData.
(cached_tr, cached_test) = splitData(output, split_ratio, seed)

### PCA Model

In [22]:
# Reduce the assembled "features" vector to 10 principal components,
# written to a new "pca_features" column.
pca = PCA(
    k=10,
    inputCol="features",
    outputCol="pca_features",
)

# NOTE(review): the PCA is fitted on all of `output`, and the result is only
# split into train/test afterwards, so rows that end up in the test split
# also influence the components. Consider fitting on the training split only.
model = pca.fit(output)

# `pca_sampled` keeps the existing columns and adds "pca_features".
pca_sampled = model.transform(output)


In [23]:
# Split the PCA-transformed data with the same helper, ratio and seed as the
# non-PCA split (splitData is Shilpa's function).
# NOTE(review): presumably returns (train, test) DataFrames - confirm against splitData.
(cached_tr_pca, cached_test_pca) = splitData(pca_sampled, split_ratio, seed)

### Initialize Gradient Boosting Tree Object (Estimators)

In [24]:
# Two gradient-boosted tree regressors that differ only in the feature column
# they read: the raw assembled vector vs. the PCA projection.
gb = GBTRegressor(
    labelCol="label",
    featuresCol="features",
)
gb_pca = GBTRegressor(
    labelCol="label",
    featuresCol="pca_features",
)

### Create Parameter Grid

In [25]:
# Hyper-parameter grid for cross-validation: 3 depths x 3 bin counts x
# 3 iteration counts = 27 candidate settings.
# The grid is keyed by the Param objects of the `gb` estimator.
gbparamGrid = (
    ParamGridBuilder()
    .addGrid(gb.maxDepth, [2, 5, 10])
    .addGrid(gb.maxBins, [10, 20, 40])
    .addGrid(gb.maxIter, [5, 10, 20])
    .build()
)

In [26]:
# Evaluator that scores the "prediction" column against "label" by accuracy.
# NOTE(review): this is a classification metric, but the estimators in this
# notebook are GBTRegressor instances; accuracy on continuous predictions only
# counts exact matches. Confirm whether a classifier (GBTClassifier) or a
# regression metric (RegressionEvaluator) is what is actually intended.
gbevaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

### Cross Validation

In [27]:
numFolds = 5  # number of cross-validation folds
threads = 3   # how many parameter settings are fitted in parallel

# Cross-validated grid search over `gbparamGrid` for the raw-feature regressor.
# `gb` is a GBTRegressor, so candidate models are ranked with a regression
# metric (RMSE). The previous multiclass accuracy evaluator only counted
# predictions that exactly equal the label, which gives no useful signal for
# continuous regression output and made the model selection arbitrary.
gbcv = CrossValidator(
    estimator=gb,
    estimatorParamMaps=gbparamGrid,
    evaluator=RegressionEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="rmse",
    ),
    numFolds=numFolds,
    parallelism=threads,
)

In [28]:
numFolds = 5  # number of cross-validation folds
threads = 3   # how many parameter settings are fitted in parallel

# Param objects carry the uid of the estimator that owns them, so a grid built
# from `gb` (gbparamGrid) does not address `gb_pca`'s parameters. Build the
# same 3 x 3 x 3 grid from `gb_pca` itself so the settings actually take
# effect during the PCA cross-validation.
gbparamGrid_pca = (
    ParamGridBuilder()
    .addGrid(gb_pca.maxDepth, [2, 5, 10])
    .addGrid(gb_pca.maxBins, [10, 20, 40])
    .addGrid(gb_pca.maxIter, [5, 10, 20])
    .build()
)

# Cross-validated grid search for the PCA-feature regressor.
# `gb_pca` is a GBTRegressor, so candidates are ranked by RMSE rather than by
# multiclass accuracy, which only counts exact prediction/label matches.
gbcv_pca = CrossValidator(
    estimator=gb_pca,
    estimatorParamMaps=gbparamGrid_pca,
    evaluator=RegressionEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="rmse",
    ),
    numFolds=numFolds,
    parallelism=threads,
)

### Fit the model

In [None]:
# Run the cross-validated grid search on the training split; the fitted
# CrossValidator model is kept in `gbcvModel` for scoring below.
gbcvModel = gbcv.fit(dataset=cached_tr)

# Show the fitted model object as a quick sanity check.
print(gbcvModel)

### Score predictions

In [None]:
# Score the held-out test split with the cross-validated model.
gbpredictions = gbcvModel.transform(dataset=cached_test)

In [None]:
# The value is labelled RMSE, so compute it with a regression evaluator.
# `gbevaluator` is configured with metricName="accuracy", which made the
# printed number an accuracy score rather than a root-mean-squared error.
print(
    'RMSE:',
    RegressionEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="rmse",
    ).evaluate(gbpredictions),
)

In [None]:
# Report the complement of the gbevaluator score on the test predictions.
# NOTE(review): "1 - score" is an error rate only while gbevaluator reports
# accuracy, and the predictions come from a GBTRegressor - confirm that
# accuracy is the intended metric for this model.
accuracy = gbevaluator.evaluate(dataset=gbpredictions)
print("Test Error = %g" % (1.0 - accuracy,))

### Fit the model using PCA

In [None]:
# Run the cross-validated grid search on the PCA training split; the fitted
# CrossValidator model is kept in `gbcvModel_pca` for scoring below.
gbcvModel_pca = gbcv_pca.fit(dataset=cached_tr_pca)

# Show the fitted model object as a quick sanity check.
print(gbcvModel_pca)

### Score predictions Using PCA 

In [None]:
# Score the held-out PCA test split with the cross-validated PCA model.
gbpredictions_pca = gbcvModel_pca.transform(dataset=cached_test_pca)

In [None]:
# The value is labelled RMSE, so compute it with a regression evaluator.
# `gbevaluator` is configured with metricName="accuracy", which made the
# printed number an accuracy score rather than a root-mean-squared error.
print(
    'RMSE:',
    RegressionEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="rmse",
    ).evaluate(gbpredictions_pca),
)

In [None]:
# Report the complement of the gbevaluator score on the PCA test predictions.
# NOTE(review): "1 - score" is an error rate only while gbevaluator reports
# accuracy, and the predictions come from a GBTRegressor - confirm that
# accuracy is the intended metric for this model.
accuracy = gbevaluator.evaluate(dataset=gbpredictions_pca)
print("Test Error = %g" % (1.0 - accuracy,))