In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# model and data paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [73]:
# List the current working directory (shell command) to see what is already present
!ls

drive  sample_data  spark-3.1.1-bin-hadoop3.2  spark-3.1.1-bin-hadoop3.2.tgz


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Point the process environment at the Java and Spark installations
# set up by the install cell.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.1.1-bin-hadoop3.2",
)

In [None]:
# Bootstrap PySpark. Statement order matters: findspark.init() has to run before
# the pyspark import on the next line (presumably it locates Spark through the
# SPARK_HOME variable set in the previous cell — verify if the install path changes).
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Reuse the active session if one exists, otherwise start a new one
spark = SparkSession.builder.getOrCreate()
# Have DataFrames evaluated and rendered when they are the last expression of a cell
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
# Bare expression: display the session summary as the cell output
spark

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, GBTClassificationModel
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.pipeline import PipelineModel

# Load trained model

In [None]:
# Model's path
#
# Absolute location of the saved GBT pipeline on the mounted Drive. It is written
# as an absolute path (like the mount point and the data file path) so that it
# does not depend on the notebook's current working directory.
model_path = "/content/drive/MyDrive/SavedModels/GBTmodel"

In [None]:
# Restore the previously fitted pipeline from disk.
# read() returns the reader for PipelineModel; load() deserialises from model_path.
loadedModel = PipelineModel.read().load(model_path)

In [None]:
# Read the LIBSVM-formatted data file from Drive into a DataFrame
clean = spark.read.load(
    '/content/drive/MyDrive/Colab_Notebooks/delay_clean2K_SVM.txt',
    format="libsvm",
)

In [None]:
# Number of rows in dataset
# count() is a Spark action, so this line triggers a job over the loaded data
number_rows = clean.count()
# Bare expression: show the count as the cell output
number_rows

200000

In [None]:
# Class balance check: number of rows for each distinct value of "label"
clean.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|164920|
|  1.0| 35080|
+-----+------+



In [None]:
# Label indexing: map "label" to an "indexedLabel" column that carries metadata.
# The indexer is fitted on the complete dataset so every label value is in the index.
label_indexer_spec = StringIndexer(inputCol="label", outputCol="indexedLabel")
labelIndexer = label_indexer_spec.fit(clean)

In [None]:
# Feature indexing: detect categorical features in "features" and index them
# into "indexedFeatures". With maxCategories = 4, any feature that has more than
# 4 distinct values is left as continuous.
feature_indexer_spec = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = feature_indexer_spec.fit(clean)

In [None]:
from pyspark.ml.feature import Normalizer

In [None]:
# Normalise each "features" vector with the p = 1.0 norm into a new "normFeatures" column
normalizer = Normalizer(inputCol = "features", outputCol = "normFeatures", p = 1.0)
# NOTE(review): NormOutput is not referenced by any other cell in view, and the
# predictions are computed from `clean`, not from this normalised frame — confirm
# whether this step is still needed.
NormOutput = normalizer.transform(clean)

In [None]:
# Make predictions with loaded model
# The saved pipeline is applied directly to the raw `clean` frame. The result
# exposes "prediction" and "indexedLabel" columns (both are selected in the next
# cell), so the pipeline presumably contains its own label-indexing stage —
# TODO confirm against the notebook that trained and saved the model.

predictions = loadedModel.transform(clean)

In [None]:
# Preview the first five predictions next to the indexed label and the input features
preview_columns = ["prediction", "indexedLabel", "features"]
predictions.select(*preview_columns).show(n=5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows

