In [None]:
# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os
# Tell downstream tools where Java 8 and the unpacked Spark distribution live
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [None]:
# Order matters in this cell: findspark.init() is called before pyspark is imported.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise create it
spark = SparkSession.builder.getOrCreate()
# Turn on eager evaluation so a DataFrame left as a cell's last expression is rendered
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
# Show the session object as this cell's output
spark

In [None]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix

## WARNING: the file "delay_clean.csv" is > 4 GB and has been added to .gitignore
## Using a reduced dataset.

# Gradient-boosted tree classifier (GBT)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Read the LIBSVM-formatted data file into a DataFrame
clean = spark.read.load(
    '/content/drive/MyDrive/Colab_Notebooks/delay_clean_SVM.txt',
    format="libsvm",
)

In [None]:
# Print the column names and types of the loaded DataFrame
clean.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [None]:
# Count the rows in the dataset and display the total
# (note: despite its name, `row` holds the row COUNT, not a single row)
row = clean.count()
row

6489057

In [None]:
# Build the label index (adds metadata to the label column).
# It is fitted on the full dataset so that every label value gets an index.
labelIndexer = (
    StringIndexer(inputCol="label", outputCol="indexedLabel")
    .fit(clean)
)

In [None]:
# Detect categorical features automatically and index them.
# With maxCategories=4, a feature with more than 4 distinct values is treated as continuous.
featureIndexer = (
    VectorIndexer(maxCategories=4, inputCol="features", outputCol="indexedFeatures")
    .fit(clean)
)

In [None]:
# Split the data into training and test sets (30% held out for testing).
# The seed is fixed so the same rows land in each split on every run; without
# it the split (and every number reported from it) changes between runs.
(trainingData, testData) = clean.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure the GBT estimator (this only sets parameters; training happens
# when the pipeline containing it is fitted).
# The seed is pinned so any randomized step in training is repeatable across runs.
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10, seed=42)

In [None]:
# Assemble the two indexers and the GBT estimator into a single Pipeline
pipeline = Pipeline().setStages([labelIndexer, featureIndexer, gbt])

In [None]:
# Fit the pipeline on the training split. The two indexer stages were already
# fitted on the full dataset in earlier cells, so this step applies them and
# trains the GBT stage.
model = pipeline.fit(trainingData)

In [None]:
# Make predictions by applying the fitted pipeline to the held-out test split
predictions = model.transform(testData)

In [None]:
# Preview a few predictions next to their indexed labels and feature vectors
predictions.select(["prediction", "indexedLabel", "features"]).show(n=5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [None]:
# Compare predictions with the true (indexed) labels and report accuracy / test error
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy, "Test Error = %g" % (1.0 - accuracy), sep="\n")

Accuracy = 0.813302
Test Error = 0.186698


In [None]:
# Pull the fitted GBT model out of the pipeline: stage 2 is the classifier,
# after the label indexer (0) and the feature indexer (1).
gbtModel = model.stages[2]
print(gbtModel)  # summary only

GBTClassificationModel: uid = GBTClassifier_863fc49eb4d2, numTrees=10, numClasses=2, numFeatures=153


In [None]:
# Bring the true labels and the predictions to the driver in ONE Spark job.
# Two separate collect() calls would run the whole prediction pipeline twice,
# and Spark does not promise the same row order across separate actions, which
# could silently misalign y_true and y_pred.
# y_true / y_pred are plain lists of floats, ready for the sklearn metrics below.
label_pred_rows = predictions.select("indexedLabel", "prediction").collect()
y_true = [r["indexedLabel"] for r in label_pred_rows]
y_pred = [r["prediction"] for r in label_pred_rows]
del label_pred_rows  # drop the intermediate list of Row objects

In [None]:
# Confusion matrix for the test split: rows are the true classes,
# columns are the predicted classes.
print(confusion_matrix(y_true, y_pred)) # only TEST data is used (30% of the total rows)

[[1572966    5436]
 [ 357981   10170]]


In [None]:
# Per-class precision, recall, F1 and support for the test-split predictions
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90   1578402
         1.0       0.65      0.03      0.05    368151

    accuracy                           0.81   1946553
   macro avg       0.73      0.51      0.47   1946553
weighted avg       0.78      0.81      0.74   1946553



In [None]:
# NOTE(review): `gbt` is the unfitted estimator; to persist the trained model,
# save the fitted `model` (or `gbtModel`) instead -- TODO confirm intent.
# gbt.save("gbt_model.model")

# Random forest classifier (RFC)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString

In [None]:
# Configure the random forest estimator (this only sets parameters; training
# happens when the pipeline containing it is fitted).
# The seed is pinned so the randomized parts of forest training are repeatable across runs.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10, seed=42)

In [None]:
# Map predicted label indices back to the original label values
labelConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)

In [None]:
# Assemble the indexers, the forest and the label converter into a single Pipeline
pipeline = Pipeline().setStages([labelIndexer, featureIndexer, rf, labelConverter])

In [None]:
# Fit the pipeline on the training split. The two indexer stages were already
# fitted on the full dataset in earlier cells, so this step applies them and
# trains the random forest stage.
model = pipeline.fit(trainingData)

In [None]:
# Make predictions by applying the fitted pipeline to the held-out test split
predictions = model.transform(testData)

In [None]:
# Preview a few rows: converted predicted label, original label and feature vector
predictions.select(["predictedLabel", "label", "features"]).show(n=5)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(153,[0,1,2,3,4,5...|
|           0.0|  0.0|(153,[0,1,2,3,4,5...|
|           0.0|  0.0|(153,[0,1,2,3,4,5...|
|           0.0|  0.0|(153,[0,1,2,3,4,5...|
|           0.0|  0.0|(153,[0,1,2,3,4,5...|
+--------------+-----+--------------------+
only showing top 5 rows



In [None]:
# Compare predictions with the true (indexed) labels and report accuracy / test error
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy, "Test Error = %g" % (1.0 - accuracy), sep="\n")

Accuracy = 0.81087
Test Error = 0.18913


In [None]:
# Pull the fitted random forest out of the pipeline: stage 2 is the classifier,
# between the two indexers (0, 1) and the label converter (3).
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_7f80a5f713eb, numTrees=10, numClasses=2, numFeatures=153


In [None]:
# Print the feature importances of the fitted forest. The recorded output is a
# sparse vector shown as (size, [feature indices], [importance values]).
print(rfModel.featureImportances)

(153,[0,2,3,4,6,7,9,10,12,14,15,16,17,18,19,20,21,23,32,33,34,35,36,37,38,43,45,49,66,69,80,93,96,97,131,152],[0.013722297407261228,0.0004294372868175999,0.31674366209907634,0.0020650239400778108,0.010074656180582637,0.010394228964668692,0.00823838178631936,0.020487062597604074,3.7128311344219084e-05,0.06057027052614129,0.016836299154217048,0.01127002458560455,0.00013816383629222182,0.0009893559127998664,0.00035038177732433324,0.007856025751893628,0.2490370076280212,0.03478733165413174,0.0003801675641854459,0.0054328131394404825,0.03402580324994177,0.002598296873200929,0.003012469517916887,0.03609605586970796,0.05801269724502882,2.744222490399856e-05,0.057619042635495055,0.0335484673965302,0.003921220787310415,0.000167147088978276,3.23785262052603e-05,7.025860247329149e-06,0.0002592940838182998,2.0019058709483817e-05,7.152398964147105e-05,0.0007413954885601435])


In [None]:
# Bring the true labels and the predictions to the driver in ONE Spark job.
# Two separate collect() calls would run the whole prediction pipeline twice,
# and Spark does not promise the same row order across separate actions, which
# could silently misalign y_true and y_pred.
# y_true / y_pred are plain lists of floats, ready for the sklearn metrics below.
label_pred_rows = predictions.select("indexedLabel", "prediction").collect()
y_true = [r["indexedLabel"] for r in label_pred_rows]
y_pred = [r["prediction"] for r in label_pred_rows]
del label_pred_rows  # drop the intermediate list of Row objects

In [None]:
# Confusion matrix for the test split: rows are the true classes,
# columns are the predicted classes.
# NOTE(review): in the recorded run the second column is all zeros, i.e. the
# forest never predicts class 1.
print(confusion_matrix(y_true, y_pred))

[[1578402       0]
 [ 368151       0]]


In [None]:
# Per-class precision, recall, F1 and support for the test-split predictions.
# zero_division=0 makes the handling of a never-predicted class explicit: its
# precision/F1 are reported as 0.0 without an UndefinedMetricWarning per metric.
print(classification_report(y_true, y_pred, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90   1578402
         1.0       0.00      0.00      0.00    368151

    accuracy                           0.81   1946553
   macro avg       0.41      0.50      0.45   1946553
weighted avg       0.66      0.81      0.73   1946553



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# NOTE(review): `rf` is the unfitted estimator; to persist the trained model,
# save the fitted `model` (or `rfModel`) instead -- TODO confirm intent.
# rf.save("rf_model.model")

# Factorization machines classifier

In [None]:
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler

In [None]:
# Build the label index (adds metadata to the label column).
# It is fitted on the full dataset so that every label value gets an index.
labelIndexer = (
    StringIndexer(inputCol="label", outputCol="indexedLabel")
    .fit(clean)
)

# Fit the min-max feature scaler, also on the full dataset
featureScaler = (
    MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    .fit(clean)
)

In [None]:
# Split the data into training and test sets (30% held out for testing).
# The seed is fixed so the same rows land in each split on every run; without
# it the split (and every number reported from it) changes between runs.
(trainingData, testData) = clean.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Configure the factorization machines estimator (this only sets parameters;
# training happens when the pipeline containing it is fitted).
# The seed is pinned so the randomized parts of training are repeatable across runs.
fm = FMClassifier(labelCol="indexedLabel", featuresCol="scaledFeatures", stepSize=0.001, seed=42)

In [None]:
# Assemble the label indexer, the feature scaler and the FM estimator into a single Pipeline
pipeline = Pipeline().setStages([labelIndexer, featureScaler, fm])

In [None]:
# Fit the pipeline on the training split. The label indexer and feature scaler
# were already fitted on the full dataset in an earlier cell, so this step
# applies them and trains the FM stage.
model = pipeline.fit(trainingData)

In [None]:
# Make predictions by applying the fitted pipeline to the held-out test split
predictions = model.transform(testData)

In [None]:
# Preview a few predictions next to their indexed labels and feature vectors
predictions.select(["prediction", "indexedLabel", "features"]).show(n=5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
|       0.0|         0.0|(153,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [None]:
# Compare predictions with the true (indexed) labels and report test accuracy / error
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = %g" % accuracy, "Test Error = %g" % (1.0 - accuracy), sep="\n")

Test set accuracy = 0.810884
Test Error = 0.189116


In [None]:
# Bring the true labels and the predictions to the driver in ONE Spark job.
# Two separate collect() calls would run the whole prediction pipeline twice,
# and Spark does not promise the same row order across separate actions, which
# could silently misalign y_true and y_pred.
# y_true / y_pred are plain lists of floats, ready for the sklearn metrics below.
label_pred_rows = predictions.select("indexedLabel", "prediction").collect()
y_true = [r["indexedLabel"] for r in label_pred_rows]
y_pred = [r["prediction"] for r in label_pred_rows]
del label_pred_rows  # drop the intermediate list of Row objects

In [None]:
# Confusion matrix for the test split: rows are the true classes,
# columns are the predicted classes.
# NOTE(review): in the recorded run the second column is all zeros, i.e. the
# FM model never predicts class 1.
print(confusion_matrix(y_true, y_pred))

[[1576995       0]
 [ 367790       0]]


In [None]:
# Per-class precision, recall, F1 and support for the test-split predictions.
# zero_division=0 makes the handling of a never-predicted class explicit: its
# precision/F1 are reported as 0.0 without an UndefinedMetricWarning per metric.
print(classification_report(y_true, y_pred, zero_division=0))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90   1576995
         1.0       0.00      0.00      0.00    367790

    accuracy                           0.81   1944785
   macro avg       0.41      0.50      0.45   1944785
weighted avg       0.66      0.81      0.73   1944785



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# NOTE(review): `fm` is the unfitted estimator; to persist the trained model,
# save the fitted `model` instead -- TODO confirm intent.
# fm.save("fm_model.model")