In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [5]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix

## WARNING, FILE "delay_clean.csv" is > 4 GB  --  added to gitignore
## Using reduced dataset.....

# Gradient-boosted tree classifier (GBT)

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [7]:
# Load and parse the data file, converting it to a DataFrame
clean = spark.read.format("libsvm").load('/content/drive/MyDrive/Colab_Notebooks/delay_clean2K_SVM.txt')

In [8]:
clean.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [9]:
# Number of rows in dataset
row = clean.count()
row

200000

In [10]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

In [11]:
# Automatically identify categorical features, and index them
# Set maxCategories so features with > 4 distinct values are treated as continuous
featureIndexer = VectorIndexer(inputCol = "features", outputCol = "indexedFeatures", maxCategories = 4).fit(clean)

In [12]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = clean.randomSplit([0.7, 0.3])

In [13]:
# Train a GBT model.
gbt = GBTClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures", maxIter = 10)

In [14]:
# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, gbt])

In [15]:
# Train model.  This also runs the indexers
model = pipeline.fit(trainingData)

In [16]:
# Make predictions
predictions = model.transform(testData)

In [17]:
# Select example rows to display
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [18]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.82943
Test Error = 0.17057


In [19]:
gbtModel = model.stages[2]
print(gbtModel)  # summary only

GBTClassificationModel: uid = GBTClassifier_5ec15031dbc2, numTrees=10, numClasses=2, numFeatures=137


In [20]:
y_true = predictions.select(['indexedLabel']).collect()
y_pred = predictions.select(['prediction']).collect()

In [21]:
print(confusion_matrix(y_true, y_pred)) # only TEST data is used (30% of 200,000 total rows)

[[49318   308]
 [ 9969   656]]


In [22]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.99      0.91     49626
         1.0       0.68      0.06      0.11     10625

    accuracy                           0.83     60251
   macro avg       0.76      0.53      0.51     60251
weighted avg       0.81      0.83      0.77     60251



# Random forest classifier (RFC)

In [23]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString

In [24]:
# Train a RandomForest model
rf = RandomForestClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures", numTrees = 10)

In [25]:
# Convert indexed labels back to original labels
labelConverter = IndexToString(inputCol = "prediction", outputCol = "predictedLabel",
                               labels = labelIndexer.labels)

In [26]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, rf, labelConverter])

In [27]:
# Train model.  This also runs the indexers
model = pipeline.fit(trainingData)

In [28]:
# Make predictions
predictions1 = model.transform(testData)

In [29]:
# Select example rows to display
predictions1.select("predictedLabel", "label", "features").show(5)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           0.0|  0.0|(137,[0,1,2,3,4,5...|
|           0.0|  0.0|(137,[0,1,2,3,4,5...|
|           0.0|  0.0|(137,[0,1,2,3,4,5...|
|           0.0|  0.0|(137,[0,1,2,3,4,5...|
|           0.0|  0.0|(137,[0,1,2,3,4,5...|
+--------------+-----+--------------------+
only showing top 5 rows



In [30]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions1)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.823654
Test Error = 0.176346


In [31]:
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_1051dcbb9c41, numTrees=10, numClasses=2, numFeatures=137


In [32]:
print(rfModel.featureImportances)

(137,[1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19,21,22,23,25,28,33,34,35,36,37,45,57,58,60,65,68,76,89,91,129,136],[0.002660787090102533,0.00035223639064922057,0.011034488412578926,0.005980531575021249,0.0268728501930391,0.01255701207683193,0.010112045273101467,0.03138039306205343,0.019519441265388255,0.0046578071960948425,0.009664596789815688,0.0007050394197125379,0.021592993005491706,0.13449445728240195,0.046969780470165284,0.11462445517291064,0.22751679179819742,0.00884471409553305,0.04890562480873259,0.05585033291354542,0.030341525875597124,0.0002879433319649791,0.00019617524172224518,0.011982875241493756,0.003763092501249444,0.003910330987504793,0.0020737133118537234,0.002172821710871158,0.031175939497662154,0.0011252762930813976,0.0005677588646960306,0.00036957110683012457,0.01731754032267286,0.001780723107836594,0.002362403298749672,0.06055340448848664,0.0013214765253683224,0.0012031417362913003,0.03319790826470023])


In [33]:
y_true = predictions1.select(['indexedLabel']).collect()
y_pred = predictions1.select(['prediction']).collect()

In [34]:
print(confusion_matrix(y_true, y_pred)) # only TEST data is used (30% of 200,000 total rows)

[[49626     0]
 [10625     0]]


In [35]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90     49626
         1.0       0.00      0.00      0.00     10625

    accuracy                           0.82     60251
   macro avg       0.41      0.50      0.45     60251
weighted avg       0.68      0.82      0.74     60251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Factorization machines classifier

In [36]:
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler

In [37]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

# Scale features
featureScaler = MinMaxScaler(inputCol = "features", outputCol = "scaledFeatures").fit(clean) 

In [38]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = clean.randomSplit([0.7, 0.3])

In [39]:
# Train a FM model
fm = FMClassifier(labelCol = "indexedLabel", featuresCol = "scaledFeatures", stepSize = 0.001)

In [40]:
# Create a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

In [41]:
# Train model
model = pipeline.fit(trainingData)

In [42]:
# Make predictions
predictions2 = model.transform(testData)

In [43]:
# Select example rows to display
predictions2.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [44]:
# Select (prediction, true label) and compute test accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions2)
print("Test set accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Test set accuracy = 0.82278
Test Error = 0.17722


In [45]:
y_true = predictions2.select(['indexedLabel']).collect()
y_pred = predictions2.select(['prediction']).collect()

In [46]:
print(confusion_matrix(y_true, y_pred))

[[49194     0]
 [10596     0]]


In [47]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90     49194
         1.0       0.00      0.00      0.00     10596

    accuracy                           0.82     59790
   macro avg       0.41      0.50      0.45     59790
weighted avg       0.68      0.82      0.74     59790



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
