In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [5]:
import sklearn
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## WARNING, FILE "delay_clean.csv" is > 4 GB  --  added to gitignore
## Using reduced dataset.....

In [7]:
# Load and parse the data file, converting it to a DataFrame
clean = spark.read.format("libsvm").load('/content/drive/MyDrive/Colab_Notebooks/delay_clean2K_SVM.txt')
clean.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [8]:
clean.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [9]:
# Number of rows in dataset
number_rows = clean.count()
number_rows

200000

In [10]:
clean.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|164920|
|  1.0| 35080|
+-----+------+



In [11]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

In [12]:
# Automatically identify categorical features, and index them
# Set maxCategories so features with > 4 distinct values are treated as continuous
featureIndexer = VectorIndexer(inputCol = "features", outputCol = "indexedFeatures", maxCategories = 4).fit(clean)

In [13]:
from pyspark.ml.feature import Normalizer

In [14]:
normalizer = Normalizer(inputCol = "features", outputCol = "normFeatures", p = 1.0)
NormOutput = normalizer.transform(clean)

In [15]:
# Split the data into training and test sets
(trainingData, testData) = clean.randomSplit([0.75, 0.25])

In [16]:
trainingData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [17]:
testData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



## Oversampling performed to dataset
### https://medium.com/@junwan01/oversampling-and-undersampling-with-pyspark-5dbc25cdf253

In [18]:
from pyspark.sql.functions import col, explode, array, lit

In [19]:
major_df = clean.filter(col("label") == 0)
minor_df = clean.filter(col("label") == 1)
ratio = int(major_df.count()/minor_df.count())
print("Ratio of original dataset: {}".format(ratio)+" to 1 (on time : delayed flights)")

Ratio of original dataset: 4 to 1 (on time : delayed flights)


In [20]:
a = range(ratio)

In [21]:
# duplicate the minority rows
oversampled_df = minor_df.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')

In [22]:
# combine both oversampled minority rows and previous majority rows
combined_df = major_df.unionAll(oversampled_df)
combined_df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 20 rows



In [23]:
combined_df.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|164920|
|  1.0|140320|
+-----+------+



In [24]:
# Split the data into training and test sets
(trainingData, testData) = combined_df.randomSplit([0.7, 0.3])

In [25]:
trainingData.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 5 rows



In [26]:
testData.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
|  0.0|(137,[0,1,2,3,4,5...|
+-----+--------------------+
only showing top 20 rows



# Gradient-boosted tree classifier (GBT)

In [82]:
# Train a GBT model
gbt = GBTClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures", maxIter = 30, maxDepth = 10,
                    stepSize = 1)

In [83]:
# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, gbt])

In [84]:
# Train model.  This also runs the indexers
model = pipeline.fit(trainingData)

In [85]:
# Make predictions
predictions = model.transform(testData)

In [86]:
# Select example rows to display
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       1.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [87]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.758809
Test Error = 0.241191


In [88]:
gbtModel = model.stages[2]
print(gbtModel)  # summary only

GBTClassificationModel: uid = GBTClassifier_0815243ad0da, numTrees=30, numClasses=2, numFeatures=137


In [89]:
y_true = predictions.select(['indexedLabel']).collect()
y_pred = predictions.select(['prediction']).collect()

In [90]:
print(confusion_matrix(y_true, y_pred))

[[38138 11520]
 [10575 31375]]


In [91]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.78      0.77      0.78     49658
         1.0       0.73      0.75      0.74     41950

    accuracy                           0.76     91608
   macro avg       0.76      0.76      0.76     91608
weighted avg       0.76      0.76      0.76     91608



In [93]:
importanceSummary = gbtModel.featureImportances
importanceSummary

SparseVector(137, {1: 0.0551, 2: 0.0678, 3: 0.0647, 4: 0.0631, 5: 0.0544, 6: 0.0154, 7: 0.018, 8: 0.0348, 9: 0.0114, 10: 0.012, 11: 0.0125, 12: 0.0146, 13: 0.0973, 14: 0.0054, 15: 0.0407, 16: 0.0096, 17: 0.0115, 18: 0.08, 19: 0.0741, 20: 0.0031, 21: 0.006, 22: 0.0072, 23: 0.0066, 24: 0.0077, 25: 0.0095, 26: 0.0059, 27: 0.0084, 28: 0.0061, 29: 0.0077, 30: 0.0057, 31: 0.0082, 32: 0.0071, 33: 0.0067, 34: 0.0077, 35: 0.0062, 36: 0.0055, 37: 0.0054, 38: 0.0036, 39: 0.0014, 40: 0.0001, 41: 0.0033, 42: 0.0008, 43: 0.0004, 44: 0.0011, 45: 0.004, 46: 0.002, 47: 0.0004, 48: 0.0003, 49: 0.0046, 50: 0.002, 51: 0.0024, 52: 0.0051, 53: 0.0011, 54: 0.0012, 55: 0.001, 56: 0.0003, 57: 0.0001, 58: 0.0003, 59: 0.0013, 60: 0.0005, 61: 0.0004, 62: 0.0002, 63: 0.0004, 64: 0.0005, 65: 0.0023, 66: 0.001, 67: 0.0002, 68: 0.0009, 69: 0.0009, 70: 0.0004, 71: 0.002, 72: 0.0016, 73: 0.0001, 74: 0.0003, 75: 0.0014, 76: 0.0013, 77: 0.0009, 78: 0.0002, 79: 0.0007, 80: 0.0004, 81: 0.001, 82: 0.0013, 83: 0.0009, 84: 0.

In [None]:
#gbt.save("gbt_model.model")
#withopen

# Random forest classifier (RFC)

In [94]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString

In [121]:
# Train a RandomForest model
rf = RandomForestClassifier(labelCol = "indexedLabel", featuresCol = "indexedFeatures", numTrees = 40)

In [122]:
# Convert indexed labels back to original labels
labelConverter = IndexToString(inputCol = "prediction", outputCol = "predictedLabel",
                               labels = labelIndexer.labels)

In [123]:
# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, rf, labelConverter])

In [124]:
# Train model.  This also runs the indexers
model = pipeline.fit(trainingData)

In [125]:
# Make predictions
predictions1 = model.transform(testData)

In [126]:
# Select example rows to display
predictions1.select("predictedLabel", "label", "features").show(5)

+--------------+-----+--------------------+
|predictedLabel|label|            features|
+--------------+-----+--------------------+
|           1.0|  0.0|(137,[0,1,2,3,4,5...|
|           1.0|  0.0|(137,[0,1,2,3,4,5...|
|           1.0|  0.0|(137,[0,1,2,3,4,5...|
|           1.0|  0.0|(137,[0,1,2,3,4,5...|
|           1.0|  0.0|(137,[0,1,2,3,4,5...|
+--------------+-----+--------------------+
only showing top 5 rows



In [127]:
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions1)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.631026
Test Error = 0.368974


In [128]:
rfModel = model.stages[2]
print(rfModel)  # summary only

RandomForestClassificationModel: uid=RandomForestClassifier_4bf288abc716, numTrees=40, numClasses=2, numFeatures=137


In [129]:
print(rfModel.featureImportances)

(137,[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,27,28,29,30,31,32,33,34,35,36,37,38,39,41,43,44,45,47,48,49,50,51,52,53,54,55,58,59,64,65,66,67,68,71,72,75,79,80,82,83,84,86,88,89,91,92,93,94,98,99,101,102,105,108,112,113,114,117,118,120,122,124,125,128,129,134,136],[0.031988998810185895,0.0013205657901055841,0.07164881645394541,0.0054315370185653465,0.007102843355670943,0.022267625425703885,0.008856352758705156,0.008041940232395198,0.015554697611313919,0.005803220589881185,0.0018896894141926972,0.008106176463513994,0.0008727091346435478,0.10583726418435208,0.13872218092199776,0.06765319179918691,0.053030128596152096,0.0626945842943237,0.1280896651433309,0.009312473118195901,0.05037767490450931,0.02205581593340675,0.013257837334217249,0.0037704702950079856,0.00034601952719647195,5.4016341557931426e-05,4.2223522025927394e-05,0.00033186124317006934,0.00036616567754785745,0.0007697339616855969,0.0038355649740051947,0.010393260372927053,0.01110086262807599,0.0028787

In [130]:
y_true = predictions1.select(['indexedLabel']).collect()
y_pred = predictions1.select(['prediction']).collect()

In [131]:
print(confusion_matrix(y_true, y_pred))

[[41786  7872]
 [25929 16021]]


In [132]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.62      0.84      0.71     49658
         1.0       0.67      0.38      0.49     41950

    accuracy                           0.63     91608
   macro avg       0.64      0.61      0.60     91608
weighted avg       0.64      0.63      0.61     91608



# Factorization machines classifier

In [134]:
from pyspark.ml.classification import FMClassifier
from pyspark.ml.feature import MinMaxScaler

In [135]:
# Index labels, adding metadata to the label column
# Fit on whole dataset to include all labels in index
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(clean)

# Scale features
featureScaler = MinMaxScaler(inputCol = "features", outputCol = "scaledFeatures").fit(clean) 

In [136]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = clean.randomSplit([0.7, 0.3])

In [164]:
# Train a FM model
fm = FMClassifier(labelCol = "indexedLabel", featuresCol = "scaledFeatures", stepSize = 0.01)

In [165]:
# Create a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureScaler, fm])

In [166]:
# Train model
model = pipeline.fit(trainingData)

In [167]:
# Make predictions
predictions2 = model.transform(testData)

In [168]:
# Select example rows to display
predictions2.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
|       0.0|         0.0|(137,[0,1,2,3,4,5...|
+----------+------------+--------------------+
only showing top 5 rows



In [169]:
# Select (prediction, true label) and compute test accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy")
accuracy = evaluator.evaluate(predictions2)
print("Test set accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Test set accuracy = 0.828716
Test Error = 0.171284


In [170]:
y_true = predictions2.select(['indexedLabel']).collect()
y_pred = predictions2.select(['prediction']).collect()

In [171]:
print(confusion_matrix(y_true, y_pred))

[[49372   383]
 [ 9954   641]]


In [172]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.99      0.91     49755
         1.0       0.63      0.06      0.11     10595

    accuracy                           0.83     60350
   macro avg       0.73      0.53      0.51     60350
weighted avg       0.80      0.83      0.77     60350

