In [6]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [7]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [8]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# WARNING, FILE "delay_clean.csv" is > 4 GB  --  added to gitignore

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# Load the csv into a dataframe
clean = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/delay_clean.csv', header = True, inferSchema = True)
clean.limit(10)

DataFrame[MONTH: int, DAY_OF_WEEK: int, DEP_DEL15: int, DISTANCE_GROUP: int, SEGMENT_NUMBER: int, CONCURRENT_FLIGHTS: int, NUMBER_OF_SEATS: int, AIRPORT_FLIGHTS_MONTH: int, AIRLINE_FLIGHTS_MONTH: int, AIRLINE_AIRPORT_FLIGHTS_MONTH: int, AVG_MONTHLY_PASS_AIRPORT: int, AVG_MONTHLY_PASS_AIRLINE: int, FLT_ATTENDANTS_PER_PASS: double, GROUND_SERV_PER_PASS: double, PLANE_AGE: int, PREVIOUS_AIRPORT: string, PRCP: double, SNOW: double, SNWD: double, TMAX: double, AWND: double, DEP_TIME_BLK_0001-0559: double, DEP_TIME_BLK_0600-0659: double, DEP_TIME_BLK_0700-0759: double, DEP_TIME_BLK_0800-0859: double, DEP_TIME_BLK_0900-0959: double, DEP_TIME_BLK_1000-1059: double, DEP_TIME_BLK_1100-1159: double, DEP_TIME_BLK_1200-1259: double, DEP_TIME_BLK_1300-1359: double, DEP_TIME_BLK_1400-1459: double, DEP_TIME_BLK_1500-1559: double, DEP_TIME_BLK_1600-1659: double, DEP_TIME_BLK_1700-1759: double, DEP_TIME_BLK_1800-1859: double, DEP_TIME_BLK_1900-1959: double, DEP_TIME_BLK_2000-2059: double, DEP_TIME_BLK_2

## Linear Regression

In [35]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import IntegerType

In [36]:
clean = clean.withColumn("PREVIOUS_AIRPORT",clean["PREVIOUS_AIRPORT"].cast(IntegerType()))

In [37]:
clean.printSchema()

root
 |-- MONTH: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- DEP_DEL15: integer (nullable = true)
 |-- DISTANCE_GROUP: integer (nullable = true)
 |-- SEGMENT_NUMBER: integer (nullable = true)
 |-- CONCURRENT_FLIGHTS: integer (nullable = true)
 |-- NUMBER_OF_SEATS: integer (nullable = true)
 |-- AIRPORT_FLIGHTS_MONTH: integer (nullable = true)
 |-- AIRLINE_FLIGHTS_MONTH: integer (nullable = true)
 |-- AIRLINE_AIRPORT_FLIGHTS_MONTH: integer (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRPORT: integer (nullable = true)
 |-- AVG_MONTHLY_PASS_AIRLINE: integer (nullable = true)
 |-- FLT_ATTENDANTS_PER_PASS: double (nullable = true)
 |-- GROUND_SERV_PER_PASS: double (nullable = true)
 |-- PLANE_AGE: integer (nullable = true)
 |-- PREVIOUS_AIRPORT: integer (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- SNOW: double (nullable = true)
 |-- SNWD: double (nullable = true)
 |-- TMAX: double (nullable = true)
 |-- AWND: double (nullable = true)
 |-- DEP_TIME_BL

In [38]:
inputColList = clean.columns
inputColList

['MONTH',
 'DAY_OF_WEEK',
 'DEP_DEL15',
 'DISTANCE_GROUP',
 'SEGMENT_NUMBER',
 'CONCURRENT_FLIGHTS',
 'NUMBER_OF_SEATS',
 'AIRPORT_FLIGHTS_MONTH',
 'AIRLINE_FLIGHTS_MONTH',
 'AIRLINE_AIRPORT_FLIGHTS_MONTH',
 'AVG_MONTHLY_PASS_AIRPORT',
 'AVG_MONTHLY_PASS_AIRLINE',
 'FLT_ATTENDANTS_PER_PASS',
 'GROUND_SERV_PER_PASS',
 'PLANE_AGE',
 'PREVIOUS_AIRPORT',
 'PRCP',
 'SNOW',
 'SNWD',
 'TMAX',
 'AWND',
 'DEP_TIME_BLK_0001-0559',
 'DEP_TIME_BLK_0600-0659',
 'DEP_TIME_BLK_0700-0759',
 'DEP_TIME_BLK_0800-0859',
 'DEP_TIME_BLK_0900-0959',
 'DEP_TIME_BLK_1000-1059',
 'DEP_TIME_BLK_1100-1159',
 'DEP_TIME_BLK_1200-1259',
 'DEP_TIME_BLK_1300-1359',
 'DEP_TIME_BLK_1400-1459',
 'DEP_TIME_BLK_1500-1559',
 'DEP_TIME_BLK_1600-1659',
 'DEP_TIME_BLK_1700-1759',
 'DEP_TIME_BLK_1800-1859',
 'DEP_TIME_BLK_1900-1959',
 'DEP_TIME_BLK_2000-2059',
 'DEP_TIME_BLK_2100-2159',
 'DEP_TIME_BLK_2200-2259',
 'DEP_TIME_BLK_2300-2359',
 'CARRIER_NAME_Alaska Airlines Inc.',
 'CARRIER_NAME_Allegiant Air',
 'CARRIER_NAME_Ameri

In [39]:
#Input all the features in one vector column
assembler = VectorAssembler(inputCols = ['MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'SEGMENT_NUMBER',
 'CONCURRENT_FLIGHTS', 'NUMBER_OF_SEATS', 'AIRPORT_FLIGHTS_MONTH', 'AIRLINE_FLIGHTS_MONTH', 'AIRLINE_AIRPORT_FLIGHTS_MONTH',
 'AVG_MONTHLY_PASS_AIRPORT', 'AVG_MONTHLY_PASS_AIRLINE', 'FLT_ATTENDANTS_PER_PASS', 'GROUND_SERV_PER_PASS',
 'PLANE_AGE', 'PREVIOUS_AIRPORT',
 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'AWND', 'DEP_TIME_BLK_0001-0559', 'DEP_TIME_BLK_0600-0659', 'DEP_TIME_BLK_0700-0759',
 'DEP_TIME_BLK_0800-0859', 'DEP_TIME_BLK_0900-0959', 'DEP_TIME_BLK_1000-1059', 'DEP_TIME_BLK_1100-1159',
 'DEP_TIME_BLK_1200-1259', 'DEP_TIME_BLK_1300-1359', 'DEP_TIME_BLK_1400-1459', 'DEP_TIME_BLK_1500-1559',
 'DEP_TIME_BLK_1600-1659', 'DEP_TIME_BLK_1700-1759', 'DEP_TIME_BLK_1800-1859', 'DEP_TIME_BLK_1900-1959',
 'DEP_TIME_BLK_2000-2059', 'DEP_TIME_BLK_2100-2159', 'DEP_TIME_BLK_2200-2259', 'DEP_TIME_BLK_2300-2359',
 'CARRIER_NAME_Alaska Airlines Inc.', 'CARRIER_NAME_Allegiant Air', 'CARRIER_NAME_American Airlines Inc.',
 'CARRIER_NAME_American Eagle Airlines Inc.', 'CARRIER_NAME_Atlantic Southeast Airlines', 'CARRIER_NAME_Comair Inc.',
 'CARRIER_NAME_Delta Air Lines Inc.', 'CARRIER_NAME_Endeavor Air Inc.', 'CARRIER_NAME_Frontier Airlines Inc.',
 'CARRIER_NAME_Hawaiian Airlines Inc.', 'CARRIER_NAME_JetBlue Airways', 'CARRIER_NAME_Mesa Airlines Inc.',
 'CARRIER_NAME_Midwest Airline, Inc.', 'CARRIER_NAME_SkyWest Airlines Inc.', 'CARRIER_NAME_Southwest Airlines Co.',
 'CARRIER_NAME_Spirit Air Lines', 'CARRIER_NAME_United Air Lines Inc.', 'DEPARTING_AIRPORT_Adams Field',
 'DEPARTING_AIRPORT_Albany International', 'DEPARTING_AIRPORT_Albuquerque International Sunport',
 'DEPARTING_AIRPORT_Anchorage International', 'DEPARTING_AIRPORT_Atlanta Municipal',
 'DEPARTING_AIRPORT_Austin - Bergstrom International', 'DEPARTING_AIRPORT_Birmingham Airport',
 'DEPARTING_AIRPORT_Boise Air Terminal', 'DEPARTING_AIRPORT_Bradley International',
 'DEPARTING_AIRPORT_Charleston International', 'DEPARTING_AIRPORT_Chicago Midway International',
 "DEPARTING_AIRPORT_Chicago O'Hare International", 'DEPARTING_AIRPORT_Cincinnati/Northern Kentucky International',
 'DEPARTING_AIRPORT_Cleveland-Hopkins International', 'DEPARTING_AIRPORT_Dallas Fort Worth Regional',
 'DEPARTING_AIRPORT_Dallas Love Field', 'DEPARTING_AIRPORT_Des Moines Municipal',
 'DEPARTING_AIRPORT_Detroit Metro Wayne County', 'DEPARTING_AIRPORT_Douglas Municipal',
 'DEPARTING_AIRPORT_El Paso International',
 'DEPARTING_AIRPORT_Eppley Airfield',
 'DEPARTING_AIRPORT_Fort Lauderdale-Hollywood International',
 'DEPARTING_AIRPORT_Friendship International',
 'DEPARTING_AIRPORT_General Mitchell Field',
 'DEPARTING_AIRPORT_Greater Buffalo International',
 'DEPARTING_AIRPORT_Greenville-Spartanburg',
 'DEPARTING_AIRPORT_Hollywood-Burbank Midpoint',
 'DEPARTING_AIRPORT_Honolulu International',
 'DEPARTING_AIRPORT_Houston Intercontinental',
 'DEPARTING_AIRPORT_Indianapolis Muni/Weir Cook',
 'DEPARTING_AIRPORT_Jacksonville International',
 'DEPARTING_AIRPORT_James M Cox/Dayton International',
 'DEPARTING_AIRPORT_John F. Kennedy International',
 'DEPARTING_AIRPORT_Kahului Airport',
 'DEPARTING_AIRPORT_Kansas City International',
 'DEPARTING_AIRPORT_Keahole',
 'DEPARTING_AIRPORT_Kent County',
 'DEPARTING_AIRPORT_LaGuardia',
 'DEPARTING_AIRPORT_Lambert-St. Louis International',
 'DEPARTING_AIRPORT_Lihue Airport',
 'DEPARTING_AIRPORT_Logan International',
 'DEPARTING_AIRPORT_Long Beach Daugherty Field',
 'DEPARTING_AIRPORT_Los Angeles International',
 'DEPARTING_AIRPORT_Louis Armstrong New Orleans International',
 'DEPARTING_AIRPORT_McCarran International',
 'DEPARTING_AIRPORT_McGhee Tyson',
 'DEPARTING_AIRPORT_Memphis International',
 'DEPARTING_AIRPORT_Metropolitan Oakland International',
 'DEPARTING_AIRPORT_Miami International',
 'DEPARTING_AIRPORT_Minneapolis-St Paul International',
 'DEPARTING_AIRPORT_Myrtle Beach International',
 'DEPARTING_AIRPORT_Nashville International',
 'DEPARTING_AIRPORT_Newark Liberty International',
 'DEPARTING_AIRPORT_Norfolk International',
 'DEPARTING_AIRPORT_Northwest Arkansas Regional',
 'DEPARTING_AIRPORT_Ontario International',
 'DEPARTING_AIRPORT_Orange County',
 'DEPARTING_AIRPORT_Orlando International',
 'DEPARTING_AIRPORT_Palm Beach International',
 'DEPARTING_AIRPORT_Palm Springs International',
 'DEPARTING_AIRPORT_Pensacola Regional',
 'DEPARTING_AIRPORT_Philadelphia International',
 'DEPARTING_AIRPORT_Phoenix Sky Harbor International',
 'DEPARTING_AIRPORT_Piedmont Triad International',
 'DEPARTING_AIRPORT_Pittsburgh International',
 'DEPARTING_AIRPORT_Port Columbus International',
 'DEPARTING_AIRPORT_Portland International',
 'DEPARTING_AIRPORT_Portland International Jetport',
 'DEPARTING_AIRPORT_Puerto Rico International',
 'DEPARTING_AIRPORT_Raleigh-Durham International',
 'DEPARTING_AIRPORT_Reno/Tahoe International',
 'DEPARTING_AIRPORT_Richmond International',
 'DEPARTING_AIRPORT_Rochester Monroe County',
 'DEPARTING_AIRPORT_Ronald Reagan Washington National',
 'DEPARTING_AIRPORT_Sacramento International',
 'DEPARTING_AIRPORT_Salt Lake City International',
 'DEPARTING_AIRPORT_San Antonio International',
 'DEPARTING_AIRPORT_San Diego International Lindbergh Fl',
 'DEPARTING_AIRPORT_San Francisco International',
 'DEPARTING_AIRPORT_San Jose International',
 'DEPARTING_AIRPORT_Sanford NAS',
 'DEPARTING_AIRPORT_Savannah/Hilton Head International',
 'DEPARTING_AIRPORT_Seattle International',
 'DEPARTING_AIRPORT_Southwest Florida International',
 'DEPARTING_AIRPORT_Spokane International',
 'DEPARTING_AIRPORT_Standiford Field',
 'DEPARTING_AIRPORT_Stapleton International',
 'DEPARTING_AIRPORT_Syracuse Hancock International',
 'DEPARTING_AIRPORT_Tampa International',
 'DEPARTING_AIRPORT_Theodore Francis Green State',
 'DEPARTING_AIRPORT_Truax Field',
 'DEPARTING_AIRPORT_Tucson International',
 'DEPARTING_AIRPORT_Tulsa International',
 'DEPARTING_AIRPORT_Washington Dulles International',
 'DEPARTING_AIRPORT_Will Rogers World',
 'DEPARTING_AIRPORT_William P Hobby',
 'principal_component'], outputCol = 'DEP_DEL15')

In [40]:
output = assembler.transform(clean)

IllegalArgumentException: ignored

In [None]:
# Remove "DEP_DEL15" target (delayed more than 15 minutes) from features data

y = clean.select(col("DEP_DEL15")).collect()
X = clean.drop(col("DEP_DEL15"))

In [None]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 50, stratify = y)

TypeError: ignored

In [None]:
X_test.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,AIRPORT_FLIGHTS_MONTH,AIRLINE_FLIGHTS_MONTH,AIRLINE_AIRPORT_FLIGHTS_MONTH,AVG_MONTHLY_PASS_AIRPORT,...,DEPARTING_AIRPORT_Syracuse Hancock International,DEPARTING_AIRPORT_Tampa International,DEPARTING_AIRPORT_Theodore Francis Green State,DEPARTING_AIRPORT_Truax Field,DEPARTING_AIRPORT_Tucson International,DEPARTING_AIRPORT_Tulsa International,DEPARTING_AIRPORT_Washington Dulles International,DEPARTING_AIRPORT_Will Rogers World,DEPARTING_AIRPORT_William P Hobby,principal_component
3980710,8,6,1,4,24,50,10855,28893,339,1680928,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-20.625256
1460088,3,1,2,1,5,76,4028,26929,1118,344196,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.505072
6282027,12,1,1,4,60,66,27188,70473,5294,3103410,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.564951
3063064,6,6,5,1,20,162,6007,24204,411,874468,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-11.343075
5992594,12,1,4,1,6,143,4098,113248,868,385767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-14.16599


In [None]:
# Create a StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

In [None]:
# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators = 128, random_state = 50)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")