# Data preaparation and xgboost regressor training

# Spark set up

In [1]:
import platform
local_os = platform.system()
if local_os == 'Linux':
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
    !tar zxvf /content/spark-3.3.1-bin-hadoop3.tgz
    !pip install -q findspark
    import findspark
    findspark.init()
print(local_os)

Windows


In [2]:
from pyspark.sql import SparkSession

In [3]:
import os
if local_os == 'Linux':
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"
elif local_os == 'Windows':
    os.environ["JAVA_HOME"] = "C:/Program Files/Java/jdk-19/"
else:
    os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/"


In [4]:
spark = SparkSession.builder\
        .master("local")\
        .appName("flights")\
        .getOrCreate()

In [5]:
spark

# Libs imports

In [6]:
from pyspark.sql.functions import desc, isnan, when, count, col, isnull

# Data loading

In [7]:
from pathlib import Path

if local_os == "Windows":
    source = "D:/data_sets/airlines/"
elif local_os == 'Linux':
    source = "/content/drive/MyDrive/datasets/flights_kaggle_split"
else:
    source = "../data/"
print(source)
source_path = Path(source).glob('*.parquet')
file_names = sorted(list(source_path))
file_names

D:/data_sets/airlines/


[WindowsPath('D:/data_sets/airlines/Combined_Flights_2018.parquet'),
 WindowsPath('D:/data_sets/airlines/Combined_Flights_2019.parquet'),
 WindowsPath('D:/data_sets/airlines/Combined_Flights_2020.parquet'),
 WindowsPath('D:/data_sets/airlines/Combined_Flights_2021.parquet'),
 WindowsPath('D:/data_sets/airlines/Combined_Flights_2022.parquet')]

In [8]:
def merge_data(file_names):
  first_file = file_names.pop(0)
  data = spark.read.parquet(first_file.as_posix())
  for file_name in file_names:
    temp_data = spark.read.parquet(file_name.as_posix())
    data = data.union(temp_data)
    print(file_name.as_posix())
  return data
data = merge_data(file_names)

D:/data_sets/airlines/Combined_Flights_2019.parquet
D:/data_sets/airlines/Combined_Flights_2020.parquet
D:/data_sets/airlines/Combined_Flights_2021.parquet
D:/data_sets/airlines/Combined_Flights_2022.parquet


# EDA

In [9]:
data.count(), len(data.columns)

(29193782, 62)

In [10]:
data.show(5)

+-------------------+-----------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+-----------------+
|         FlightDate|          Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|D

## Data types exploration
Handling categorical values to train model

In [11]:
data = data.drop(data.__index_level_0__)

In [12]:
dtypes = set()
[dtypes.add(item[1]) for item in data.dtypes]
dtypes

{'bigint', 'boolean', 'double', 'string', 'timestamp'}

### Boolean exploration

In [13]:
[item[0] for item in data.dtypes if item[1] == 'boolean']

['Cancelled', 'Diverted']

In [14]:
data.groupBy('Cancelled').count().orderBy('count').collect()

[Row(Cancelled=True, count=777267), Row(Cancelled=False, count=28416515)]

In [15]:
data.groupBy('Diverted').count().orderBy('count').collect()

[Row(Diverted=True, count=68349), Row(Diverted=False, count=29125433)]

### String exploration

In [16]:
str_columns = [item[0] for item in data.dtypes if item[1] == 'string']

In [17]:
data.select(str_columns).show(5)

+-----------------+------+----+-------------------------+---------------------------------------+---------------------------+-----------------+---------------------------+-----------+--------------+-----------+---------------+------------+---------+-------------+----------+----------+
|          Airline|Origin|Dest|Marketing_Airline_Network|Operated_or_Branded_Code_Share_Partners|IATA_Code_Marketing_Airline|Operating_Airline|IATA_Code_Operating_Airline|Tail_Number|OriginCityName|OriginState|OriginStateName|DestCityName|DestState|DestStateName|DepTimeBlk|ArrTimeBlk|
+-----------------+------+----+-------------------------+---------------------------------------+---------------------------+-----------------+---------------------------+-----------+--------------+-----------+---------------+------------+---------+-------------+----------+----------+
|Endeavor Air Inc.|   ABY| ATL|                       DL|                           DL_CODESHARE|                         DL|               9E

### Timestamp exploration

In [18]:
timestamp_columns = [item[0] for item in data.dtypes if item[1] == 'timestamp']

In [19]:
data.select(timestamp_columns).show(5)

+-------------------+
|         FlightDate|
+-------------------+
|2018-01-22 18:00:00|
|2018-01-23 18:00:00|
|2018-01-24 18:00:00|
|2018-01-25 18:00:00|
|2018-01-26 18:00:00|
+-------------------+
only showing top 5 rows



## Null values handling

In [20]:
null_count = data.select([count(when(col(c).isNull(), c)).alias(c) for c in data.columns])
null_count.show()

+----------+-------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+
|FlightDate|Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|

In [21]:
null_values = null_count.collect()[0].asDict()

In [22]:
only_nulls = null_count.select([key for key in null_values if null_values[key] != 0])
only_nulls.show()

+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+-----------+--------+--------------------+-------+---------+--------+------+--------+--------+------------------+------------------+
|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSElapsedTime|ActualElapsedTime|Tail_Number|DepDel15|DepartureDelayGroups|TaxiOut|WheelsOff|WheelsOn|TaxiIn|ArrDelay|ArrDel15|ArrivalDelayGroups|DivAirportLandings|
+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+-----------+--------+--------------------+-------+---------+--------+------+--------+--------+------------------+------------------+
| 761652|         763084|  763084| 786177|         846183| 852561|            22|           845637|     267613|  763084|              763084| 780561|   780551|  793133|793143|  846183|  846183|            846183|                90|
+-------+---------------+--------+-------+---------------+-------+------

## Drop all null values

In [23]:
data_no_na = data.dropna()

# Feature selection

In [24]:
#Drop timestamp data
data_no_na = data_no_na.drop('FlightDate')

In [25]:
data_no_na.count(), len(data_no_na.columns)

(28339510, 60)

In [26]:
cols_to_encode = ['Airline', 'Origin', 'Dest']
[str_columns.remove(x) for x in cols_to_encode]
None

In [27]:
data_no_na = data_no_na.drop(*str_columns)
data_no_na.count(), len(data_no_na.columns)

(28339510, 46)

In [28]:
data_no_na.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: long (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: long (nullable = true)
 |-- Quarter: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- DayofMonth: long (nullable = true)
 |-- DayOfWeek: long (nullable = true)
 |-- DOT_ID_Marketing_Airline: long (nullable = true)
 |-- Flight_Number_Marketing_Airline: long (nullable = true)
 |-- DOT_ID_Operating_Airline: long (nullable = true)
 |-- Flight_Number_

In [29]:
dep_cols = [x for x in data_no_na.columns if "Dep" in x]
arr_cols = [x for x in data_no_na.columns if "Arr" in x]

In [30]:
time_columns = dep_cols + arr_cols
time_columns

['CRSDepTime',
 'DepTime',
 'DepDelayMinutes',
 'DepDelay',
 'DepDel15',
 'DepartureDelayGroups',
 'ArrTime',
 'ArrDelayMinutes',
 'CRSArrTime',
 'ArrDelay',
 'ArrDel15',
 'ArrivalDelayGroups']

In [31]:
time_columns.remove('DepTime')
# time_columns.remove('ArrTime')
time_columns

['CRSDepTime',
 'DepDelayMinutes',
 'DepDelay',
 'DepDel15',
 'DepartureDelayGroups',
 'ArrTime',
 'ArrDelayMinutes',
 'CRSArrTime',
 'ArrDelay',
 'ArrDel15',
 'ArrivalDelayGroups']

In [32]:
data_no_na.select('DepDelay').describe().show()

+-------+-----------------+
|summary|         DepDelay|
+-------+-----------------+
|  count|         28339510|
|   mean| 9.23847367156313|
| stddev|47.10140749050439|
|    min|          -1280.0|
|    max|           7223.0|
+-------+-----------------+



In [33]:
data_no_na.select('ArrDelay').describe().show()

+-------+------------------+
|summary|          ArrDelay|
+-------+------------------+
|  count|          28339510|
|   mean|3.6081859213514984|
| stddev| 49.28063347282263|
|    min|           -1290.0|
|    max|            7232.0|
+-------+------------------+



In [34]:
data_no_na = data_no_na.drop(*time_columns)

In [35]:
len(data_no_na.columns)

35

## One hot enconding

In [36]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [37]:
def convert_categoric_to_numeric(dataframe, columns):
    ''' Use string indexer to code categorical to numerical and create new column on dataframe'''
    new_cols = []
    for column in columns:
        num_col= column + "_num"
        new_cols.append(num_col)
        indexer = StringIndexer(inputCol=column, outputCol=num_col).fit(data_no_na)
        dataframe = indexer.transform(dataframe)
    return dataframe, new_cols
df, new_cols = convert_categoric_to_numeric(data_no_na, cols_to_encode)
# Drop remaining columns
# df = df.drop(*cols_to_encode)
df.show(5)

+-----------------+------+----+---------+--------+-------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+------------------------+-------------------------------+------------------------+-------------------------------+---------------+------------------+------------------+---------------+---------+-------------+----------------+----------------+-------------+-------+-------+---------+--------+------+-------------+------------------+-----------+----------+--------+
|          Airline|Origin|Dest|Cancelled|Diverted|DepTime|AirTime|CRSElapsedTime|ActualElapsedTime|Distance|Year|Quarter|Month|DayofMonth|DayOfWeek|DOT_ID_Marketing_Airline|Flight_Number_Marketing_Airline|DOT_ID_Operating_Airline|Flight_Number_Operating_Airline|OriginAirportID|OriginAirportSeqID|OriginCityMarketID|OriginStateFips|OriginWac|DestAirportID|DestAirportSeqID|DestCityMarketID|DestStateFips|DestWac|TaxiOut|WheelsOff|WheelsOn|TaxiIn|DistanceGroup|DivAirportLandings|Airline_n

In [38]:
def generate_dummy_columns(dataframe, columns):
    ''' Use OneHotEncoder to convert enconded variables to vec column on dataframe'''
    for column in columns:
        col_vector = column + "_vec"
        enconder = OneHotEncoder(inputCol=column, outputCol=col_vector, dropLast=False).fit(dataframe)
        dataframe = enconder.transform(dataframe)
    return dataframe
df = generate_dummy_columns(df, new_cols)
# Drop encoded columns
df = df.drop(*new_cols)

In [41]:
from pyspark.ml.functions import vector_to_array

In [54]:
df_col_onehot = df.select('*', vector_to_array('Airline_num_vec').alias('airline_one_hot'))
df_col_onehot.show(1)


+-----------------+------+----+---------+--------+-------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+------------------------+-------------------------------+------------------------+-------------------------------+---------------+------------------+------------------+---------------+---------+-------------+----------------+----------------+-------------+-------+-------+---------+--------+------+-------------+------------------+---------------+-----------------+---------------+--------------------+
|          Airline|Origin|Dest|Cancelled|Diverted|DepTime|AirTime|CRSElapsedTime|ActualElapsedTime|Distance|Year|Quarter|Month|DayofMonth|DayOfWeek|DOT_ID_Marketing_Airline|Flight_Number_Marketing_Airline|DOT_ID_Operating_Airline|Flight_Number_Operating_Airline|OriginAirportID|OriginAirportSeqID|OriginCityMarketID|OriginStateFips|OriginWac|DestAirportID|DestAirportSeqID|DestCityMarketID|DestStateFips|DestWac|TaxiOut|WheelsOff|WheelsOn|TaxiIn|Dis

In [55]:
len(df_col_onehot.first()["airline_one_hot"])

28

In [71]:
num_categories = len(df_col_onehot.first()["airline_one_hot"])
cols_expanded = [(col('airline_one_hot')[i]) for i in range(num_categories)]
df_cols_onehot = df_col_onehot.select('*', *cols_expanded)
df_cols_onehot.show(1)

+-----------------+------+----+---------+--------+-------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+------------------------+-------------------------------+------------------------+-------------------------------+---------------+------------------+------------------+---------------+---------+-------------+----------------+----------------+-------------+-------+-------+---------+--------+------+-------------+------------------+---------------+-----------------+---------------+--------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+

In [78]:
cols_expanded = [(col('airline_one_hot')[i].alias(f'{indexer.labels[i]}')) for i in range(num_categories)]
df_cols_named =  df_col_onehot.select('*', *cols_expanded)
df_cols_named.show(1)

+-----------------+------+----+---------+--------+-------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+------------------------+-------------------------------+------------------------+-------------------------------+---------------+------------------+------------------+---------------+---------+-------------+----------------+----------------+-------------+-------+-------+---------+--------+------+-------------+------------------+---------------+-----------------+---------------+--------------------+----------------------+--------------------+---------------------+----------------------+---------------------+-----------------+---------------+---------+-----------------+-----------+--------------------+----------------+------------------+----------------------+-------------+-----------+---------------------------+---------------------------+------------------------+----------------------+----------------------------------------+--------------

In [72]:
indexer = StringIndexer(inputCol="Airline", outputCol="airline_encoded").fit(df_cols_onehot)

['Southwest Airlines Co.',
 'Delta Air Lines Inc.',
 'SkyWest Airlines Inc.',
 'American Airlines Inc.',
 'United Air Lines Inc.',
 'Republic Airlines',
 'JetBlue Airways',
 'Envoy Air',
 'Endeavor Air Inc.',
 'Comair Inc.',
 'Alaska Airlines Inc.',
 'Spirit Air Lines',
 'Mesa Airlines Inc.',
 'Frontier Airlines Inc.',
 'Allegiant Air',
 'Horizon Air',
 'Capital Cargo International',
 'Air Wisconsin Airlines Corp',
 'ExpressJet Airlines Inc.',
 'Hawaiian Airlines Inc.',
 'GoJet Airlines, LLC d/b/a United Express',
 'Commutair Aka Champlain Enterprises, Inc.',
 'Trans States Airlines',
 'Compass Airlines',
 'Virgin America',
 'Empire Airlines Inc.',
 'Peninsula Airways Inc.',
 'Cape Air']

## Data preparation

In [59]:
df.printSchema()

root
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: long (nullable = true)
 |-- Quarter: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- DayofMonth: long (nullable = true)
 |-- DayOfWeek: long (nullable = true)
 |-- DOT_ID_Marketing_Airline: long (nullable = true)
 |-- Flight_Number_Marketing_Airline: long (nullable = true)
 |-- DOT_ID_Operating_Airline: long (nullable = true)
 |-- Flight_Number_Operating_Airline: long (nullable = true)
 |-- OriginAirportID: long (nullable = true)
 |-- OriginAirportSeqID: long (nullable = true)
 |-- OriginCityMarketID: long (nullable = true)
 |-- OriginStateFips: long (nullable = true)
 |-- OriginWac: long (nullable = true)
 |-- DestAirportID: long (nullable = true)
 |-- DestAirport

## 

# Train XGBoost model

In [51]:
from xgboost.spark import SparkXGBRegressor

In [53]:
label = "DepTime"
feature_names = [x.name for x in df.schema if x.name != label]

In [57]:
regressor = SparkXGBRegressor(
  features_col=feature_names,
  label_col=label,
  num_workers=1,
  use_gpu=True,
)

In [58]:
model = regressor.fit(df)

If features_cols param set, then features_col param is ignored.
You enabled use_gpu in spark local mode. Please make sure your local node has at least 1 GPUs


ValueError: Values in feature columns must be integral types or float/double types.