# Data preaparation and xgboost regressor training

# Spark set up

In [3]:
import platform
local_os = platform.system()
if local_os == 'Linux':
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
    !tar zxvf /content/spark-3.3.1-bin-hadoop3.tgz
    !pip install -q findspark
    import findspark
    findspark.init()

In [4]:
from pyspark.sql import SparkSession

In [5]:
import os
if local_os == 'Linux':
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"
else:
    os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk/"


In [6]:
spark = SparkSession.builder\
        .master("local")\
        .appName("flights")\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/28 20:59:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
spark

# Libs imports

In [8]:
from pyspark.sql.functions import desc, isnan, when, count, col, isnull

# Data loading

In [9]:
from pathlib import Path
# source = "/content/drive/MyDrive/datasets/flights_kaggle_split"
source = "../data/"
source_path = Path(source).glob('*.parquet')
file_names = sorted(list(source_path))
file_names

[PosixPath('../data/Combined_Flights_2018.parquet'),
 PosixPath('../data/Combined_Flights_2019.parquet'),
 PosixPath('../data/Combined_Flights_2020.parquet'),
 PosixPath('../data/Combined_Flights_2021.parquet'),
 PosixPath('../data/Combined_Flights_2022.parquet')]

In [10]:
def merge_data(file_names):
  first_file = file_names.pop(0)
  data = spark.read.parquet(first_file.as_posix())
  for file_name in file_names:
    temp_data = spark.read.parquet(file_name.as_posix())
    data = data.union(temp_data)
    print(file_name.as_posix())
  return data
data = merge_data(file_names)

                                                                                

../data/Combined_Flights_2019.parquet
../data/Combined_Flights_2020.parquet
../data/Combined_Flights_2021.parquet
../data/Combined_Flights_2022.parquet


# EDA

In [11]:
data.count(), len(data.columns)

(29193782, 62)

In [12]:
data.show(5)

22/11/28 20:59:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-------------------+-----------------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+-

                                                                                

## Data types exploration
Handling categorical values to train model

In [13]:
data = data.drop(data.__index_level_0__)

In [14]:
dtypes = set()
[dtypes.add(item[1]) for item in data.dtypes]
dtypes

{'bigint', 'boolean', 'double', 'string', 'timestamp'}

### Boolean exploration

In [15]:
[item[0] for item in data.dtypes if item[1] == 'boolean']

['Cancelled', 'Diverted']

In [16]:
data.groupBy('Cancelled').count().orderBy('count').collect()

                                                                                

[Row(Cancelled=True, count=777267), Row(Cancelled=False, count=28416515)]

In [17]:
data.groupBy('Diverted').count().orderBy('count').collect()

[Row(Diverted=True, count=68349), Row(Diverted=False, count=29125433)]

### String exploration

In [18]:
str_columns = [item[0] for item in data.dtypes if item[1] == 'string']

In [19]:
data.select(str_columns).show(5)

+-----------------+------+----+-------------------------+---------------------------------------+---------------------------+-----------------+---------------------------+-----------+--------------+-----------+---------------+------------+---------+-------------+----------+----------+
|          Airline|Origin|Dest|Marketing_Airline_Network|Operated_or_Branded_Code_Share_Partners|IATA_Code_Marketing_Airline|Operating_Airline|IATA_Code_Operating_Airline|Tail_Number|OriginCityName|OriginState|OriginStateName|DestCityName|DestState|DestStateName|DepTimeBlk|ArrTimeBlk|
+-----------------+------+----+-------------------------+---------------------------------------+---------------------------+-----------------+---------------------------+-----------+--------------+-----------+---------------+------------+---------+-------------+----------+----------+
|Endeavor Air Inc.|   ABY| ATL|                       DL|                           DL_CODESHARE|                         DL|               9E

### Timestamp exploration

In [20]:
timestamp_columns = [item[0] for item in data.dtypes if item[1] == 'timestamp']

In [21]:
data.select(timestamp_columns).show(5)

+-------------------+
|         FlightDate|
+-------------------+
|2018-01-22 18:00:00|
|2018-01-23 18:00:00|
|2018-01-24 18:00:00|
|2018-01-25 18:00:00|
|2018-01-26 18:00:00|
+-------------------+
only showing top 5 rows



## Null values handling

In [22]:
null_count = data.select([count(when(col(c).isNull(), c)).alias(c) for c in data.columns])
null_count.show()



+----------+-------+------+----+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+-------------------------+---------------------------------------+------------------------+---------------------------+-------------------------------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+--------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+------------+---------+-------------+-------------+-------+--------+--------------------+----------+-------+---------+--------+------+----------+--------+--------+------------------+----------+-------------+------------------+
|FlightDate|Airline|Origin|Dest|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|

                                                                                

In [23]:
null_values = null_count.collect()[0].asDict()

                                                                                

In [24]:
only_nulls = null_count.select([key for key in null_values if null_values[key] != 0])
only_nulls.show()



+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+-----------+--------+--------------------+-------+---------+--------+------+--------+--------+------------------+------------------+
|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSElapsedTime|ActualElapsedTime|Tail_Number|DepDel15|DepartureDelayGroups|TaxiOut|WheelsOff|WheelsOn|TaxiIn|ArrDelay|ArrDel15|ArrivalDelayGroups|DivAirportLandings|
+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+-----------+--------+--------------------+-------+---------+--------+------+--------+--------+------------------+------------------+
| 761652|         763084|  763084| 786177|         846183| 852561|            22|           845637|     267613|  763084|              763084| 780561|   780551|  793133|793143|  846183|  846183|            846183|                90|
+-------+---------------+--------+-------+---------------+-------+------

                                                                                

## Drop all null values

In [25]:
data_no_na = data.dropna()

# Feature selection

In [26]:
#Drop timestamp data
data_no_na = data_no_na.drop('FlightDate')

In [27]:
data_no_na.count(), len(data_no_na.columns)

                                                                                

(28339510, 60)

In [28]:
remain_cols = ['Airline','Origin', 'Dest']
[str_columns.remove(x) for x in remain_cols ]

[None, None, None]

In [29]:
data_no_na = data_no_na.drop(*str_columns)
data_no_na.count(), len(data_no_na.columns)

                                                                                

(28339510, 46)

In [30]:
data_no_na.printSchema()

root
 |-- Airline: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Cancelled: boolean (nullable = true)
 |-- Diverted: boolean (nullable = true)
 |-- CRSDepTime: long (nullable = true)
 |-- DepTime: double (nullable = true)
 |-- DepDelayMinutes: double (nullable = true)
 |-- DepDelay: double (nullable = true)
 |-- ArrTime: double (nullable = true)
 |-- ArrDelayMinutes: double (nullable = true)
 |-- AirTime: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- ActualElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Year: long (nullable = true)
 |-- Quarter: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- DayofMonth: long (nullable = true)
 |-- DayOfWeek: long (nullable = true)
 |-- DOT_ID_Marketing_Airline: long (nullable = true)
 |-- Flight_Number_Marketing_Airline: long (nullable = true)
 |-- DOT_ID_Operating_Airline: long (nullable = true)
 |-- Flight_Number_

## One hot enconding

In [31]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [32]:
def convert_categoric_to_numeric(dataframe, columns):
    ''' Use string indexer to code categorical to numerical and create new column on dataframe'''
    new_cols = []
    for column in columns:
        num_col= column + "_num"
        new_cols.append(num_col)
        indexer = StringIndexer(inputCol=column, outputCol=num_col).fit(data_no_na)
        dataframe = indexer.transform(dataframe)
    return dataframe, new_cols
df, new_cols = convert_categoric_to_numeric(data_no_na, remain_cols)
# Drop remaining columns
df = df.drop(*remain_cols)
df.show(5)

                                                                                

+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+------------------------+-------------------------------+------------------------+-------------------------------+---------------+------------------+------------------+---------------+---------+-------------+----------------+----------------+-------------+-------+--------+--------------------+-------+---------+--------+------+----------+--------+--------+------------------+-------------+------------------+-----------+----------+--------+
|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSElapsedTime|ActualElapsedTime|Distance|Year|Quarter|Month|DayofMonth|DayOfWeek|DOT_ID_Marketing_Airline|Flight_Number_Marketing_Airline|DOT_ID_Operating_Airline|Flight_Number_Operating_Airline|OriginAirportID|OriginAirportSeqID|OriginCityMarketID|OriginStateFips|OriginWac|DestAirp

In [33]:
def generate_dummy_columns(dataframe, columns):
    ''' Use OneHotEncoder to convert enconded variables to vec column on dataframe'''
    for column in columns:
        col_vector = column + "_vec"
        enconder = OneHotEncoder(inputCol=column, outputCol=col_vector).fit(dataframe)
        dataframe = enconder.transform(dataframe)
    return dataframe
df = generate_dummy_columns(df, new_cols)
# Drop encoded columns
df = df.drop(*new_cols)
df.show(5)

+---------+--------+----------+-------+---------------+--------+-------+---------------+-------+--------------+-----------------+--------+----+-------+-----+----------+---------+------------------------+-------------------------------+------------------------+-------------------------------+---------------+------------------+------------------+---------------+---------+-------------+----------------+----------------+-------------+-------+--------+--------------------+-------+---------+--------+------+----------+--------+--------+------------------+-------------+------------------+---------------+-----------------+---------------+
|Cancelled|Diverted|CRSDepTime|DepTime|DepDelayMinutes|DepDelay|ArrTime|ArrDelayMinutes|AirTime|CRSElapsedTime|ActualElapsedTime|Distance|Year|Quarter|Month|DayofMonth|DayOfWeek|DOT_ID_Marketing_Airline|Flight_Number_Marketing_Airline|DOT_ID_Operating_Airline|Flight_Number_Operating_Airline|OriginAirportID|OriginAirportSeqID|OriginCityMarketID|OriginStateFips|

## Data preparation

In [36]:
df.columns

['Cancelled',
 'Diverted',
 'CRSDepTime',
 'DepTime',
 'DepDelayMinutes',
 'DepDelay',
 'ArrTime',
 'ArrDelayMinutes',
 'AirTime',
 'CRSElapsedTime',
 'ActualElapsedTime',
 'Distance',
 'Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'DOT_ID_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'DOT_ID_Operating_Airline',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'OriginStateFips',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'DestStateFips',
 'DestWac',
 'DepDel15',
 'DepartureDelayGroups',
 'TaxiOut',
 'WheelsOff',
 'WheelsOn',
 'TaxiIn',
 'CRSArrTime',
 'ArrDelay',
 'ArrDel15',
 'ArrivalDelayGroups',
 'DistanceGroup',
 'DivAirportLandings',
 'Airline_num_vec',
 'Origin_num_vec',
 'Dest_num_vec']

In [35]:
y = df.select('Cancelled').collect()

                                                                                

Py4JJavaError: An error occurred while calling o699.collectToPython.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.execution.SparkPlan$$anon$1._next(SparkPlan.scala:391)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.getNext(SparkPlan.scala:402)
	at org.apache.spark.sql.execution.SparkPlan$$anon$1.getNext(SparkPlan.scala:388)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.util.NextIterator.foreach(NextIterator.scala:21)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeCollect$1$adapted(SparkPlan.scala:424)
	at org.apache.spark.sql.execution.SparkPlan$$Lambda$3141/0x0000000801ea6000.apply(Unknown Source)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:424)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3688)
	at org.apache.spark.sql.Dataset$$Lambda$3345/0x0000000801f3d198.apply(Unknown Source)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.Dataset$$Lambda$2668/0x0000000801d5ce20.apply(Unknown Source)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset$$Lambda$2260/0x0000000801c55108.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$2268/0x0000000801c57c30.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$2261/0x0000000801c553c0.apply(Unknown Source)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3685)
	at java.base/java.lang.invoke.DirectMethodHandle$Holder.invokeVirtual(DirectMethodHandle$Holder)
	at java.base/java.lang.invoke.LambdaForm$MH/0x0000000801160400.invoke(LambdaForm$MH)


# Train XGBoost model

In [None]:
from sklearn.model_selection import train_test_split(X)

In [39]:
from xgboost import XGBClassifier

'1.5.0'