# Flight data analysis

In [1]:
%%html
<style>
table {float:left}
</style>

In [2]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
</style>

In [3]:
import os
import sys
import gc
from datetime import (
    datetime,
    date
)
import numpy as np
import pandas as pd

#  Environemnt Variables

## Hadoop

In [4]:
os.environ['HADOOP_CONF_DIR'] = "/opt/hadoop/hadoop-3.2.2/etc/hadoop"

In [5]:
%%bash
export HADOOP_CONF_DIR="/opt/hadoop/hadoop-3.2.2/etc/hadoop"
ls $HADOOP_CONF_DIR | head -n 5

capacity-scheduler.xml
configuration.xsl
container-executor.cfg
core-site.xml
core-site.xml.48132.2022-02-15@12:29:41~


## PYTHONPATH

Refer to the **pyspark** modules to load from the ```$SPARK_HOME/python/lib``` in the Spark installation.

* [PySpark Getting Started](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)

> Ensure the SPARK_HOME environment variable points to the directory where the tar file has been extracted. Update PYTHONPATH environment variable such that it can find the PySpark and Py4J under SPARK_HOME/python/lib. One example of doing this is shown below:

```
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
```

Alternatively install **pyspark** with pip or conda locally which installs the Spark runtime libararies (for standalone).

* [Can PySpark work without Spark?](https://stackoverflow.com/questions/51728177/can-pyspark-work-without-spark)

> As of v2.2, executing pip install pyspark will install Spark. If you're going to use Pyspark it's clearly the simplest way to get started. On my system Spark is installed inside my virtual environment (miniconda) at lib/python3.6/site-packages/pyspark/jars  
> PySpark has a Spark installation installed. If installed through pip3, you can find it with pip3 show pyspark. Ex. for me it is at ~/.local/lib/python3.8/site-packages/pyspark. This is a standalone configuration so it can't be used for managing clusters like a full Spark installation.

In [6]:
# os.environ['PYTHONPATH'] = "/opt/spark/spark-3.1.2/python/lib/py4j-0.10.9-src.zip:/opt/spark/spark-3.1.2/python/lib/pyspark.zip"
sys.path.extend([
    "/opt/spark/spark-3.1.2/python/lib/py4j-0.10.9-src.zip",
    "/opt/spark/spark-3.1.2/python/lib/pyspark.zip"
])

## PySpark package imports

Execute after the PYTHONPATH setup.

In [7]:
import pyspark.sql 
from pyspark.sql.types import *
from pyspark.sql.functions import (
    col,
    when,
    lit,
    avg,
    stddev,
    isnan,
    date_format,
    to_date,
    months_between,
    add_months,
    lower,
    upper,
)

---
# Spark Session


In [8]:
from pyspark.sql import SparkSession

In [9]:
spark = SparkSession.builder\
    .master('yarn') \
    .config('spark.submit.deployMode', 'client') \
    .config('spark.debug.maxToStringFields', 100) \
    .config('spark.executor.memory', '2g') \
    .getOrCreate()

2022-02-24 07:49:52,678 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-02-24 07:49:57,214 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [10]:
NUM_CORES = 4
NUM_PARTITIONS = 3

spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')

# DataFrame from Python data

* [SparkSession.createDataFrame(data, schema=None, samplingRatio=None, verifySchema=True)](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.SparkSession.createDataFrame.html)

# Schema Definition

* [Data Types](https://spark.apache.org/docs/latest/sql-ref-datatypes.html#data-types)

```from pyspark.sql.types import *```

| Data type | Value type in Python | API to access or create a data type |  |
|:---|:---|:---|:--|
|ByteType | int or long Note: Numbers will be converted to 1-byte signed integer numbers at runtime. Please make sure that numbers are within the range of -128 to 127. | ByteType() |  |
| ShortType | int or long Note: Numbers will be converted to 2-byte signed integer numbers at runtime. Please make sure that numbers are within the range of -32768 to 32767. | ShortType() |  |
| IntegerType | int or long | IntegerType() |  |
| LongType | long Note: Numbers will be converted to 8-byte signed integer numbers at runtime. Please make sure that numbers are within the range of -9223372036854775808 to 9223372036854775807.Otherwise, please convert data to decimal.Decimal and use DecimalType. | LongType() |  |
| FloatType | float Note: Numbers will be converted to 4-byte single-precision floating point numbers at runtime. | FloatType() |  |
| DoubleType | float | DoubleType() |  |
| DecimalType | decimal.Decimal | DecimalType() |  |
| StringType | string | StringType() |  |
| BinaryType | bytearray | BinaryType() |  |
| BooleanType | bool | BooleanType() |  |
| TimestampType | datetime.datetime | TimestampType() |  |
| DateType | datetime.date | DateType() |  |
| ArrayType | list, tuple, or array | ArrayType(elementType, [containsNull]) Note:The default value of containsNull is True. |  |
| MapType | dict | MapType(keyType, valueType, [valueContainsNull]) Note:The default value of valueContainsNull is True. |  |
| StructType | list or tuple | StructType(fields) Note: fields is a Seq of StructFields. Also, two fields with the same name are not allowed. |  |
| StructField | The value type in Python of the data type of this field (For example, Int for a StructField with the data type IntegerType) | StructField(name, dataType, [nullable]) Note: The default value of nullable is True. |  |


# Data


In [11]:
%%bash
hdfs dfs -mkdir -p flight
hdfs dfs -put -f ./data/flight/*.csv flight/

In [25]:
flight = spark.read\
    .option("compression", "none")\
    .option("header", True)\
    .option("sep", ',')\
    .option("nullValue", np.nan)\
    .option("inferSchema", True)\
    .option("dateFormat", "yyyy-MM-dd")\
    .csv("flight/flightData.csv")\
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))\
    .withColumn(
        "direction", 
        when(lower(col("from")) == "uk", 1)
        .when(lower(col("to"))   == "uk", -1)
        .otherwise(0)
    )

flight.printSchema()
flight.createOrReplaceTempView("flight")
flight.show(5, truncate=False)

root
 |-- passengerId: integer (nullable = true)
 |-- flightId: integer (nullable = true)
 |-- from: string (nullable = true)
 |-- to: string (nullable = true)
 |-- date: date (nullable = true)
 |-- direction: integer (nullable = false)

+-----------+--------+----+---+----------+---------+
|passengerId|flightId|from|to |date      |direction|
+-----------+--------+----+---+----------+---------+
|48         |0       |cg  |ir |2017-01-01|0        |
|94         |0       |cg  |ir |2017-01-01|0        |
|82         |0       |cg  |ir |2017-01-01|0        |
|21         |0       |cg  |ir |2017-01-01|0        |
|51         |0       |cg  |ir |2017-01-01|0        |
+-----------+--------+----+---+----------+---------+
only showing top 5 rows



In [13]:
flight.where(col("direction") == -1).limit(5).show()



+-----------+--------+----+---+----------+---------+
|passengerId|flightId|from| to|      date|direction|
+-----------+--------+----+---+----------+---------+
|        382|      18|  jo| uk|2017-01-04|       -1|
|        385|      18|  jo| uk|2017-01-04|       -1|
|       1001|      18|  jo| uk|2017-01-04|       -1|
|       1025|      18|  jo| uk|2017-01-04|       -1|
|        378|      18|  jo| uk|2017-01-04|       -1|
+-----------+--------+----+---+----------+---------+



                                                                                

In [14]:
passenger = spark.read\
    .option("compression", "none")\
    .option("header", True)\
    .option("sep", ',')\
    .option("nullValue", np.nan)\
    .option("inferSchema", True)\
    .csv("flight/passengers.csv")

passenger.printSchema()
passenger.createOrReplaceTempView("passenger")
passenger.show()

                                                                                

root
 |-- passengerId: integer (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)

+-----------+---------+--------+
|passengerId|firstName|lastName|
+-----------+---------+--------+
|      14751| Napoleon| Gaylene|
|       2359| Katherin| Shanell|
|       5872|   Stevie|  Steven|
|       3346|Margarita|   Gerri|
|       3704|    Earle|  Candis|
|       1226|    Trent|    Omer|
|       2677|    Janee|  Lillia|
|        179|     Gita|Chastity|
|       9763|   Hilton|Jaquelyn|
|      11414|      Leo|Margaret|
|       6870|     Tama|     Bok|
|       3290|    Logan|    Anya|
|      13264|   Lowell|Kathryne|
|        455|  Maritza|  Maxima|
|      13006|     Yuri|   Joyce|
|      10323|  Latasha|  Estell|
|       7376|   Kaycee|Kiersten|
|      15015|   Curtis| Abraham|
|       9217|   Verena|Josefine|
|       5183|     Loan| Latonya|
+-----------+---------+--------+
only showing top 20 rows



# Number of flights per month

## Number of months betwen dates

* [pyspark.sql.functions.months_between(date1, date2, roundOff=True)](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.months_between.html)

> Returns number of months between dates date1 and date2. 

In [15]:
query = """
SELECT
    date,
    MIN(date) OVER (ORDER BY date) AS base_date,
    FLOOR(months_between(date, (SELECT MIN(date) FROM flight))) AS month_index
FROM
    flight
ORDER BY 
    passengerID, 
    month_index
"""
spark.sql(query).show(10)

2022-02-24 07:51:13,855 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

+----------+----------+-----------+
|      date| base_date|month_index|
+----------+----------+-----------+
|2017-01-01|2017-01-01|          0|
|2017-11-29|2017-01-01|         10|
|2017-12-12|2017-01-01|         11|
|2017-12-22|2017-01-01|         11|
|2017-12-29|2017-01-01|         11|
|2017-01-01|2017-01-01|          0|
|2017-01-01|2017-01-01|          0|
|2017-01-10|2017-01-01|          0|
|2017-02-06|2017-01-01|          1|
|2017-03-05|2017-01-01|          2|
+----------+----------+-----------+
only showing top 10 rows



## Month +N

* [pyspark.sql.functions.add_months(start, months)](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.add_months.html)

> Returns the date that is months months after start

In [16]:
N = 3
query = """
SELECT
    date_format(
        add_months(to_date("2017-01-01", "yyyy-MM-dd"), {}),
        "yyyy-MM"
    ) as plus_n_month
FROM
    flight
""".format(N)
spark.sql(query).show(10)

+------------+
|plus_n_month|
+------------+
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
|     2017-04|
+------------+
only showing top 10 rows



## (flight, month) pairs to group by

In [17]:
query = """
WITH month_indexed AS (
    SELECT DISTINCT
        flightId,
        date,
        FLOOR(months_between(date, (SELECT MIN(date) FROM flight))) AS month_index
    FROM
        flight
)
SELECT 
    *
FROM
    month_indexed
ORDER BY
    date,
    flightId
"""
spark.sql(query).show()



+--------+----------+-----------+
|flightId|      date|month_index|
+--------+----------+-----------+
|       0|2017-01-01|          0|
|       1|2017-01-01|          0|
|       2|2017-01-01|          0|
|       3|2017-01-01|          0|
|       4|2017-01-01|          0|
|       5|2017-01-02|          0|
|       6|2017-01-02|          0|
|       7|2017-01-02|          0|
|       8|2017-01-02|          0|
|       9|2017-01-02|          0|
|      10|2017-01-02|          0|
|      11|2017-01-03|          0|
|      12|2017-01-03|          0|
|      13|2017-01-03|          0|
|      14|2017-01-03|          0|
|      15|2017-01-03|          0|
|      16|2017-01-04|          0|
|      17|2017-01-04|          0|
|      18|2017-01-04|          0|
|      19|2017-01-05|          0|
+--------+----------+-----------+
only showing top 20 rows





## Flights per month as GROUP BY month_index

In [18]:
query = """
WITH month_indexed AS (
    SELECT DISTINCT
        flightId,
        FLOOR(months_between(date, (SELECT MIN(date) FROM flight))) AS month_index
    FROM
        flight
)
SELECT 
    month_index,
    date_format(
        add_months(to_date("2017-01-01", "yyyy-MM-dd"), month_index),
        "yyyy-MM"
    ) as month,
    count(flightId) as number_of_flights
FROM
    month_indexed
GROUP BY 
    month_index
ORDER BY
    month_index
"""
spark.sql(query).show()
spark.sql(query).explain(mode="formatted")

                                                                                

+-----------+-------+-----------------+
|month_index|  month|number_of_flights|
+-----------+-------+-----------------+
|          0|2017-01|               97|
|          1|2017-02|               73|
|          2|2017-03|               82|
|          3|2017-04|               92|
|          4|2017-05|               92|
|          5|2017-06|               71|
|          6|2017-07|               87|
|          7|2017-08|               76|
|          8|2017-09|               85|
|          9|2017-10|               76|
|         10|2017-11|               75|
|         11|2017-12|               94|
+-----------+-------+-----------------+

== Physical Plan ==
* Sort (10)
+- Exchange (9)
   +- * HashAggregate (8)
      +- Exchange (7)
         +- * HashAggregate (6)
            +- * HashAggregate (5)
               +- Exchange (4)
                  +- * HashAggregate (3)
                     +- * Project (2)
                        +- Scan csv  (1)


(1) Scan csv 
Output [2]: [flightId#17, dat

---
# Frequent Flyers

Find the names of the 10 most frequent flyers.

In [19]:
query = """
SELECT
    f.passengerId,
    COUNT(*) number_of_flights
FROM
    flight f
GROUP BY 
    f.passengerId
ORDER BY
    number_of_flights DESC
LIMIT 10
"""
spark.sql(query).show()
spark.sql(query).explain(True)

                                                                                

+-----------+-----------------+
|passengerId|number_of_flights|
+-----------+-----------------+
|       2068|               32|
|       1677|               27|
|       4827|               27|
|       8961|               26|
|       3173|               26|
|        917|               25|
|       5096|               25|
|       8363|               25|
|       6084|               25|
|       2857|               25|
+-----------+-----------------+

== Parsed Logical Plan ==
'GlobalLimit 10
+- 'LocalLimit 10
   +- 'Sort ['number_of_flights DESC NULLS LAST], true
      +- 'Aggregate ['f.passengerId], ['f.passengerId, 'COUNT(1) AS number_of_flights#257]
         +- 'SubqueryAlias f
            +- 'UnresolvedRelation [flight], [], false

== Analyzed Logical Plan ==
passengerId: int, number_of_flights: bigint
GlobalLimit 10
+- LocalLimit 10
   +- Sort [number_of_flights#257L DESC NULLS LAST], true
      +- Aggregate [passengerId#16], [passengerId#16, count(1) AS number_of_flights#257L]
        

---
# Longest run

In [33]:
query = """
WITH sequencedRun AS (
    SELECT
        passengerId,
        date,
        from, 
        to,
        direction,
        ROW_NUMBER() OVER (PARTITION BY passengerId ORDER BY date) AS seq
    FROM
        flight f
    ORDER BY 
        passengerId,
        seq
),
closedRun AS (
    SELECT 
        passengerId, 
        from, to, 
        direction, 
        seq,
        -------------------------------------------------------------------------------- 
        -- For a departure flight, take the seq of the return flight, if there is.
        -------------------------------------------------------------------------------- 
        CASE 
            WHEN direction == 1
            THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq)
        END AS return,
        -------------------------------------------------------------------------------- 
        -- For a departure flight, count the visiting countries, if returned.
        -------------------------------------------------------------------------------- 
        CASE 
            WHEN direction == 1
            THEN lead(seq) OVER (PARTITION BY passengerId ORDER BY seq) - seq
        END AS countries
    FROM sequencedRun s
    WHERE 
        -------------------------------------------------------------------------------- 
        -- Remove those without UK
        -------------------------------------------------------------------------------- 
        direction != 0
        -------------------------------------------------------------------------------- 
        -- Select passengers having both depart (+1) and return (-1), which is 
        -- distinct direction count is 2.
        -------------------------------------------------------------------------------- 
        AND EXISTS (  
            SELECT passengerId
            FROM
                sequencedRun
            WHERE 
                direction != 0 AND
                passengerId == s.passengerId
            GROUP BY
                passengerId
            Having count(DISTINCT direction) == 2
        )
    ORDER BY 
        passengerId, seq
)
SELECT * FROM closedRun 
ORDER BY 
    passengerId, seq
"""

closed = spark.sql(query)
closed.show()
closed.explain(True)
closed.createOrReplaceTempView("closedRun")

+-----------+----+---+---------+---+------+---------+
|passengerId|from| to|direction|seq|return|countries|
+-----------+----+---+---------+---+------+---------+
|         16|  cg| uk|       -1|  5|  null|     null|
|         16|  uk| cg|        1|  6|  null|     null|
|         22|  iq| uk|       -1| 10|  null|     null|
|         22|  uk| nl|        1| 11|    15|        4|
|         22|  at| uk|       -1| 15|  null|     null|
|         22|  uk| bm|        1| 16|  null|     null|
|         52|  se| uk|       -1|  6|  null|     null|
|         52|  uk| cn|        1|  7|  null|     null|
|         53|  ch| uk|       -1|  6|  null|     null|
|         53|  uk| se|        1|  7|     8|        1|
|         53|  se| uk|       -1|  8|  null|     null|
|         53|  uk| tj|        1|  9|    13|        4|
|         53|  th| uk|       -1| 13|  null|     null|
|         72|  tj| uk|       -1|  9|  null|     null|
|         72|  uk| iq|        1| 10|  null|     null|
|         82|  iq| uk|      

In [34]:
query = """
SELECT 
    passengerId as `Passenger ID`,
    max(countries) as `Longest Run`
FROM closedRun
WHERE 
    countries IS NOT NULL
GROUP BY 
    passengerId
ORDER BY 
    max(countries) DESC
"""
spark.sql(query).show()

+------------+-----------+
|Passenger ID|Longest Run|
+------------+-----------+
|        2975|         16|
|        2939|         15|
|        3573|         15|
|         760|         15|
|        8562|         15|
|        2982|         14|
|        8590|         14|
|        2926|         14|
|        2935|         14|
|        3600|         14|
|         755|         14|
|         917|         13|
|        8363|         13|
|        7643|         13|
|        3565|         12|
|        1982|         12|
|        8961|         12|
|        1053|         12|
|        3466|         11|
|        2967|         11|
+------------+-----------+
only showing top 20 rows



---
# Stop Spark Session

In [35]:
spark.stop()



# Cleanup

In [36]:
del spark
gc.collect()

1202