In [1]:
# Installing required packages
!pip install pyspark
!pip install findspark



In [2]:
import findspark
findspark.init()

In [3]:
# PySpark is the Spark API for Python. In this lab, we use PySpark to initialize the spark context. 
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, isnan, length, col
from pyspark.sql.types import StructType, IntegerType, StringType, FloatType


In [4]:
REGEX_EMPTY_STR= r'[\t ]+$'

In [5]:
def check_empty_column(coluna):
    return (col(coluna).isNull() | (col(coluna) == '') | col(coluna).rlike(REGEX_EMPTY_STR))

In [6]:
# Creating a spark context class
sc = SparkContext()

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark DataFrames basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

Definindo Schema do dataframe

In [7]:

schema = StructType() \
.add("faa", StringType(), True) \
.add("name", StringType(), True) \
.add("lat", FloatType(), True) \
.add("lon", FloatType(), True) \
.add("alt", IntegerType(), True) \
.add("tz", FloatType(), True) \
.add("dst", StringType(), True)

In [8]:
# Ler o dataset
df = spark.read.options(header=True, delimiter=",", inferSChema=False).schema(schema).csv("data/airports.csv")
df_sql = df
rdd = df.rdd

df.show()
df.printSchema()

df_sql.createOrReplaceTempView("airports")


+---+--------------------+---------+-----------+----+----+---+
|faa|                name|      lat|        lon| alt|  tz|dst|
+---+--------------------+---------+-----------+----+----+---+
|04G|   Lansdowne Airport|41.130474|  -80.61958|1044|-5.0|  A|
|06A|Moton Field Munic...| 32.46057|  -85.68003| 264|-5.0|  A|
|06C| Schaumburg Regional| 41.98934|  -88.10124| 801|-6.0|  A|
|06N|     Randall Airport| 41.43191|  -74.39156| 523|-5.0|  A|
|09J|Jekyll Island Air...|31.074472|  -81.42778|  11|-4.0|  A|
|0A9|Elizabethton Muni...|36.371223| -82.173416|1593|-4.0|  A|
|0G6|Williams County A...|41.467304| -84.506775| 730|-5.0|  A|
|0G7|Finger Lakes Regi...|42.883564| -76.781235| 492|-5.0|  A|
|0P2|Shoestring Aviati...|39.794823| -76.647194|1000|-5.0|  U|
|0S9|Jefferson County ...| 48.05381|-122.810646| 108|-8.0|  A|
|0W3|Harford County Ai...|39.566837|   -76.2024| 409|-5.0|  A|
|10C|  Galt Field Airport| 42.40289| -88.375114| 875|-6.0|  U|
|17G|Port Bucyrus-Craw...|40.781555|  -82.97481|1003|-5

## Airport - Perguntas


#### Pergunta 1

In [9]:
df = df.withColumn("qa_faa", when(check_empty_column('faa'), 'M'))
df.filter(df.qa_faa == 'M').show()

df.groupBy('faa').count().show(100)

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+

+---+-----+
|faa|count|
+---+-----+
|BGM|    1|
|FMY|    1|
|HYL|    1|
|LEB|    1|
|OXC|    1|
|RKP|    1|
|4A7|    1|
|AVX|    1|
|DLG|    1|
|ELI|    1|
|INL|    1|
|OLF|    1|
|TYE|    1|
|FRN|    1|
|MSY|    1|
|RDG|    1|
|1CS|    1|
|CDW|    1|
|DRT|    1|
|DWS|    1|
|FOK|    1|
|GEG|    1|
|HVR|    1|
|KNW|    1|
|4A9|    1|
|BUR|    1|
|CGX|    1|
|DKK|    1|
|F57|    1|
|GVT|    1|
|NMM|    1|
|NQI|    1|
|PTK|    1|
|SNA|    1|
|BYW|    1|
|FOD|    1|
|GRB|    1|
|GTF|    1|
|JGC|    1|
|JRB|    1|
|MRN|    1|
|OPF|    1|
|AIK|    1|
|ASH|    1|
|BXS|    1|
|CAR|    1|
|IFP|    1|
|MXY|    1|
|AUW|    1|
|HOM|    1|
|IDA|    1|
|K03|    1|
|STE|    1|
|SVA|    1|
|WDR|    1|
|GRR|    1|
|HHR|    1|
|LWB|    1|
|NXX|    1|
|3D2|    1|
|JLN|    1|
|MPB|    1|
|NHK|    1|
|NPZ|    1|
|PAO|    1|
|PVU|    1|
|3G3|    1|
|ARA|  

In [10]:
df = df.withColumn("qa_faa", 
                   when((length(df.faa) < 3) | (length(df.faa) > 5), 'F')
                   .otherwise(df.qa_faa))
df.filter(df.qa_faa == 'F').show()


df1 = spark.sql("""
    SELECT *,
        CASE
            WHEN
                faa IS NULL OR faa like '' 
                THEN 'M'
            when 
                length(faa) < 3 or length(faa) > 5 
                then 'F' 
        END AS qa_faa
    FROM airports
""")

df1.filter(df1.qa_faa == 'M').show()
df1.filter(df1.qa_faa == 'F').show()

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+



#### Pergunta 2

In [11]:
df = df.withColumn("qa_name", when(check_empty_column('name'), 'M'))
df.filter(df.qa_name == 'M').show()

df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                name IS NULL OR name like ''
                                THEN 'M'
                        END AS qa_name
                    FROM airports
""")

df.filter(df.qa_name == 'M').show()

+---+--------------------+---------+----------+---+----+---+------+-------+
|faa|                name|      lat|       lon|alt|  tz|dst|qa_faa|qa_name|
+---+--------------------+---------+----------+---+----+---+------+-------+
|IMM|          Immokalee | 26.43389| -81.40139| 37|-5.0|  A|  null|      M|
|MGM|Montgomery Region...| 32.30064|-86.393974|221|-6.0|  A|  null|      M|
|RFD|Chicago Rockford ...|42.195362| -89.09722|742|-6.0|  A|  null|      M|
+---+--------------------+---------+----------+---+----+---+------+-------+

+---+--------------------+---------+----------+---+----+---+------+-------+
|faa|                name|      lat|       lon|alt|  tz|dst|qa_faa|qa_name|
+---+--------------------+---------+----------+---+----+---+------+-------+
|IMM|          Immokalee | 26.43389| -81.40139| 37|-5.0|  A|  null|      M|
|MGM|Montgomery Region...| 32.30064|-86.393974|221|-6.0|  A|  null|      M|
|RFD|Chicago Rockford ...|42.195362| -89.09722|742|-6.0|  A|  null|      M|
+---+------

#### Pergunta 3

In [12]:
df = df.withColumn("qa_lat", when( check_empty_column('lat'), 'M'))
df.filter(df.qa_lat == 'M').show()
df.filter(df.lat.rlike('^[^0-9]*$')).groupBy('lat').count().show()

+---+----+---+---+---+---+---+------+-------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|
+---+----+---+---+---+---+---+------+-------+------+
+---+----+---+---+---+---+---+------+-------+------+

+---+-----+
|lat|count|
+---+-----+
+---+-----+



In [13]:
df = df.withColumn("qa_lat",
                   when( (df.lat < -180) | (df.lat > 180), 'I')
                   .otherwise(df.qa_lat))
df.filter(df.qa_lat == 'I').show()

+---+----+---+---+---+---+---+------+-------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|
+---+----+---+---+---+---+---+------+-------+------+
+---+----+---+---+---+---+---+------+-------+------+



In [14]:
df = df.withColumn("qa_lat", 
                   when(df.lat.cast('int').isNull(), 'A')
                   .otherwise(df.qa_lat))
df.filter(df.qa_lat == 'A').show()

df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                lat IS NULL OR lat == ''
                                THEN 'M'
                            WHEN
                                lat < -180 OR lat > 180
                                THEN 'I'
                            WHEN
                                lat rlike '^([^0-9]*)$'
                                THEN 'A'
                        END AS qa_lat
                    FROM airports
""")

df1.filter(df1.qa_lat.isin(['M','I','A'])).show()

+---+----+---+---+---+---+---+------+-------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|
+---+----+---+---+---+---+---+------+-------+------+
+---+----+---+---+---+---+---+------+-------+------+

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_lat|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+



#### Pergunta 4


In [15]:
df = df.withColumn("qa_lon", when(check_empty_column('lon'), 'M'))
df.filter(df.qa_lon == 'M').show()
df.filter(df.lon.rlike('^[^0-9]*$')).groupBy('lon').count().show()

+---+----+---+---+---+---+---+------+-------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+----+---+---+---+---+---+------+-------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+

+---+-----+
|lon|count|
+---+-----+
+---+-----+



In [16]:
df = df.withColumn("qa_lon", 
                   when( (df.lon < -180) | (df.lon > 180), 'I')
                   .otherwise(df.qa_lon))
df.filter(df.qa_lon == 'I').show()

+---+----+---+---+---+---+---+------+-------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+----+---+---+---+---+---+------+-------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+



In [17]:
df = df.withColumn("qa_lon", 
                   when(df.lon.cast('int').isNull(), 'A')
                   .otherwise(df.qa_lon))
df.filter(df.qa_lon == 'A').show()



df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                lon IS NULL OR lon == ''
                                THEN 'M'
                            WHEN
                                lon < -180 OR lon > 180
                                THEN 'I'
                            WHEN
                                lon rlike '^([^0-9]*)$'
                                THEN 'A'
                        END AS qa_lon
                    FROM airports
""")

df1.filter(df1.qa_lon.isin(['M','I','A'])).show()

+---+----+---+---+---+---+---+------+-------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|
+---+----+---+---+---+---+---+------+-------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_lon|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+



#### Pergunta 5

In [18]:
df = df.withColumn("qa_alt", when(check_empty_column('alt'), 'M'))

df.filter(df.alt.rlike('^[^0-9]*$')).show()
df.filter(df.qa_alt == 'M').show()

+---+----+---+---+---+---+---+------+-------+------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|
+---+----+---+---+---+---+---+------+-------+------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+

+---+----+---+---+---+---+---+------+-------+------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|
+---+----+---+---+---+---+---+------+-------+------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+



In [19]:
df = df.withColumn("qa_alt", 
                   when( df.alt < 0, 'I')
                   .otherwise(df.qa_alt))

df.filter(df.qa_alt == 'I').show()

+---+-------------+---------+----------+---+----+---+------+-------+------+------+------+
|faa|         name|      lat|       lon|alt|  tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|
+---+-------------+---------+----------+---+----+---+------+-------+------+------+------+
|IPL|  Imperial Co| 32.83422|-115.57874|-54|-8.0|  A|  null|   null|  null|  null|     I|
|NJK|El Centro Naf|32.829224|-115.67167|-42|-8.0|  A|  null|   null|  null|  null|     I|
+---+-------------+---------+----------+---+----+---+------+-------+------+------+------+



In [20]:
df = df.withColumn("qa_alt", 
                   when( df.alt.cast('int').isNull(), 'A')
                   .otherwise(df.qa_alt))

df.filter(df.qa_alt == 'A').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                alt IS NULL OR alt == ''
                                THEN 'M'
                            WHEN
                                alt < 0
                                THEN 'I'
                            WHEN
                                alt rlike '^([^0-9]*)$'
                                THEN 'A'
                        END AS qa_alt
                    FROM airports
""")

df1.filter(df1.qa_alt.isin(['M','I','A'])).show()

+---+----+---+---+---+---+---+------+-------+------+------+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|
+---+----+---+---+---+---+---+------+-------+------+------+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+

+---+-------------+---------+----------+---+----+---+------+
|faa|         name|      lat|       lon|alt|  tz|dst|qa_alt|
+---+-------------+---------+----------+---+----+---+------+
|IPL|  Imperial Co| 32.83422|-115.57874|-54|-8.0|  A|     I|
|NJK|El Centro Naf|32.829224|-115.67167|-42|-8.0|  A|     I|
+---+-------------+---------+----------+---+----+---+------+



#### Pergunta 6

In [21]:
df = df.withColumn("qa_tz", when( check_empty_column('tz'), 'M'))
df.filter(df.qa_tz =='M').show()

df.filter(df.tz.rlike('^[^0-9]*$')).show()

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+



In [22]:
df = df.withColumn("qa_tz", 
                   when((df.tz < -11) | (df.tz > 14), 'I')
                   .otherwise(df.qa_tz))
df.filter(df.qa_tz == 'I').show()
print(df.filter(col('tz').between(-7,-5) & (col('dst') != 'A')).count())
print(df.filter(col('dst') == 'A').count())


+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+

51
1329


In [23]:
df = df.withColumn("qa_tz", 
                   when(df.tz.cast('int').isNull(), 'A')
                   .otherwise(df.qa_tz))

df.filter(df.qa_tz =='A').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                tz IS NULL OR tz == ''
                                THEN 'M'
                            WHEN
                                tz < -11 OR tz > 14
                                THEN 'I'
                            WHEN
                                tz rlike '^([^0-9]*)$'
                                THEN 'A'
                        END AS qa_tz
                    FROM airports
""")

df1.filter(df1.qa_tz.isin(['M','I','A'])).show()

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+

+---+----+---+---+---+---+---+-----+
|faa|name|lat|lon|alt| tz|dst|qa_tz|
+---+----+---+---+---+---+---+-----+
+---+----+---+---+---+---+---+-----+



#### Pergunta 7

In [24]:
df = df.withColumn("qa_dst", 
                   when(check_empty_column('dst'), 'M'))

df.filter(df.qa_dst == 'M').show()
df.filter(df.dst.rlike('[0-9]')).show()
print(df.filter((col('dst') == 'U') | (col('tz').between(-7,-5) & (col('dst') != 'A'))).count())
print(df.filter(col('dst') == 'A').count())

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+

59
1329


In [25]:
categories = ['E','A','S','O','Z','N','U']
df = df.withColumn("qa_dst", 
                   when(~df.dst.isin(categories), 'C')
                   .otherwise(df.qa_dst))
df.filter(df.qa_dst =='C').show()

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+



In [26]:
df = df.withColumn("qa_dst", 
                   when(df.dst.cast('int').isNotNull(), 'N')
                   .otherwise(df.qa_dst))

df.filter(df.qa_dst == 'N').show()

df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                dst IS NULL OR dst == ''
                                THEN 'M'
                            WHEN
                                dst NOT IN ('E','A','S','O','Z','N','U')
                                THEN 'C'
                            WHEN
                                dst rlike '^([0-9]*)$'
                                THEN 'N'
                        END AS qa_dst
                    FROM airports
""")

df1.filter(df1.qa_dst.isin(['M','C','N'])).show()

+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
|faa|name|lat|lon|alt| tz|dst|qa_faa|qa_name|qa_lat|qa_lon|qa_alt|qa_tz|qa_dst|
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+
+---+----+---+---+---+---+---+------+-------+------+------+------+-----+------+

+---+----+---+---+---+---+---+------+
|faa|name|lat|lon|alt| tz|dst|qa_dst|
+---+----+---+---+---+---+---+------+
+---+----+---+---+---+---+---+------+



In [27]:
df_qa = df.select(df.faa ,df.qa_faa, df.qa_name, df.qa_lat, df.qa_lon, df.qa_tz, df.qa_dst, df.qa_alt)
df_qa.write.mode('overwrite').parquet("output/airports_qa.parquet")

## Plane - Perguntas

In [28]:
schema_plane = StructType()\
.add("tailnum", StringType(), True)\
.add("year", IntegerType(), True)\
.add("type", StringType(), True) \
.add("manufacturer", StringType(), True)\
.add("model", StringType(), True)\
.add("engines", IntegerType(), True)\
.add("seats", IntegerType(), True)\
.add("speed", IntegerType(), True)\
.add("engine", StringType(), True)

In [29]:
df_planes = spark.read.options(header=True, delimiter=",").schema(schema_plane).csv("data/planes.csv")

df_planes_sql = df_planes
df_planes_sql.createOrReplaceTempView("planes")


df_planes.printSchema()

root
 |-- tailnum: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- engines: integer (nullable = true)
 |-- seats: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- engine: string (nullable = true)



#### Pergunta 1

In [30]:
df_planes = df_planes.withColumn("qa_tailnum", 
                                   when(check_empty_column('tailnum'), 'M')
                                  .when(~length(df_planes.tailnum).between(5,6), 'S')
                                  .when(~df_planes.tailnum.startswith('N'),'FN')
                                  .when(df_planes.tailnum.rlike('I|O|N0'),'FE')
                                  .when(
                                       ~df_planes.tailnum.rlike('^N[1-9][0-9]{2,3}([ABCDEFGHJKLMNPQRSTUVXWYZ]{1,2})')
                                       ,'F'))

df_planes.filter(df_planes.qa_tailnum.isin(['M','S','FN','FE','F'])).groupBy('qa_tailnum').count().show()
df_planes

+----------+-----+
|qa_tailnum|count|
+----------+-----+
|         F|  298|
+----------+-----+



DataFrame[tailnum: string, year: int, type: string, manufacturer: string, model: string, engines: int, seats: int, speed: int, engine: string, qa_tailnum: string]

In [31]:
df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                tailnum IS NULL OR tailnum == ''
                                THEN 'M'
                            WHEN
                                length(tailnum) <> 5
                                THEN 'S'
                            WHEN 
                                tailnum rlike "N[1-9][0-9]*2Z$|N[0-9]*2[A-Z][A-Z]$"
                                THEN 'F'
                            WHEN 
                                tailnum NOT LIKE 'N%'
                                THEN 'FN'
                            WHEN
                                 tailnum LIKE '%I%' OR tailnum LIKE '%O%' OR tailnum LIKE 'N0'
                                 THEN 'FE'
                        END AS qa_tailnum
                    FROM planes

""")

df1.filter(df1.qa_tailnum.isin(['M','F','FN','FE'])).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+



#### Pergunta 2

In [32]:
df_planes = df_planes.withColumn("qa_year", 
                                 when( check_empty_column('year'), 'M'))
df_planes.filter(df_planes.qa_year == 'M').show()
df_planes.groupBy('year').count().show(100)

+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+----------+-------+
|tailnum|year|                type|    manufacturer|        model|engines|seats|speed|       engine|qa_tailnum|qa_year|
+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+----------+-------+
| N174US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      null|      M|
| N177US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      null|      M|
| N181UW|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      null|      M|
| N194UW|null|Fixed wing multi ...|          AIRBUS|     A321-211|      2|  199| null|    Turbo-fan|      null|      M|
| N271LV|null|Fixed wing multi ...|          BOEING|      737-705|      2|  149| null|    Turbo-fan|      null|      M|
| N298WN|null|Fixed wing multi ...|     

In [33]:
df_planes = df_planes.withColumn("qa_year", 
                                 when(df_planes.year < 1950, 'I')
                                 .otherwise(df_planes.qa_year))
df_planes.filter(df_planes.qa_year == 'I').show()


df1 = spark.sql("""
            SELECT *,
                CASE
                    WHEN
                        year IS NULL OR year == ''
                        THEN 'M'
                    WHEN
                        year < 1950
                        THEN 'I'
                    END AS qa_year
                FROM planes
""")

df1.filter(df1.qa_year.isin(['M','I'])).show()

+-------+----+--------------------+------------+---------+-------+-----+-----+----------+----------+-------+
|tailnum|year|                type|manufacturer|    model|engines|seats|speed|    engine|qa_tailnum|qa_year|
+-------+----+--------------------+------------+---------+-------+-----+-----+----------+----------+-------+
| N235SW|   0|Fixed wing multi ...|     EMBRAER|EMB-120ER|      2|   32| null|Turbo-prop|      null|      I|
+-------+----+--------------------+------------+---------+-------+-----+-----+----------+----------+-------+

+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+-------+
|tailnum|year|                type|    manufacturer|        model|engines|seats|speed|       engine|qa_year|
+-------+----+--------------------+----------------+-------------+-------+-----+-----+-------------+-------+
| N174US|null|Fixed wing multi ...|AIRBUS INDUSTRIE|     A321-211|      2|  199| null|    Turbo-jet|      M|
| N177US|null|Fixe

#### Pergunta 3


In [34]:
df_planes = df_planes.withColumn("qa_type", 
                                 when(check_empty_column('type'), 'M'))

df_planes.filter(df_planes.qa_type == 'M').show()
df_planes.groupBy('type').count().show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+

+--------------------+-----+
|                type|count|
+--------------------+-----+
|          Rotorcraft|    3|
|Fixed wing multi ...| 2615|
|Fixed wing single...|   10|
+--------------------+-----+



In [35]:
types = ['Fixed wing multi engine','Fixed wing single engine','Rotorcraft']
df_planes = df_planes.withColumn("qa_type", 
                                 when(~df_planes.type.isin(types), 'C')
                                 .otherwise(df_planes.qa_type))

df_planes.filter(df_planes.qa_type == 'C').show()


df1 = spark.sql("""
                    SELECT *,
                CASE
                    WHEN
                        type IS NULL OR type == ''
                        THEN 'M'
                    WHEN
                        type NOT IN ('Fixed wing multi engine','Fixed wing single engine','Rotorcraft')
                        THEN 'C'
                    END AS qa_type
                FROM planes
""")

df1.filter(df1.qa_type.isin(['M','C'])).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+

+-------+----+----+------------+-----+-------+-----+-----+------+-------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_type|
+-------+----+----+------------+-----+-------+-----+-----+------+-------+
+-------+----+----+------------+-----+-------+-----+-----+------+-------+



#### Pergunta 4

In [36]:
df_planes = df_planes.withColumn("qa_manufacturer", 
                                 when(check_empty_column('manufacturer'),'M'))

df_planes.filter(df_planes.qa_manufacturer == 'M').show()
df_planes.groupBy('manufacturer').count().show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+

+--------------------+-----+
|        manufacturer|count|
+--------------------+-----+
|       BARKER JACK L|    1|
|    AIRBUS INDUSTRIE|  401|
|ROBINSON HELICOPT...|    1|
|            SIKORSKY|    1|
|              BOEING| 1460|
|             EMBRAER|   37|
|  CIRRUS DESIGN CORP|    1|
|          MARZ BARRY|    1|
|              CESSNA|    4|
|     LAMBERT RICHARD|    1|
|      BOMBARDIER INC|  214|
|               PIPER|    2|
|                BELL|    1|
|MCDONNELL DOUGLAS...|    2|
|   MCDONNELL DOUGLAS|   94|
|        KILDALL GARY|    1|
|           

In [37]:
manufacturers = ['AIRBUS','BOEING','BOMBARDIER','CESSNA','EMBRAER','SIKORSKY','CANADAIR',
                 'PIPER','MCDONNELL DOUGLAS','CIRRUS','BELL','KILDALL GARY','LAMBERT RICHARD',
                 'BARKER JACK','ROBINSON HELICOPTER','GULFSTREAM','MARZ BARRY']

REGEX_MANUFACTURER_LIST = r'|'.join(map(lambda word: f'.*{word}.*', manufacturers))

df_planes = df_planes.withColumn("qa_manufacturer", 
                                 when(~df_planes.manufacturer.rlike(REGEX_MANUFACTURER_LIST), 'C')
                                 .otherwise(df_planes.qa_manufacturer))
df_planes.filter(df_planes.qa_manufacturer == 'C').show()



df1 = spark.sql("""
                    SELECT *,
                CASE
                    WHEN
                        manufacturer IS NULL OR manufacturer == ''
                        THEN 'M'
                    WHEN
                        manufacturer NOT IN ('AIRBUS','BOEING','BOMBARDIER','CESSNA','EMBRAER',
                                            'SIKORSKY','CANADAIR','PIPER','MCDONNELL DOUGLAS',
                                            'CIRRUS','BELL','KILDALL GARY','LAMBERT RICHARD',
                                            'BARKER JACK','ROBINSON HELICOPTER','GULFSTREAM',
                                            'MARZ BARRY')
                        THEN 'C'
                    END AS qa_manufacturer
                FROM planes
""")

df1.filter(df1.qa_manufacturer.isin(['M','C'])).show()


+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+

+-------+----+--------------------+----------------+-----------+-------+-----+-----+---------+---------------+
|tailnum|year|                type|    manufacturer|      model|engines|seats|speed|   engine|qa_manufacturer|
+-------+----+--------------------+----------------+-----------+-------+-----+-----+---------+---------------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|   A320-214|      2|  182| null|Turbo-fan|              C|
| N104UW

#### Pergunta 5

In [38]:
df_planes = df_planes.withColumn("qa_model", 
                                 when( check_empty_column('model'), 'M')
                                . when(
                                     ((col('manufacturer').rlike(r'.*AIRBUS.*') & ~col('model').startswith('A')) |
                                      (col('manufacturer').rlike(r'.*BOEING.*') & ~col('model').startswith('7')) |
                                      (col('manufacturer').rlike(r'.*(BOMBARDIER|CANADAIR).*') & ~col('model').startswith('CL')) |
                                      (col('manufacturer').rlike(r'.*MCDONNELL DOUGLAS.*') &  ~(col('model').startswith('MD') | col('model').startswith('DC')) )),
                                     'F'))
df_planes.filter(df_planes.qa_model.isin(['M','F'])).count()

15

#### Pergunta 6

In [39]:
df_planes = df_planes.withColumn("qa_engines", 
                                 when(check_empty_column('engines'), 'M'))
df_planes.filter(df_planes.qa_engines == 'M').show()
df_planes.filter(col('engines').rlike('^[^0-9]*$')).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+----

In [40]:
df_planes = df_planes.withColumn("qa_engines", 
                                 when( (df_planes.engines < 1) | (df_planes.engines > 4), 'I')
                                 .otherwise(df_planes.qa_engines))

df_planes.filter(df_planes.qa_engines == 'I').show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+



In [41]:
df_planes = df_planes.withColumn("qa_engines", 
                                 when( df_planes.engines.rlike('\D+'), 'A')
                                 .otherwise(df_planes.qa_engines))

df_planes.filter(df_planes.qa_engines == 'A').show()

df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                engines IS NULL OR engines == ''
                                THEN 'M'
                            WHEN
                                engines < 1 OR engines > 4
                                THEN 'I'
                            WHEN
                                engines rlike '[^0-9]'
                                THEN 'A'
                        END AS qa_engines
                    FROM planes
""")

df1.filter(df1.qa_engines.isin(['M','I','A'])).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+

+-------+----+----+------------+-----+-------+-----+-----+------+----------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_engines|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+



#### Pergunta 7

In [42]:
df_planes = df_planes.withColumn("qa_seats", 
                                 when( check_empty_column('seats'), 'M'))
df_planes.filter(df_planes.qa_seats == 'M').show()

df_planes.filter(col('engines').rlike('^[^0-9]*$')).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
+-------+----+----+------------+-

In [43]:
df_planes = df_planes.withColumn("qa_seats", 
                                 when((df_planes.seats < 2) | (df_planes.seats > 500), 'I')
                                 .otherwise(df_planes.qa_seats))

df_planes.filter(df_planes.qa_seats == 'I').show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+



In [44]:
df_planes = df_planes.withColumn("qa_seats", 
                                 when( df_planes.seats.rlike('\D+'), 'A')
                                 .otherwise(df_planes.qa_seats))

df_planes.filter(df_planes.qa_seats == 'A').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                seats IS NULL OR seats == ''
                                THEN 'M'
                            WHEN
                                seats < 2 OR seats > 500
                                THEN 'I'
                            WHEN
                                seats rlike '[^0-9]'
                                THEN 'A'
                        END AS qa_seats
                    FROM planes
""")

df1.filter(df1.qa_seats.isin(['M','I','A'])).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+

+-------+----+----+------------+-----+-------+-----+-----+------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_seats|
+-------+----+----+------------+-----+-------+-----+-----+------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+--------+



#### Pergunta 8

In [45]:
df_planes = df_planes.withColumn("qa_speed", 
                                 when(check_empty_column('speed'), 'M'))
df_planes.filter(df_planes.qa_speed == 'M').show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+----------+--------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+----------+-------+-------+---------------+--------+----------+--------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null|    null|      null|    null|       M|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null|    null|      null|    null|       M|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|      null|   null|   null|           null| 

In [46]:
df_planes = df_planes.withColumn("qa_speed", 
                                 when((df_planes.speed < 50) | (df_planes.speed > 150), 'I')
                                 .otherwise(df_planes.qa_speed))

df_planes.filter(df_planes.qa_speed == 'I').show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+



In [47]:
df_planes = df_planes.withColumn("qa_speed", 
                                 when( df_planes.speed.rlike('\D+'), 'A')
                                 .otherwise(df_planes.qa_speed))

df_planes.filter(df_planes.qa_speed == 'A').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                speed IS NULL OR speed == ''
                                THEN 'M'
                            WHEN
                                speed < 50 OR speed > 150
                                THEN 'I'
                            WHEN
                                speed rlike '[^0-9]'
                                THEN 'A'
                        END AS qa_speed
                    FROM planes
""")

df1.filter(df1.qa_speed.isin(['M','I','A'])).show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|qa_speed|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+--------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182| null|Turbo-fan|       M|
| N103U

#### Pergunta 9

In [48]:
df_planes = df_planes.withColumn("qa_engine", 
                                 when( check_empty_column('engine'), 'M'))
df_planes.filter(df_planes.qa_engine == 'M').show()

+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+
|tailnum|year|type|manufacturer|model|engines|seats|speed|engine|qa_tailnum|qa_year|qa_type|qa_manufacturer|qa_model|qa_engines|qa_seats|qa_speed|qa_engine|
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+
+-------+----+----+------------+-----+-------+-----+-----+------+----------+-------+-------+---------------+--------+----------+--------+--------+---------+



In [49]:
engine_list = ['Turbo-fan', 'Turbo-jet','Turbo-prop','Turbo-shaft','4 Cycle']

df_planes = df_planes.withColumn("qa_engine", 
                                 when(~df_planes.engine.isin(engine_list), 'C')
                                 .otherwise(df_planes.qa_engine))

df_planes.filter(df_planes.qa_engine == 'C').groupBy('qa_engine').count().show()
df_planes.groupBy('engine').count().show()

df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                engine IS NULL OR engine == ''
                                THEN 'M'
                            WHEN
                                engine NOT IN ('Turbo-fan', 'Turbo-jet','Turbo-prop',
                                                'Turbo-shaft','4 Cycle')
                                THEN 'C'
                        END AS qa_engine
                    FROM planes
""")

df1.filter(df1.qa_engine.isin(['M','C'])).show()

+---------+-----+
|qa_engine|count|
+---------+-----+
|        C|   10|
+---------+-----+

+-------------+-----+
|       engine|count|
+-------------+-----+
|    Turbo-jet|  450|
|      4 Cycle|    1|
|    Turbo-fan| 2127|
|   Turbo-prop|   37|
|Reciprocating|   10|
|  Turbo-shaft|    3|
+-------------+-----+

+-------+----+--------------------+------------------+-------------+-------+-----+-----+-------------+---------+
|tailnum|year|                type|      manufacturer|        model|engines|seats|speed|       engine|qa_engine|
+-------+----+--------------------+------------------+-------------+-------+-----+-----+-------------+---------+
| N201AA|1959|Fixed wing single...|            CESSNA|          150|      1|    2|   90|Reciprocating|        C|
| N202AA|1980|Fixed wing multi ...|            CESSNA|         421C|      2|    8|   90|Reciprocating|        C|
| N425AA|1968|Fixed wing single...|             PIPER|    PA-28-180|      1|    4|  107|Reciprocating|        C|
| N508JB|2

In [50]:
df_planes_qa = df_planes.select(col("tailnum"),
                                col("qa_tailnum"),
                                col("qa_year"), 
                                col("qa_type"), 
                                col("qa_manufacturer"), 
                                col("qa_model"), 
                                col("qa_engines"), 
                                col("qa_seats"), 
                                col("qa_speed"), 
                                col("qa_engine")
                               )
df_planes_qa.write.mode('overwrite').parquet("output/planes_qa.parquet")

## Flights - perguntas

In [51]:
schema_flights = StructType()\
.add("year", IntegerType(), True)\
.add('month', IntegerType(), True)\
.add('day', IntegerType(), True)\
.add('dep_time', StringType(), True)\
.add('dep_delay', IntegerType(), True)\
.add('arr_time', StringType(), True)\
.add('arr_delay', IntegerType(), True)\
.add('carrier', StringType(), True)\
.add('tailnum', StringType(), True)\
.add('flight', StringType(), True)\
.add('origin', StringType(), True)\
.add('dest', StringType(), True)\
.add('air_time', IntegerType(), True)\
.add('distance', IntegerType(), True)\
.add('hour', IntegerType(), True)\
.add('minute', IntegerType(), True)

In [52]:
df_flights = spark.read.options(delimiter=',', header=True).schema(schema_flights).csv("data/flights.csv")

df_flights_sql = df_flights
rdd = df_flights.rdd
rdd1 = rdd.coalesce(3)
colunas = df_flights.columns
df_flights_sql.createOrReplaceTempView("flights")


#### Pergunta 1

In [53]:
df_flights = df_flights.withColumn("qa_year_month_day", 
                                   when(check_empty_column('year'), 'MY'))

df_flights.filter(df_flights.qa_year_month_day == 'MY').show()

df_flights.groupBy('minute').count().orderBy('minute').show(65)
print(df_flights.filter(col('minute').isNull()).count())
print(df_flights.filter(col('hour').isNull()).count())
print(df_flights.filter(col('minute')==0).count())
print(df_flights.filter(col('hour')==0).count())

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+

+------+-----+
|minute|count|
+------+-----+
|  null|   48|
|     0|  178|
|     1|  184|
|     2|  173|
|     3|  161|
|     4|  157|
|     5|  167|
|     6|  157|
|     7|  153|
|     8|  153|
|     9|  183|
|    10|  160|
|    11|  161|
|    12|  161|
|    13|  150|
|    14|  136|
|    15|  162|
|    16|  152|
|    17|  155|
|    18|  130|
|    19|  155|
|    20|  163|
|    21|  139|
|    22|  156|
|    23|  142|
|    24|  177|
|    25|  18

In [54]:
df_flights = df_flights.withColumn("qa_year_month_day", 
                                   when(check_empty_column('month'), 'MM')
                                   .otherwise(df_flights.qa_year_month_day))

df_flights.filter(df_flights.qa_year_month_day == 'MM').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+



In [55]:
df_flights = df_flights.withColumn("qa_year_month_day", 
                                   when(check_empty_column('day'), 'MD')
                                   .otherwise(df_flights.qa_year_month_day))

df_flights.filter(df_flights.qa_year_month_day == 'MD').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+



In [56]:
df_flights = df_flights.withColumn("qa_year_month_day", 
                                   when(df_flights.year < 1950, 'IY')
                                   .otherwise(df_flights.qa_year_month_day))

df_flights.filter(df_flights.qa_year_month_day == 'IY').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+



In [57]:
df_flights = df_flights.withColumn("qa_year_month_day", 
                                   when((df_flights.month < 1) | (df_flights.month > 12) , 'IM')
                                   .otherwise(df_flights.qa_year_month_day))

df_flights.filter(df_flights.qa_year_month_day == 'IM').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+



In [58]:
df_flights = df_flights.withColumn("qa_year_month_day", 
                                   when((df_flights.day < 1) | 
                                        (df_flights.day > 31) |
                                        ((df_flights.month == 2) & ((df_flights.day < 1) | (df_flights.day > 29))), 'ID')
                                   .otherwise(df_flights.qa_year_month_day))

df_flights.filter(df_flights.qa_year_month_day == 'ID').show()




df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                year IS NULL OR year == ''
                                THEN 'MY'
                            WHEN
                                month IS NULL OR month == ''
                                THEN 'MM'
                            WHEN
                                day IS NULL OR day == ''
                                THEN 'MD'
                            WHEN
                                year < 1950
                                THEN 'IY'
                            WHEN 
                                month < 1 OR month > 12
                                THEN 'IM'
                            WHEN
                                day < 1 OR day > 31 OR
                                (month == 2 AND day > 29)
                            THEN 'ID'
                        END AS qa_year_month_day
                    FROM flights
""")

df1.filter(df1.qa_year_month_day.isin(['MY','MM','MD','IY','IM','ID'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+
+----+-----+---+--------+--------

#### Pergunta 2


In [59]:
df_flights = df_flights.withColumn("qa_hour_minute", 
                                   when(check_empty_column('hour') | col('hour').startswith('N'), 'MH')
                                  .when(check_empty_column('minute') | col('hour').startswith('N'), 'MM'))

df_flights.filter(df_flights.qa_hour_minute == 'MH').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+
|2014|    3|  4|      NA|     null|      NA|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|null|  null|             null|            MH|
|2014|    2| 12|      NA|     null|      NA|     null|     AS| N527AS|     2|   SEA| DCA|    null|    2329|null|  null|             null|            MH|
|2014|    7|  1|      NA|     null|      NA|     null|     WN| N8323C|  2485|   SEA| MDW|    null|    1733|null|  null|             null|            MH|
|2014|    4| 30|      NA|     null|      NA|     null|     AS| N526AS|   566|   PD

In [60]:
df_flights = df_flights.withColumn("qa_hour_minute", 
                                   when((df_flights.hour < 0) |
                                        (df_flights.hour > 23), 'IH')
                                   .otherwise(df_flights.qa_hour_minute))

df_flights.filter(df_flights.qa_hour_minute == 'IH').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+
|2014|    6| 20|    2400|       10|     535|        0|     AA| N3GHAA|  2486|   SEA| ORD|     197|    1721|  24|     0|             null|            IH|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+



In [61]:
df_flights = df_flights.withColumn("qa_hour_minute", 
                                   when((df_flights.minute < 0) |
                                        (df_flights.minute > 59), 'IM')
                                   .otherwise(df_flights.qa_hour_minute))

df_flights.filter(df_flights.qa_hour_minute == 'IM').show()

df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                hour IS NULL OR hour == ''
                                THEN 'MH'
                            WHEN
                                minute is NULL OR minute == ''
                                THEN 'MM'
                            WHEN
                                hour < 0 OR hour > 23
                                THEN 'IH'
                            WHEN
                                minute < 0 OR minute > 59
                                THEN 'IM'
                            END AS qa_hour_minute
                        FROM flights
""")

df1.filter(df1.qa_hour_minute.isin(['MH','MM','IH','IM'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_hour_minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+-----

#### Pergunta 3



In [62]:
df_flights = df_flights.withColumn("qa_dep_arr", 
                                   when(check_empty_column('dep_time') |
                                        col('dep_time').startswith('N'), 'MD')
                                   .when(check_empty_column('arr_time') |
                                        col('arr_time').startswith('N'), 'MA')
                                   .when(
                                       ~df_flights.dep_time.rlike('^([0-1]?[0-9]|2[0-3])[0-5][0-9]$'),'FD')
                                   .when(
                                       ~df_flights.arr_time.rlike('^([0-1]?[0-9]|2[0-3])[0-5][0-9]$'),'FA')
                                  )

df_flights.filter(df_flights.qa_dep_arr == 'MD').show()
df_flights.filter(col('dep_time').rlike('^[^0-9]*$')).groupBy('dep_time').count().show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+
|2014|    3|  4|      NA|     null|      NA|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|null|  null|             null|            MH|        MD|
|2014|    2| 12|      NA|     null|      NA|     null|     AS| N527AS|     2|   SEA| DCA|    null|    2329|null|  null|             null|            MH|        MD|
|2014|    7|  1|      NA|     null|      NA|     null|     WN| N8323C|  2485|   SEA| MDW|    null|    1733|null|  null|             null|            MH|        MD|
|2014|    4| 30|

In [63]:
df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                dep_time IS NULL or dep_time == ''
                                THEN 'MD'
                            WHEN
                                arr_time IS NULL OR arr_time == ''
                                THEN 'MA'
                            WHEN
                                dep_time NOT rlike '^([0-1]?[0-9]|2[0-3])[0-5][0-9]$'
                                THEN 'FD'
                            WHEN
                                arr_time NOT rlike '^([0-1]?[0-9]|2[0-3])[0-5][0-9]$'
                                THEN 'FA'
                        END AS qa_dep_arr
                    FROM flights
""")

df1.filter(df1.qa_dep_arr.isin(['MD','MA','FD','FA'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_dep_arr|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+----------+
|2014|    6|  2|    2222|        7|      55|       15|     AS| N402AS|    99|   SEA| ANC|     190|    1448|  22|    22|        FA|
|2014|    7|  5|    2224|       -1|      48|      -20|     AS| N459AS|   143|   PDX| ANC|     185|    1542|  22|    24|        FA|
|2014|   12| 17|    2234|      223|      11|      212|     UA| N39450|  1596|   PDX| SFO|      76|     550|  22|    34|        FA|
|2014|    3| 10|    2222|      -13|      55|      -30|     AS| N431AS|   143|   PDX| ANC|     203|    1542|  22|    22|        FA|
|2014|    7| 29|       1|        2|     600|        6|     UA| N458UA|   280|   PDX

#### Pergunta 4

In [64]:
df_flights = df_flights.withColumn("qa_dep_arr_delay", 
                                   when(check_empty_column('dep_delay') |
                                        col('dep_delay').startswith('N'), 'MD')
                                   .when(check_empty_column('arr_delay') |
                                        col('arr_delay').startswith('N'), 'MA'))

df_flights.filter(df_flights.qa_dep_arr_delay == 'MD').show()
print(df_flights.filter(col('dep_delay').isNull()).count())
print(df_flights.filter(col('arr_delay').isNull()).count())
df_flights.filter(col('dep_delay').rlike('^[^0-9]*$')).groupBy('dep_delay').count().show(100)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+
|2014|    3|  4|      NA|     null|      NA|     null|     UA|     NA|   156|   SEA| DEN|    null|    1024|null|  null|             null|            MH|        MD|              MD|
|2014|    2| 12|      NA|     null|      NA|     null|     AS| N527AS|     2|   SEA| DCA|    null|    2329|null|  null|             null|            MH|        MD|              MD|
|2014|    7|  1|      NA|     null|      NA|     null|     WN| N8323C|  2485|   SEA| MDW|    nu

In [65]:
df_flights.filter(df_flights.qa_dep_arr_delay == 'MA').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                dep_delay IS NULL OR dep_delay == ''
                                THEN 'MD'
                            WHEN
                                arr_delay IS NULL OR arr_delay == ''
                                THEN 'MA'
                        END AS qa_dep_arr_delay
                    FROM flights
""")

df_flights.filter(col('arr_delay').rlike('^[^0-9]*$')).groupBy('arr_delay').count().show(100)
df1.filter(df1.qa_dep_arr_delay.isin(['MD','MA'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+
|2014|    4|  6|    1329|        4|    2159|     null|     DL| N130DL|  1929|   SEA| ATL|    null|    2182|  13|    29|             null|          null|      null|              MA|
|2014|   12| 27|    1420|       40|    2012|     null|     OO| N224AG|  3452|   SEA| HDN|    null|     891|  14|    20|             null|          null|      null|              MA|
|2014|    6| 11|    1054|       -9|    1624|     null|     OO| N926SW|  5576|   PDX| DEN|    nu

#### Pergunta 5

In [66]:
df_flights = df_flights.withColumn("qa_carrier", 
                                   when(check_empty_column('carrier'), 'M'))

df_flights.filter(df_flights.qa_carrier == 'M').show()
df_flights.filter(df_flights.carrier.startswith('NA')).show()
df_flights.groupBy('carrier').count().show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+
|year|month|day|dep_time|dep_delay|arr_

In [67]:
import time

inicio = time.time()
df_flights = df_flights.withColumn("qa_carrier", 
                                   when(~df_flights.carrier.rlike('^([0-9]|[A-Z]|[a-z]){2}$'),'F')
                                   .otherwise(df_flights.qa_carrier))

df_flights.filter(df_flights.qa_carrier == 'F').show()




df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                carrier IS NULL OR carrier == ''
                                THEN 'M'
                            WHEN
                                carrier NOT rlike '^([0-9]|[A-Z]|[a-z]){2}$'
                                THEN 'F'
                        END AS qa_carrier
                    FROM flights
""")

df1.filter(df1.qa_carrier.isin(['M','F'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|di

#### Pergunta 6


In [68]:
df_flights = df_flights.withColumn("qa_tailnum", 
                                   when(check_empty_column('tailnum'), 'M')
                                  .when(~length(df_flights.tailnum).between(5,6), 'S')
                                  .when(~df_flights.tailnum.startswith('N'),'FN')
                                  .when(df_flights.tailnum.rlike('I|O|N0'),'FE')
                                  .when(
                                       ~df_flights.tailnum.rlike('^N[1-9][0-9]{2,3}([ABCDEFGHJKLMNPQRSTUVXWYZ]{1,2})')
                                       ,'F'))

df_flights.filter(df_flights.qa_tailnum.isin(['S','FN','FE','F'])).groupBy('qa_tailnum').count().show()


+----------+-----+
|qa_tailnum|count|
+----------+-----+
|         F|  987|
|         S|   14|
|        FN|    2|
+----------+-----+



In [69]:
df1 = spark.sql("""SELECT *,
                        CASE
                            WHEN
                                tailnum IS NULL OR tailnum == ''
                                THEN 'M'
                            WHEN 
                                tailnum NOT LIKE 'N%'
                                THEN 'FN'
                            WHEN
                                 tailnum LIKE '%I%' OR tailnum LIKE '%O%' OR tailnum LIKE 'N0'
                                 THEN 'FE'
                            WHEN
                                length(tailnum) <> 5
                                THEN 'S'
                            WHEN 
                                tailnum NOT rlike 'N[1-9][0-9]*2Z$|N[0-9]*2[A-Z][A-Z]$'
                                THEN 'F'
                        END AS qa_tailnum
                    FROM flights
""")
df1.filter(df1.qa_tailnum.isin(['M','S','F','FN','FE'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_tailnum|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+----------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|         S|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|         S|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|         S|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|         S|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA

#### Pergunta 7

In [70]:
df_flights = df_flights.withColumn("qa_flight", 
                                   when(check_empty_column('flight'), 'M'))

df_flights.filter(df_flights.qa_flight == 'M').show()
df_flights.filter(col('flight').rlike('^[^0-9]*$')).groupBy('flight').count().show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+

+------+-----+
|flight|count|
+------+-----+
+------+-----+



In [71]:
df_flights = df_flights.withColumn("qa_flight", 
                                   when(~df_flights.flight.rlike('^[0-9]{4}$'), 'F')
                                   .otherwise(df_flights.qa_flight))

df_flights.filter(df_flights.qa_flight == 'F').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                flight IS NULL OR flight == ''
                                THEN 'M'
                            WHEN
                                flight NOT rlike '^[0-9]{4}$'
                                THEN 'F'
                        END AS qa_flight
                    FROM flights
""")

df1.filter(df1.qa_flight.isin(['M','F'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|             null|          null|      null|            null|      null|      null|        F|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|             null|          n

#### Pergunta 8

In [72]:
df_flights = df_flights.withColumn("qa_origin_dest", 
                                   when(check_empty_column('origin'), 'MO'))

df_flights.filter(df_flights.qa_origin_dest == 'MO').show()
df_flights.groupBy('origin').count().show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+

+------+-----+
|origin|count|
+------+-----+
|   SEA| 6754|
|   PDX| 3246|
+------+----

In [73]:
df_flights = df_flights.withColumn("qa_origin_dest", 
                                   when(check_empty_column('dest'), 'MD')
                                   .otherwise(df_flights.qa_origin_dest))

df_flights.filter(df_flights.qa_origin_dest == 'MD').show()
df_flights.groupBy('dest').count().show(100)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+

+----+-----+
|dest|count|
+----+-----+
| MSY|    9|
| GEG|  105|
| BUR|  137|
| SNA|  1

In [74]:
df_flights = df_flights.withColumn("qa_origin_dest", 
                                   when(~df_flights.origin.rlike('^([0-9]|[A-Z]|[a-z]){3}$'),'FO')
                                   .otherwise(df_flights.qa_origin_dest))

df_flights.filter(df_flights.qa_origin_dest == 'FO').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+



In [75]:
df_flights = df_flights.withColumn("qa_origin_dest", 
                                   when(~df_flights.dest.rlike('^([0-9]|[A-Z]|[a-z]){3}$'),'FD')
                                   .otherwise(df_flights.qa_origin_dest))

df_flights.filter(df_flights.qa_origin_dest == 'FD').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                origin IS NULL OR origin == ''
                                THEN 'MO'
                            WHEN 
                                dest IS NULL OR dest == ''
                                THEN 'MD'
                            WHEN
                                origin NOT rlike '^([0-9]|[A-Z]|[a-z]){3}$'
                                THEN 'FO'
                            WHEN
                                dest NOT rlike '^([0-9]|[A-Z]|[a-z]){3}$'
                                THEN 'FD'
                        END AS qa_origin_dest
                    FROM flights
""")

df1.filter(df1.qa_origin_dest.isin(['MO','MD','FO','FD'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+---

#### Pergunta 9

In [76]:
df_flights = df_flights.withColumn("qa_air_time", 
                                   when(check_empty_column('air_time') |
                                        col('air_time').startswith('N'),'M'))

df_flights.filter(df_flights.qa_air_time == 'M').show()
df_flights.filter(col('air_time').rlike('^[^0-9]*$')).groupBy('air_time').count().show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+
|2014|    4|  6|    1329|        4|    2159|     null|     DL| N130DL|  1929|   SEA| ATL|    null|    2182|  13|    29|             null|          null|      null|              MA|      null|      null|     null|          null|          M|
|2014|    3|  4|      NA|     null|     

In [77]:
df_flights = df_flights.withColumn("qa_air_time", 
                                   when((df_flights.air_time < 20) | (df_flights.air_time > 500),'I')
                                   .otherwise(df_flights.qa_air_time))

df_flights.filter(df_flights.qa_air_time == 'I').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                air_time IS NULL OR air_time == ''
                                THEN 'M'
                            WHEN
                                air_time < 20 OR air_time > 500
                                THEN 'I'
                        END AS qa_air_time
                    FROM flights

""")

df1.filter(df1.qa_air_time.isin(['M','I'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+

+----+-----+---+--------+---------+----

#### Pergunta 10

In [78]:
df_flights = df_flights.withColumn("qa_distance", 
                                   when(check_empty_column('distance'),'M'))

df_flights.filter(df_flights.qa_distance == 'M').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+-----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|qa_distance|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+-----------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+-----

In [79]:
df_flights = df_flights.withColumn("qa_distance", 
                                   when((df_flights.distance < 50) | (df_flights.distance > 3000),'I')
                                   .otherwise(df_flights.qa_distance))

df_flights.filter(df_flights.qa_distance == 'I').show()


df1 = spark.sql("""
                    SELECT *,
                        CASE
                            WHEN
                                distance IS NULL OR distance == ''
                                THEN 'M'
                            WHEN
                                distance < 50 OR distance > 3000
                                THEN 'I'
                        END AS qa_distance
                    FROM flights

""")

df1.filter(df1.qa_distance.isin(['M','I'])).show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+-----------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_year_month_day|qa_hour_minute|qa_dep_arr|qa_dep_arr_delay|qa_carrier|qa_tailnum|qa_flight|qa_origin_dest|qa_air_time|qa_distance|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+-----------+
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-----------------+--------------+----------+----------------+----------+----------+---------+--------------+-----------+-----

#### Pergunta 11

In [80]:
df_flights = df_flights.withColumn("qa_distance_airtime", 
                                   when(check_empty_column('distance') |
                                       check_empty_column('air_time'), 'M')
                                   .when(col("air_time") >= (col("distance") * 0.1) + 30, 'TL')
                                   .when(col("air_time") <= (col("distance") * 0.1) + 10, 'TS')
                                   .otherwise('TR'))

#df_flights.filter(df_flights.qa_distance_airtime == 'M').show()
df_flights.filter(col('distance').rlike('^[^0-9]*$')).groupBy('distance').count().show()

+--------+-----+
|distance|count|
+--------+-----+
+--------+-----+



In [81]:
df1 = spark.sql("""
                    SELECT *,
                        CASE 
                            WHEN 
                                air_time IS NULL OR
                                air_time == '' OR 
                                distance IS NULL OR 
                                distance == '' 
                                THEN 'M'
                            WHEN
                                air_time >= (distance * 0.1) + 30
                                THEN 'TL'
                                ELSE
                                    CASE 
                                        WHEN
                                            air_time <= (distance * 0.1) + 10
                                            THEN 'TS'
                                            ELSE 'TR'
                                    END
                        END AS qa_distance_airtime
                    FROM flights
""")

df1.filter(df1.qa_distance_airtime == 'TR').show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|qa_distance_airtime|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+-------------------+
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|                 TR|
|2014|    1| 15|    1037|        7|    1352|        2|     WN| N646SW|    48|   PDX| DEN|     121|     991|  10|    37|                 TR|
|2014|   11| 19|    1812|       -3|    2352|       -4|     AS| N564AS|    26|   SEA| ORD|     198|    1721|  18|    12|                 TR|
|2014|   12| 17|    2015|       50|    2150|       41|     AS| N626AS|   368|   SEA| SMF|      76|     605|  20|    15|                 TR|
|2014|    6|  5|    

In [82]:
df_flights_qa = df_flights.select("origin",
                                  "dest",
                                  "tailnum",
                                  "qa_year_month_day",
                                 "qa_hour_minute",
                                 "qa_dep_arr",
                                 "qa_dep_arr_delay",
                                 "qa_carrier",
                                 "qa_tailnum",
                                 "qa_flight",
                                 "qa_origin_dest",
                                 "qa_air_time",
                                 "qa_distance",
                                 "qa_distance_airtime")
                                 
df_flights_qa.write.mode('overwrite').parquet('output/flights_qa.parquet')