In [1]:
from FlightRadar24.api import FlightRadar24API
# Single API client instance reused by the data-loading cells below.
fr_api = FlightRadar24API()

In [2]:
# Pull the airports, airlines, flights and zones through the API client.
# NOTE(review): presumably live network calls, so the results (flights in
# particular) will differ from one run to the next — confirm.
airports = fr_api.get_airports()
airlines = fr_api.get_airlines()
flights = fr_api.get_flights()
zones = fr_api.get_zones()  # NOTE(review): not referenced in the visible cells

In [3]:
# Take one flight from the list and attach its detailed record to it.
# NOTE(review): index 100 is arbitrary and raises IndexError whenever fewer
# than 101 flights are returned — confirm whether a specific flight was meant.
flight = flights[100]
details = fr_api.get_flight_details(flight.id)
flight.set_flight_details(details)  # return value unused; presumably updates `flight` in place — confirm
# Disabled leftovers that inspected the destination attribute:
#flight.destination_airport_name
#print("Flying to", flight.destination_airport_name)

In [4]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SparkSession, Row
import pydeequ


Please set env variable SPARK_VERSION


In [6]:
# Build (or reuse) the Spark session, passing pydeequ's Maven coordinates via
# the spark.jars.packages / spark.jars.excludes settings.
# NOTE(review): the warning recorded under this cell says an existing session
# was reused and that only runtime SQL configurations take effect, so the
# spark.jars.* settings were presumably ignored in that run. That would explain
# the later "'JavaPackage' object is not callable" error — restart the kernel
# so this is the first session created, then confirm.
# NOTE(review): the import cell printed "Please set env variable SPARK_VERSION";
# pydeequ presumably uses it to choose deequ_maven_coord — set it before
# importing pydeequ and confirm.
spark = (SparkSession
            .builder
            .config("spark.jars.packages", pydeequ.deequ_maven_coord)
            .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
            .getOrCreate())


23/05/08 11:49:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
# No schema is passed, so Spark infers the columns from the airline records;
# then preview the first ten rows.
df_airlines = spark.createDataFrame(airlines)
df_airlines.show(10)

                                                                                

+----+----+------------------+
|Code|ICAO|              Name|
+----+----+------------------+
|  2I| CSB|            21 Air|
|  4Q| ONY|   25only Aviation|
|    | BRO|   2Excel Aviation|
|  Q5| MLA|       40-Mile Air|
|  FE| IHO|  748 Air Services|
|  AQ| JYH|             9 Air|
|  S5| NKP|        Abakan Air|
|    | ABP|          ABS Jets|
|    | BAR|Abu Dhabi Aviation|
|  GB| ABX|           ABX Air|
+----+----+------------------+
only showing top 10 rows



La colonne 'alt' contient quelques valeurs, comme "-1", qui sont typées comme chaînes de caractères (string) et non comme entiers (int).
Les colonnes 'lon' et 'lat' contiennent quelques valeurs typées comme entiers (int) et non comme flottants (float), contrairement à la majorité.

In [8]:
def convert_airports(lst):
    """Return copies of the airport records with their numeric columns coerced.

    The ``alt`` value is converted with ``int`` and the ``lat`` / ``lon`` values
    with ``float`` so that every record carries one consistent Python type per
    column. All other keys are copied unchanged and key order is preserved.
    ``None`` values are passed through untouched, which matches the nullable
    columns of the airport schema instead of raising ``TypeError``.

    Args:
        lst: Iterable of dict-like airport records (anything with ``.items()``).

    Returns:
        A list of new dicts; the input records are not modified.

    Raises:
        ValueError, TypeError: If a non-``None`` value in a numeric column
            cannot be converted by ``int`` / ``float``.
    """
    # Column name -> constructor used to normalise that column's values.
    casts = {"alt": int, "lat": float, "lon": float}

    def _coerce(key, value):
        # Coerce only the known numeric columns, and never a missing value.
        cast = casts.get(key)
        if cast is None or value is None:
            return value
        return cast(value)

    return [
        {key: _coerce(key, value) for key, value in record.items()}
        for record in lst
    ]

In [9]:
# Normalise the raw airport records first so each value matches its declared
# column type.
airports_typed_columns = convert_airports(airports)

# Explicit schema for the airport records; every column is nullable.
# NOTE(review): FloatType is single precision — DoubleType may suit the
# coordinates better; confirm before changing.
schema_airports = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("name", StringType()),
        ("iata", StringType()),
        ("icao", StringType()),
        ("country", StringType()),
        ("lat", FloatType()),
        ("lon", FloatType()),
        ("alt", IntegerType()),
    ]
])
df_airports = spark.createDataFrame(airports_typed_columns, schema=schema_airports)
df_airports.show(10)

+--------------------+----+----+--------------+---------+---------+---+
|                name|iata|icao|       country|      lat|      lon|alt|
+--------------------+----+----+--------------+---------+---------+---+
|    A Coruna Airport| LCG|LECO|         Spain| 43.30206| -8.37725|326|
|Aachen Merzbruck ...| AAH|EDKA|       Germany| 50.82305| 6.186111|623|
|     Aalborg Airport| AAL|EKYT|       Denmark| 57.09278| 9.849164| 10|
|      Aarhus Airport| AAR|EKAH|       Denmark| 56.30001|   10.619| 82|
|  Aarhus Sea Airport| QEA|EKAC|       Denmark|56.151993|10.247725|  1|
|     Aasiaat Airport| JEG|BGAA|     Greenland| 68.72184| -52.7847| 74|
|      Abadan Airport| ABD|OIAA|          Iran| 30.37111| 48.22833| 19|
|Abakan Internatio...| ABA|UNAA|        Russia|    53.74|   91.385|831|
|Abbotsford Intern...| YXX|CYXX|        Canada| 49.02527|  -122.36|195|
|Aberdeen Internat...| ABZ|EGPD|United Kingdom| 57.20194| -2.19777|215|
+--------------------+----+----+--------------+---------+-------

In [10]:
def convert_flights(lst):
    """Collect the attribute dictionary of every object in ``lst``.

    Args:
        lst: Iterable of objects exposing ``__dict__``.

    Returns:
        A list with each object's own ``__dict__`` (the live mapping, not a
        copy), in input order.
    """
    attribute_dicts = []
    for flight_obj in lst:
        attribute_dicts.append(flight_obj.__dict__)
    return attribute_dicts

In [11]:
# Preview a few converted flight records instead of dumping the whole list.
# Only the last expression of a cell is displayed, so the earlier bare
# `type(flights)` and `flights[0].__dict__` probes never produced any output.
convert_flights(flights)[:3]

[{'id': '300b87c8',
  'icao_24bit': 'D0194C',
  'latitude': 51.6707,
  'longitude': 8.8882,
  'heading': 89,
  'altitude': 1709,
  'ground_speed': 1,
  'squawk': 'N/A',
  'aircraft_code': 'GLID',
  'registration': 'WINDRAD',
  'time': 1683539284,
  'origin_airport_iata': 'N/A',
  'destination_airport_iata': 'N/A',
  'number': 'N/A',
  'airline_iata': 'N/A',
  'on_ground': 0,
  'vertical_speed': 0,
  'callsign': 'WINDRAD',
  'airline_icao': 'N/A'},
 {'id': '302381be',
  'icao_24bit': 'AD08C9',
  'latitude': 35.0924,
  'longitude': -99.4221,
  'heading': 270,
  'altitude': 66300,
  'ground_speed': 13,
  'squawk': 'N/A',
  'aircraft_code': 'BALL',
  'registration': 'N939TH',
  'time': 1683539323,
  'origin_airport_iata': 'N/A',
  'destination_airport_iata': 'N/A',
  'number': 'N/A',
  'airline_iata': 'N/A',
  'on_ground': 0,
  'vertical_speed': -128,
  'callsign': 'HBAL628',
  'airline_icao': 'N/A'},
 {'id': '302d3702',
  'icao_24bit': 'D0123C',
  'latitude': 51.8038,
  'longitude': 9.277

In [17]:
# Explicit schema for the flight records; every column is nullable.
# NOTE(review): `time` looks like epoch seconds stored in a 32-bit IntegerType,
# which such values outgrow in 2038 — consider LongType; confirm.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("id", StringType()),
        ("icao_24bit", StringType()),
        ("latitude", FloatType()),
        ("longitude", FloatType()),
        ("heading", IntegerType()),
        ("altitude", IntegerType()),
        ("ground_speed", IntegerType()),
        ("squawk", StringType()),
        ("aircraft_code", StringType()),
        ("registration", StringType()),
        ("time", IntegerType()),
        ("origin_airport_iata", StringType()),
        ("destination_airport_iata", StringType()),
        ("number", StringType()),
        ("airline_iata", StringType()),
        ("on_ground", IntegerType()),
        ("vertical_speed", IntegerType()),
        ("callsign", StringType()),
        ("airline_icao", StringType()),
    ]
])
# The flight objects are handed to Spark as-is; the output of convert_flights
# is not used here.
df_flights = spark.createDataFrame(flights, schema=schema)

df_flights.show(5)

+--------+----------+--------+---------+-------+--------+------------+------+-------------+------------+----------+-------------------+------------------------+------+------------+---------+--------------+--------+------------+
|      id|icao_24bit|latitude|longitude|heading|altitude|ground_speed|squawk|aircraft_code|registration|      time|origin_airport_iata|destination_airport_iata|number|airline_iata|on_ground|vertical_speed|callsign|airline_icao|
+--------+----------+--------+---------+-------+--------+------------+------+-------------+------------+----------+-------------------+------------------------+------+------------+---------+--------------+--------+------------+
|300b87c8|    D0194C| 51.6707|   8.8882|     89|    1709|           1|   N/A|         GLID|     WINDRAD|1683539284|                N/A|                     N/A|   N/A|         N/A|        0|             0| WINDRAD|         N/A|
|302381be|    AD08C9| 35.0924| -99.4221|    270|   66300|          13|   N/A|         BA

In [18]:
# Preview the first five airline rows.
df_airlines.show(5)

+----+----+----------------+
|Code|ICAO|            Name|
+----+----+----------------+
|  2I| CSB|          21 Air|
|  4Q| ONY| 25only Aviation|
|    | BRO| 2Excel Aviation|
|  Q5| MLA|     40-Mile Air|
|  FE| IHO|748 Air Services|
+----+----+----------------+
only showing top 5 rows



In [20]:
from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, ApproxCountDistinct, Completeness, Compliance, Mean, Size

# Run a Deequ analysis over the airlines DataFrame; only the row-count (Size)
# analyzer is enabled for now.
# NOTE(review): the recorded run of this cell fails with
# "TypeError: 'JavaPackage' object is not callable". Presumably the Deequ jar
# never reached the JVM classpath because the Spark session was reused rather
# than created with spark.jars.packages — confirm after a kernel restart with
# SPARK_VERSION set.
analysisResult = (
    AnalysisRunner(spark)
    .onData(df_airlines)
    .addAnalyzer(Size())
    # Currently disabled analyzers:
    # .addAnalyzer(Completeness("Code"))
    # .addAnalyzer(ApproxCountDistinct("Code"))
    .run()
)

TypeError: 'JavaPackage' object is not callable