In [1]:
# Install required packages
!pip install pymongo
!pip install graphframes



In [1]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from math import sqrt
from pymongo import MongoClient
from pyspark.sql.functions import avg, collect_list, struct, explode, desc, col
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from graphframes import GraphFrame

In [2]:
# Define MongoDB connection settings
mongo_url = 'mongodb://localhost:27017/flightdb.flightdata'


In [3]:
# Create a spark session
spark = SparkSession.builder.appName("myApp").config("spark.mongodb.input.uri", mongo_url) \
    .config("spark.mongodb.output.uri", mongo_url) \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1') \
    .getOrCreate()

23/05/14 20:02:50 WARN Utils: Your hostname, Sriharshas-Laptop.local resolves to a loopback address: 127.0.0.1; using 10.0.0.20 instead (on interface en0)
23/05/14 20:02:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/sriharsha.nemalikonda/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/sriharsha.nemalikonda/.ivy2/cache
The jars for the packages stored in: /Users/sriharsha.nemalikonda/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0fea74ca-ef66-4dd5-8fd9-882e8f416f4e;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 in central
	found org.mongodb#mongodb-driver-sync;4.0.5 in central
	found org.mongodb#bson;4.0.5 in central
	found org.mongodb#mongodb-driver-core;4.0.5 in central
:: resolution report :: resolve 428ms :: artifacts dl 20ms
	:: modules in use:
	org.mongodb#bson;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-core;4.0.5 from central in [default]
	org.mongodb#mongodb-driver-sync;4.0.5 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;3.0.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            m

In [4]:
# Load data from MongoDB
df = spark.read.format("mongo").load()


                                                                                

In [5]:
# Print schema of the spark dataframe
df.printSchema()

root
 |-- ACTUAL_ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- ARR_DELAY: integer (nullable = true)
 |-- ARR_TIME: integer (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- CANCELLED: integer (nullable = true)
 |-- CARRIER_DELAY: integer (nullable = true)
 |-- CRS_ARR_TIME: integer (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- CRS_ELAPSED_TIME: integer (nullable = true)
 |-- DEP_DELAY: integer (nullable = true)
 |-- DEP_TIME: integer (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- DIVERTED: integer (nullable = true)
 |-- FL_DATE: string (nullable = true)
 |-- LATE_AIRCRAFT_DELAY: integer (nullable = true)
 |-- NAS_DELAY: integer (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- SECURITY_DELAY: integer (nullable = true)
 |-- TAXI_IN: integer (nullable 

In [8]:
# Check the available columns
df.columns

['ACTUAL_ELAPSED_TIME',
 'AIR_TIME',
 'ARR_DELAY',
 'ARR_TIME',
 'CANCELLATION_CODE',
 'CANCELLED',
 'CARRIER_DELAY',
 'CRS_ARR_TIME',
 'CRS_DEP_TIME',
 'CRS_ELAPSED_TIME',
 'DEP_DELAY',
 'DEP_TIME',
 'DEST',
 'DISTANCE',
 'DIVERTED',
 'FL_DATE',
 'LATE_AIRCRAFT_DELAY',
 'NAS_DELAY',
 'OP_CARRIER',
 'OP_CARRIER_FL_NUM',
 'ORIGIN',
 'SECURITY_DELAY',
 'TAXI_IN',
 'TAXI_OUT',
 'WEATHER_DELAY',
 'WHEELS_OFF',
 'WHEELS_ON',
 '_id']

In [8]:
# Display the 20 rows of the dataframe
df.show(20)

23/05/13 23:38:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------------------+--------+---------+--------+-----------------+---------+-------------+------------+------------+----------------+---------+--------+----+--------+--------+----------+-------------------+---------+----------+-----------------+------+--------------+-------+--------+-------------+----------+---------+--------------------+
|ACTUAL_ELAPSED_TIME|AIR_TIME|ARR_DELAY|ARR_TIME|CANCELLATION_CODE|CANCELLED|CARRIER_DELAY|CRS_ARR_TIME|CRS_DEP_TIME|CRS_ELAPSED_TIME|DEP_DELAY|DEP_TIME|DEST|DISTANCE|DIVERTED|   FL_DATE|LATE_AIRCRAFT_DELAY|NAS_DELAY|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|SECURITY_DELAY|TAXI_IN|TAXI_OUT|WEATHER_DELAY|WHEELS_OFF|WHEELS_ON|                 _id|
+-------------------+--------+---------+--------+-----------------+---------+-------------+------------+------------+----------------+---------+--------+----+--------+--------+----------+-------------------+---------+----------+-----------------+------+--------------+-------+--------+-------------+----------+------

Retrieve the number of airports present in the dataframe

In [9]:
# Select the distinct airport codes from the ORIGIN column
airports_list = df.select("ORIGIN").distinct().rdd.flatMap(lambda x: x).collect()

# Print the list of airports
print("List of Airports :", airports_list)
print('Number of airports available :',len(airports_list))



[Stage 4:>                                                          (0 + 4) / 4]

List of Airports : ['MSY', 'GEG', 'BUR', 'SNA', 'GRB', 'GTF', 'IDA', 'GRR', 'EUG', 'GSO', 'PVD', 'MYR', 'OAK', 'MSN', 'COD', 'BTM', 'FAR', 'DCA', 'CID', 'LWS', 'HLN', 'LEX', 'RDM', 'ORF', 'CRW', 'CWA', 'SAV', 'CMH', 'MOD', 'CAK', 'MOB', 'PNS', 'CEC', 'LIH', 'IAH', 'HNL', 'SHV', 'CVG', 'SJC', 'BUF', 'LGA', 'CDC', 'BGM', 'PSE', 'PSG', 'FSM', 'MQT', 'SCC', 'MLU', 'GTR', 'WRG', 'FLO', 'SCE', 'EVV', 'KTN', 'TRI', 'CDV', 'TYR', 'ADK', 'CHO', 'ERI', 'TOL', 'TLH', 'LWB', 'MWH', 'DLG', 'GST', 'UTM', 'SPN', 'RFD', 'INL', 'PPG', 'IFP', 'PVU', 'JLN', 'ESC', 'SWO', 'PIB', 'CIU', 'IAG', 'JMS', 'GCK', 'YNG', 'OGS', 'LBF', 'SLN', 'EAR', 'DRT', 'DIK', 'MCN', 'ENV', 'RDD', 'HPN', 'AUS', 'GCC', 'MLI', 'SJU', 'ATW', 'AVL', 'GJT', 'LGB', 'BFL', 'RNO', 'SRQ', 'SBN', 'JAC', 'CHS', 'RSW', 'TUL', 'IPL', 'HRL', 'AMA', 'ISP', 'BOS', 'MAF', 'EWR', 'LAS', 'BIS', 'JAN', 'ITO', 'XNA', 'DLH', 'DEN', 'SGU', 'ALB', 'CPR', 'LNK', 'IAD', 'PSP', 'SBA', 'BOI', 'IYK', 'BRO', 'DRO', 'RKS', 'SEA', 'LAN', 'LRD', 'MCI', 'FLG', 

                                                                                

In [10]:
# Considering the 20 airports from the list as provided in the project description
airports= ['IAD', 'ORD', 'LAS', 'BWI', 'SLC', 'BNA', 'JFK', 'SEA', 'CVG', 'BOS', 'HOU', 'DTW', 'DEN', 'ORL', 'PIT', 'MIA', 'LAX', 'AUS', 'ABQ', 'SDF', 'MRY']


## Airports with largest number of Departures

In [11]:
# Group the data by the airport and count the number of departures
airport_departures = df.groupBy("ORIGIN").count()

# Sort the airports by the number of departures in descending order
sorted_airports = airport_departures.orderBy(desc("count"))

# Display the airports with the largest number of departures
# sorted_airports.show(10)

# Get the top 10 airports with the largest number of departures
top_airports = sorted_airports.limit(10).toPandas()

# Extract the airport codes and departure counts
airports = top_airports["ORIGIN"].tolist()
departure_counts = top_airports["count"].tolist()

# Create a bar chart using Plotly
fig = go.Figure(data=go.Bar(x=airports, y=departure_counts))

# Customize the chart layout
fig.update_layout(
    title="10 Airports with Largest Departures",
    xaxis_title="Airports",
    yaxis_title="Number of Departures",
    xaxis_tickangle=-45,
)

# Display the chart
fig.show()

print('Airport with largest number of departures:', airports[0])

                                                                                

Airport with largest number of departures: ATL


## Airports with largest number of Arrivals

In [12]:
# Group the data by the airport and count the number of arrivals
airport_arrivals = df.groupBy("DEST").count()

# Sort the airports by the number of departures in arrivals order
sorted_airports = airport_arrivals.orderBy(desc("count"))

# Display the airports with the largest number of arrivals
# sorted_airports.show(10)

# Get the top 10 airports with the largest number of arrivals
top_airports = sorted_airports.limit(10).toPandas()

# Extract the airport codes and arrivals counts
airports = top_airports["DEST"].tolist()
arrival_counts = top_airports["count"].tolist()

# Create a bar chart using Plotly
fig = go.Figure(data=go.Bar(x=airports, y=arrival_counts))

# Customize the chart layout
fig.update_layout(
    title="10 Airports with Largest Arrivals",
    xaxis_title="Airports",
    yaxis_title="Number of Arrivals",
    xaxis_tickangle=-45,
)

# Display the chart
fig.show()

print('Airport with largest number of arrivals:', airports[0])

                                                                                

Airport with largest number of arrivals: ATL


## Busiest Airports

In [11]:
# Group the data by the airport and count the number of departures and arrivals
airport_traffic = df.groupBy("ORIGIN", "DEST").count()

# Calculate the total traffic for each airport (sum of departures and arrivals)
airport_total_traffic = airport_traffic.groupBy("ORIGIN").sum("count")

# Sort the airports by the total traffic in descending order
sorted_airports = airport_total_traffic.orderBy(desc("sum(count)"))

# Display the busiest airport
busiest_airport = sorted_airports.first()
print("Busiest Airport:", busiest_airport["ORIGIN"])

[Stage 23:>                                                         (0 + 1) / 1]

Busiest Airport: ATL


                                                                                

In [7]:

# Convert the Spark DataFrame to a Pandas DataFrame
airport_traffic_pd = airport_traffic.toPandas()

# Plot the number of flights using Plotly
fig = px.bar(airport_traffic_pd, x="ORIGIN", y="count", color="DEST",
             labels={"ORIGIN": "Origin", "count": "Number of Flights", "DEST": "Destination"},
             title="Number of Flights between Origin and Destination")

# Customize the layout
fig.update_layout(xaxis_tickangle=-45)

# Display the chart
fig.show()

                                                                                

In [10]:
# Get the top 5 busiest airports
top_airports = sorted_airports.limit(10).toPandas()

# Extract the airport codes and total traffic counts
airports = top_airports["ORIGIN"].tolist()
traffic_counts = top_airports["sum(count)"].tolist()

# Calculate the percentage of traffic for each airport
total_traffic = sum(traffic_counts)
percentages = [(count / total_traffic) * 100 for count in traffic_counts]

# Create a pie chart using Plotly
fig = go.Figure(data=go.Pie(labels=airports, values=percentages))

# Customize the chart layout
fig.update_layout(title="Percentage of Traffic for 10 Busiest Airports")

# Display the chart
fig.show()


## Airports with Depature Delays

In [6]:
# Filter the dataframe to include only the rows with departure delays
delayed_flights = df.filter(df.DEP_DELAY > 0)

# Group the data by the airport and count the number of departure delays
departure_delays = delayed_flights.groupBy("ORIGIN").count()

# Sort the airports by the number of departure delays in descending order
sorted_airports = departure_delays.orderBy(desc("count"))

# Display the top 10 airports with the most departure delays
# sorted_airports.show(10)

# Get the top 10 airports with the highest number of departure delays
top_airports = sorted_airports.limit(10).toPandas()

# Extract the airport codes and departure delay counts
airports = top_airports["ORIGIN"].tolist()
delay_counts = top_airports["count"].tolist()

# Create a bar chart using Plotly
fig = go.Figure(data=go.Bar(x=airports, y=delay_counts))

# Customize the chart layout
fig.update_layout(
    title="Top 10 Airports with Departure Delays",
    xaxis_title="Airports",
    yaxis_title="Number of Departure Delays",
    xaxis_tickangle=-45,
)

# Display the chart
fig.show()

                                                                                

## Airports with Arrival Delays

In [7]:
# Filter the dataframe to include only the rows with arrival delays
delayed_flights = df.filter(df.ARR_DELAY > 0)

# Group the data by the airport and count the number of arrival delays
arrival_delays = delayed_flights.groupBy("DEST").count()

# Sort the airports by the number of arrival delays in descending order
sorted_airports = arrival_delays.orderBy(desc("count"))

# Display the top 10 airports with the most arrival delays
top_airports = sorted_airports.limit(10).toPandas()

# Extract the airport codes and arrival delay counts
airports = top_airports["DEST"].tolist()
delay_counts = top_airports["count"].tolist()

# Create a bar chart using Plotly
fig = go.Figure(data=go.Bar(x=airports, y=delay_counts))

# Customize the chart layout
fig.update_layout(
    title="Top 10 Airports with Arrival Delays",
    xaxis_title="Airports",
    yaxis_title="Number of Arrival Delays",
    xaxis_tickangle=-45,
)

# Display the chart
fig.show()

                                                                                

### Grouping: Calculating the average delay per day, for every Airport

In [6]:
# Group by 'ORIGIN' and 'FL_DATE' and calculate the average of 'ARR_DELAY'
result = df.groupBy("ORIGIN", "FL_DATE").agg(avg("ARR_DELAY").alias("averageArrDelay"))

# Group the result by 'ORIGIN' and collect 'FL_DATE' and 'averageArrDelay' as an array
final_result = result.groupBy("ORIGIN").agg(collect_list(struct("FL_DATE", "averageArrDelay")).alias("dates"))

# Unpack the array into separate columns
unpacked_result = final_result.select(
    "ORIGIN",
    explode("dates").alias("FL_ARR_DELAY")
).select(
    "ORIGIN",
    col("FL_ARR_DELAY.FL_DATE").alias("FL_DATE"),
    col("FL_ARR_DELAY.averageArrDelay").alias("ARR_DELAY")
)

# Sort the result by 'FL_DATE' and 'ORIGIN' in ascending order
sorted_result = unpacked_result.orderBy("ORIGIN", "FL_DATE")

# Show the final result
sorted_result.show(10)

[Stage 3:>                                                          (0 + 1) / 1]

+------+-------+-------------------+
|ORIGIN|FL_DATE|          ARR_DELAY|
+------+-------+-------------------+
|   ABE| 1/1/09|               -5.5|
|   ABE| 1/2/09|                3.0|
|   ABE| 1/3/09|               29.0|
|   ABI| 1/1/09|-14.285714285714286|
|   ABI| 1/2/09| -6.857142857142857|
|   ABI| 1/3/09| -6.833333333333333|
|   ABQ| 1/1/09|0.20833333333333334|
|   ABQ| 1/2/09|  7.559633027522936|
|   ABQ| 1/3/09|               7.33|
|   ABY| 1/1/09|              -19.0|
+------+-------+-------------------+
only showing top 10 rows



                                                                                

## ML Model to predict the Arrival Delay

In [7]:

# Filter out rows with null values in the target column (ARR_DELAY)
filtered_df = df.filter(df.ARR_DELAY.isNotNull())

# Select relevant features for prediction
selected_df = filtered_df.select("CRS_ARR_TIME", "CRS_DEP_TIME", "DEP_DELAY", "DISTANCE","AIR_TIME", "DEP_TIME","ARR_DELAY")

# Split the dataset into training and testing sets
(train_df, test_df) = selected_df.randomSplit([0.7, 0.3], seed=42)


In [8]:

# Create a feature vector by combining input features
assembler = VectorAssembler(inputCols=[ 'ARR_DELAY',"DEP_TIME","AIR_TIME", "DEP_DELAY", "DISTANCE"], outputCol="features")
assembled_train_df = assembler.transform(train_df)

# Initialize and train the Random Forest Regressor model
rf = RandomForestRegressor(labelCol="ARR_DELAY", featuresCol="features")
model = rf.fit(assembled_train_df)

# Make predictions on the test set
assembled_test_df = assembler.transform(test_df)
predictions = model.transform(assembled_test_df)

# Display predicted delays and actual delays
predictions.select("features", "ARR_DELAY", "prediction").show(10)


                                                                                

+--------------------+---------+-------------------+
|            features|ARR_DELAY|         prediction|
+--------------------+---------+-------------------+
|[48.0,2156.0,213....|       48| 44.579181421817665|
|[-12.0,2110.0,138...|      -12| -8.005874580176892|
|[-12.0,2113.0,82....|      -12| -10.99608685033416|
|[22.0,2148.0,78.0...|       22| 27.363640686090612|
|[32.0,2251.0,56.0...|       32| 22.190164309890456|
|[-13.0,1852.0,152...|      -13|  -9.61463433943352|
|[-7.0,1855.0,153....|       -7|-4.0308530217675385|
|[22.0,2144.0,128....|       22|  22.50509606028022|
|[10.0,2146.0,121....|       10|  2.485768025382524|
|[1.0,2200.0,101.0...|        1| 1.5879432539370817|
+--------------------+---------+-------------------+
only showing top 10 rows



In [9]:
# Define the evaluation metric as Root Mean Squared Error
evaluator = RegressionEvaluator(labelCol="ARR_DELAY", predictionCol="prediction", metricName="rmse")

# Calculate the RMSE
rmse = evaluator.evaluate(predictions)

# Print the RMSE
print("Root Mean Squared Error (RMSE):", rmse)


[Stage 22:>                                                         (0 + 1) / 1]

Root Mean Squared Error (RMSE): 14.574476414401946


                                                                                

## ML Model to predict the Cancellations

In [24]:
# Select the relevant features for the prediction
selected_features = [ 'DISTANCE']

# Create a vector assembler to combine the selected features into a feature vector column
assembler = VectorAssembler(inputCols=selected_features, outputCol='features')

# Transform the dataset to include the feature vector column
dataset = assembler.transform(df).select('features', 'CANCELLED')

# Split the dataset into training and test sets
(train_data, test_data) = dataset.randomSplit([0.7, 0.3], seed=42)

# Create a Decision Tree Classifier model
dt = DecisionTreeClassifier(featuresCol='features', labelCol='CANCELLED')

# Train the model on the training data
model = dt.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Show the predicted cancellations
predictions.select("features", 'CANCELLED', 'prediction').show()

+--------+---------+----------+
|features|CANCELLED|prediction|
+--------+---------+----------+
|  [31.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [49.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [56.0]|        0|       0.0|
|  [64.0]|        0|       0.0|
+--------+---------+----------+
only showing top 20 rows



In [26]:
# Define the evaluation metric as Root Mean Squared Error
evaluator = RegressionEvaluator(labelCol="CANCELLED", predictionCol="prediction", metricName="rmse")

# Calculate the RMSE
rmse = evaluator.evaluate(predictions)

# Print the RMSE
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.10404798103595465


### GraphX to determine indegrees for Airports

In [6]:
# Select the relevant columns for airports and flights
airport_data = df.select(col("ORIGIN").alias("id")).distinct()
flight_data = df.select(col("ORIGIN").alias("src"), col("DEST").alias("dst"))

# Create vertices DataFrame for airports
vertices = airport_data

# Create edges DataFrame for flights
edges = flight_data

# Create a GraphFrame
graph = GraphFrame(vertices, edges)

# Perform graph operations using GraphX API for calculating the in-degree of airports
in_degree = graph.inDegrees

# Show the in-degree of airports
in_degree.show(10)



+---+--------+
| id|inDegree|
+---+--------+
|BGM|    5497|
|PSE|    7906|
|MSY|  422540|
|GEG|  110422|
|BUR|  246970|
|SNA|  413826|
|GRB|   50912|
|GTF|   18918|
|IDA|   27094|
|GRR|  120614|
+---+--------+
only showing top 10 rows



                                                                                

23/05/14 20:54:04 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 195570 ms exceeds timeout 120000 ms
23/05/14 20:54:04 WARN SparkContext: Killing executors is not supported by current scheduler.
23/05/14 20:54:10 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:641)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1111)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:244)
	at sc

 In Graph Theory, the in-degree of a vertex (in this case, an airport) represents the number of incoming edges (flights) to that vertex.

## Number of flights between Airports

In [12]:
# Extract the ORIGIN, DEST, and count columns
origin_airports = airport_traffic.select("ORIGIN").rdd.flatMap(lambda x: x).collect()
dest_airports = airport_traffic.select("DEST").rdd.flatMap(lambda x: x).collect()
flight_counts = airport_traffic.select("count").rdd.flatMap(lambda x: x).collect()

# Create a heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=flight_counts,
    x=dest_airports,
    y=origin_airports,
    colorscale='Viridis',
))

# Customize the chart layout
fig.update_layout(
    title="Number of Flights between ORIGIN and DEST",
    xaxis_title="DEST Airports",
    yaxis_title="ORIGIN Airports",
     width=3000,  
    height=3000,  
)

# Display the chart
fig.show()

                                                                                