In [None]:
%pip install kagglehub
import kagglehub

# Download latest version
path = kagglehub.dataset_download("usdot/flight-delays")

print("Path to dataset files:", path)


In [None]:
import pandas as pd
import os

# `path` comes from the download cell and is expected to be the directory
# that holds the dataset's CSV files.
# os.listdir() returns entries in arbitrary order, so sort the listing to keep
# the load order (and the printed keys) stable from run to run.
csv_files = sorted(f for f in os.listdir(path) if f.endswith(".csv"))

# Map each file's base name (e.g. "flights" for "flights.csv") to its dataframe.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns on large CSVs
# (at the cost of more memory while parsing).
dataframes = {
    os.path.splitext(file)[0]: pd.read_csv(os.path.join(path, file), low_memory=False)
    for file in csv_files
}

# Print the keys of the dictionary to see the names of the dataframes.
# Later cells look the frames up by these names, e.g. dataframes["flights"].
print(list(dataframes))



In [None]:
# Index directly instead of .get(): if "flights.csv" was not loaded this raises
# a KeyError here rather than leaving None to fail in a later cell.
df_flights = dataframes["flights"]

In [None]:
# Preview the first five rows of the flights table.
df_flights.head(n=5)

In [None]:
# (row count, column count) of the full flights table.
df_flights.shape

In [None]:
# Index directly instead of .get(): if "airlines.csv" was not loaded this raises
# a KeyError here rather than leaving None to fail in a later cell.
df_airlines = dataframes["airlines"]

In [None]:
# Preview up to twenty rows of the airlines lookup table.
df_airlines.head(n=20)

In [None]:
# Index directly instead of .get(): if "airports.csv" was not loaded this raises
# a KeyError here rather than leaving None to fail in a later cell.
df_airports = dataframes["airports"]

In [None]:
# Preview the first five rows of the airports lookup table.
df_airports.head(n=5)

In [None]:
# Draw a fixed-seed random subset of 5000 flights so the export is reproducible.
df_flights_sample = df_flights.sample(n=5000, random_state=42)

# Write the sampled flights and the two lookup tables to the working
# directory, without the pandas index column.
for csv_name, frame in (
    ("flights.csv", df_flights_sample),
    ("airlines.csv", df_airlines),
    ("airports.csv", df_airports),
):
    frame.to_csv(csv_name, index=False)

In [None]:
from pyspark.sql import SparkSession

# Obtain a SparkSession for this notebook under the application name "Example".
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)


In [None]:
# Load the exported flights sample into Spark, treating the first line as the header.
# NOTE(review): no schema or inferSchema option is given, so Spark's default
# column typing applies — confirm whether typed columns are needed.
df = spark.read.option("header", True).csv("flights.csv")

In [None]:
# Print the first rows of the Spark dataframe (20 is the default row count).
df.show(n=20)

In [11]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine
import pandas as pd

# Create PostgreSQL connection.
# Credentials are read from the standard libpq environment variables instead of
# being hard-coded in the notebook. When PGPASSWORD is not set the password is
# prompted for, so it never ends up in the saved file.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")
db_host = os.environ.get("PGHOST", "localhost")
db_name = os.environ.get("PGDATABASE", "text2sql")
# quote_plus escapes characters such as "@" or "/" that would otherwise
# corrupt the connection URL.
db_url = (
    f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}/{db_name}"
)
engine = create_engine(db_url)

# Read the CSV files written earlier in the notebook.
flights_df = pd.read_csv("flights.csv")
airlines_df = pd.read_csv("airlines.csv")
airports_df = pd.read_csv("airports.csv")

# Lower-case the column names so SQL can reference them without quoting
# (the query cell uses unquoted lower-case identifiers).
flights_df.columns = flights_df.columns.str.lower()
airlines_df.columns = airlines_df.columns.str.lower()
airports_df.columns = airports_df.columns.str.lower()

# Write dataframes to PostgreSQL. if_exists="replace" drops and recreates each
# table, so re-running this cell is safe; index=False omits the pandas index.
flights_df.to_sql("flights", engine, if_exists="replace", index=False)
airlines_df.to_sql("airlines", engine, if_exists="replace", index=False)
airports_df.to_sql("airports", engine, if_exists="replace", index=False)


print("Data successfully imported to PostgreSQL database")

Data successfully imported to PostgreSQL database


In [4]:
# Display the airlines lookup table read from airlines.csv.
# NOTE(review): the saved output shows upper-case column names although the
# import cell lower-cases them, and this cell's execution count is lower than
# the import cell's — the output looks stale; re-run in order to confirm.
airlines_df

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [5]:
# Display the airports lookup table read from airports.csv (the saved output
# is an abbreviated view with an elided middle section).
# NOTE(review): the saved output shows upper-case column names although the
# import cell lower-cases them, and this cell's execution count is lower than
# the import cell's — the output looks stale; re-run in order to confirm.
airports_df

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


In [13]:
# Preview query: ten flights with the airline code and both airport codes
# resolved to full names through the airlines and airports tables.
# The airports table is joined twice (aliases orig and dest), once per endpoint.
# These are inner joins, so flights whose codes have no match in the lookup
# tables are left out of the result.
# NOTE(review): LIMIT is used without ORDER BY, so which ten rows come back is
# presumably not guaranteed to be stable — confirm if reproducible output matters.
query = """
SELECT
    f.year,
    f.month,
    al.airline,
    orig.airport as origin_airport,
    dest.airport as destination_airport,
    f.departure_delay,
    f.arrival_delay,
    f.distance
FROM flights f
JOIN airlines al ON f.airline = al.iata_code
JOIN airports orig ON f.origin_airport = orig.iata_code
JOIN airports dest ON f.destination_airport = dest.iata_code
LIMIT 10;
"""
# Disabled alternative query, kept for reference:
# query = """
# select airline from airlines;
# """


# Run the query through the SQLAlchemy engine created in the import cell and
# show the resulting dataframe as the cell output.
result = pd.read_sql_query(query, engine)
result

Unnamed: 0,year,month,airline,origin_airport,destination_airport,departure_delay,arrival_delay,distance
0,2015,4,Atlantic Southeast Airlines,Fort Wayne International Airport,Detroit Metropolitan Airport,-5.0,-13.0,128
1,2015,1,Alaska Airlines Inc.,McCarran International Airport,Seattle-Tacoma International Airport,-12.0,-12.0,867
2,2015,7,Southwest Airlines Co.,Oakland International Airport,Seattle-Tacoma International Airport,-4.0,-8.0,672
3,2015,5,Southwest Airlines Co.,St. Louis International Airport at Lambert Field,Dallas Love Field,153.0,152.0,546
4,2015,7,United Air Lines Inc.,George Bush Intercontinental Airport,Honolulu International Airport,8.0,-2.0,3904
5,2015,9,Delta Air Lines Inc.,Huntsville International Airport,Hartsfield-Jackson Atlanta International Airport,16.0,-5.0,151
6,2015,9,United Air Lines Inc.,George Bush Intercontinental Airport,Tampa International Airport,1.0,-16.0,787
7,2015,6,Alaska Airlines Inc.,Seattle-Tacoma International Airport,Phoenix Sky Harbor International Airport,-4.0,-12.0,1107
8,2015,12,American Airlines Inc.,John F. Kennedy International Airport (New Yor...,Los Angeles International Airport,-8.0,-12.0,2475
9,2015,1,Southwest Airlines Co.,Manchester-Boston Regional Airport,Baltimore-Washington International Airport,-4.0,-27.0,377
