<a href="https://colab.research.google.com/github/meozbrls/Flight_Prices_Analysis/blob/buse/notebooks%20/Analysis%26KPI/%20airport_coordinates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# ==================================================
# 0) Gerekli kurulumlar
# ==================================================
!pip install -q gdown pyarrow

# ==================================================
# 1) Imports
# ==================================================
import gdown, os
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# ==================================================
# 2) Folder layout
# ==================================================
# "data" holds the downloaded/intermediate files, "out" the final export.
# exist_ok=True keeps the cell safe to re-run.
for folder in ("data", "out"):
    os.makedirs(folder, exist_ok=True)
print("Environment Ready: Directories created.")


Environment Ready: Directories created.


In [12]:
# Google Drive file ids and the local paths the files are cached under.
AIRPORT_FILE_ID = "1f073sUgb_YPBd3_IMlyR6sQgTqIfJJFZ"
AIRPORT_PATH = "data/us_airports.csv"

FLIGHT_FILE_ID = "1TrX_MuNS-EvrjKutCqzLnOa2W6X6Y3Dm"
FLIGHT_PATH = "data/flight_prices_clean.csv"   # reportedly a CSV file

# NOTE(review): the chunked export further down reads "data/flightprices.csv",
# not FLIGHT_PATH — confirm which flight file is the intended input.

# Fetch each file only when there is no local copy yet, so re-running the
# cell does not trigger another download.
for drive_id, local_path in (
    (AIRPORT_FILE_ID, AIRPORT_PATH),
    (FLIGHT_FILE_ID, FLIGHT_PATH),
):
    if os.path.exists(local_path):
        continue
    gdown.download(f"https://drive.google.com/uc?id={drive_id}", local_path, quiet=False)

print("Files ready:", os.listdir("data"))


Files ready: ['flight_prices_clean.parquet', 'route_geo_prices.csv', 'us_airports.csv', 'route_geo_prices.parquet', 'flight_prices_clean.csv', 'coordinate_routes.csv', 'flightprices.csvn2ed7meb.part', 'flightprices.csv']


In [13]:
# Sanity check: display what is currently present in the local data folder.
import os

data_dir_entries = os.listdir("data")
data_dir_entries

['flight_prices_clean.parquet',
 'route_geo_prices.csv',
 'us_airports.csv',
 'route_geo_prices.parquet',
 'flight_prices_clean.csv',
 'coordinate_routes.csv',
 'flightprices.csvn2ed7meb.part',
 'flightprices.csv']

In [14]:
# ==================================================
# 4) Airports data — only the columns we need
# ==================================================
airports = (
    pd.read_csv(AIRPORT_PATH, usecols=["IATA", "LATITUDE", "LONGITUDE"])
    .rename(columns={"IATA": "airport", "LATITUDE": "lat", "LONGITUDE": "lon"})
    # Drop incomplete rows BEFORE casting the code to str: astype(str) turns a
    # missing value into the text "nan" ("NAN" once upper-cased), which would
    # otherwise slip past dropna and end up as a bogus airport code.
    .dropna(subset=["airport", "lat", "lon"])
    .assign(airport=lambda frame: frame["airport"].astype(str).str.strip().str.upper())
    # Keep a single coordinate pair per airport code (first occurrence wins).
    .drop_duplicates("airport")
)

# ==================================================
# 5) Flight prices data — only the columns we need
# ==================================================
flight_cols = ["startingAirport", "destinationAirport", "totalFare", "searchDate", "flightDate"]
# NOTE(review): this file is not fetched by the download cell (which saves
# "data/flight_prices_clean.csv") — confirm it is provided some other way.
INPUT_CSV = "data/flightprices.csv"
OUTPUT_CSV = "data/coordinate_routes.csv"
CHUNK_SIZE = 300_000  # rows per chunk

# Start from a clean slate: chunks are appended below, so a leftover file
# from a previous run would otherwise be extended instead of replaced.
if os.path.exists(OUTPUT_CSV):
    os.remove(OUTPUT_CSV)

# Stream the input in chunks and append only the selected columns to the
# output file. The context manager closes the input file handle even if the
# loop is interrupted.
with pd.read_csv(INPUT_CSV, usecols=flight_cols, chunksize=CHUNK_SIZE, low_memory=False) as reader:
    for i, chunk in enumerate(reader):
        # Only the first chunk writes the header row.
        chunk.to_csv(OUTPUT_CSV, index=False, mode="a", header=(i == 0))
        # Log progress for every chunk processed.
        print(f"Chunk {i+1} is writen: {chunk.shape}")
print("New optimized CSV created:", OUTPUT_CSV)

Chunk 1 is writen: (300000, 5)
Chunk 2 is writen: (300000, 5)
Chunk 3 is writen: (300000, 5)
Chunk 4 is writen: (300000, 5)
Chunk 5 is writen: (300000, 5)
Chunk 6 is writen: (300000, 5)
Chunk 7 is writen: (300000, 5)
Chunk 8 is writen: (300000, 5)
Chunk 9 is writen: (300000, 5)
Chunk 10 is writen: (300000, 5)
Chunk 11 is writen: (300000, 5)
Chunk 12 is writen: (300000, 5)
Chunk 13 is writen: (300000, 5)
Chunk 14 is writen: (300000, 5)
Chunk 15 is writen: (300000, 5)
Chunk 16 is writen: (300000, 5)
Chunk 17 is writen: (300000, 5)
Chunk 18 is writen: (300000, 5)
Chunk 19 is writen: (300000, 5)
Chunk 20 is writen: (300000, 5)
Chunk 21 is writen: (300000, 5)
Chunk 22 is writen: (300000, 5)
Chunk 23 is writen: (300000, 5)
Chunk 24 is writen: (300000, 5)
Chunk 25 is writen: (300000, 5)
Chunk 26 is writen: (300000, 5)
Chunk 27 is writen: (300000, 5)
Chunk 28 is writen: (300000, 5)
Chunk 29 is writen: (300000, 5)
Chunk 30 is writen: (300000, 5)
Chunk 31 is writen: (300000, 5)
Chunk 32 is write

In [15]:
# Load the column-reduced flight table written by the chunked export and
# report its (rows, columns) shape.
flight_df = pd.read_csv(
    "data/coordinate_routes.csv",
    low_memory=False,
)
print(tuple(flight_df.shape))

(11774933, 5)


In [16]:
# Bring both airport-code columns into the same form as airports["airport"]
# (string, trimmed, upper-case) so the joins below compare like with like.
# NOTE: astype(str) also stringifies missing values (NaN becomes "NAN").
for code_col in ("startingAirport", "destinationAirport"):
    flight_df[code_col] = flight_df[code_col].astype(str).str.strip().str.upper()

In [17]:
def add_airport_coords(df, airports, code_col, prefix):
    """Left-join airport coordinates onto ``df`` for one route endpoint.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the airport-code column ``code_col``.
    airports : pandas.DataFrame
        Lookup table with the columns ``airport``, ``lat`` and ``lon``.
    code_col : str
        Name of the column in ``df`` that holds the airport code.
    prefix : str
        Prefix for the new columns, which are named ``<prefix>_lat`` and
        ``<prefix>_lon``.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``df`` plus the two coordinate columns;
        codes without a match in ``airports`` get NaN coordinates.

    Raises
    ------
    pandas.errors.MergeError
        If ``airports`` has more than one row for an airport code, which
        would silently duplicate rows of ``df``.
    """
    lat_col, lon_col = f"{prefix}_lat", f"{prefix}_lon"
    coords = airports[["airport", "lat", "lon"]].rename(
        columns={"airport": code_col, "lat": lat_col, "lon": lon_col}
    )
    return (
        # Drop coordinate columns left over from a previous run of this cell,
        # so re-running does not produce duplicate column names.
        df.drop(columns=[lat_col, lon_col], errors="ignore")
        .merge(coords, on=code_col, how="left", validate="many_to_one")
    )


# ==================================================
# 6) Join #1 — startingAirport coordinates
# ==================================================
flight_df = add_airport_coords(flight_df, airports, "startingAirport", "start")

# ==================================================
# 7) Join #2 — destinationAirport coordinates
# ==================================================
flight_df = add_airport_coords(flight_df, airports, "destinationAirport", "dest")

# ==================================================
# 8) Drop rows with missing coordinates (clean input for Looker)
# ==================================================
flight_df = flight_df.dropna(subset=["start_lat", "start_lon", "dest_lat", "dest_lon"])

In [18]:
# ==================================================
# 9) Route-level aggregate (ideal table shape for Looker Studio)
# ==================================================
# One output row per (origin, destination) pair; the coordinates are part of
# the grouping key so they are carried through to the result.
route_keys = [
    "startingAirport",
    "destinationAirport",
    "start_lat",
    "start_lon",
    "dest_lat",
    "dest_lon",
]

# Named aggregations over totalFare: row count, median and mean.
fare_stats = {
    "flight_count": ("totalFare", "size"),
    "median_price": ("totalFare", "median"),
    "avg_price": ("totalFare", "mean"),
}

route_geo = (
    flight_df
    .groupby(route_keys, as_index=False)
    .agg(**fare_stats)
    # Human-readable route label, e.g. "ATL-BOS".
    .assign(route=lambda routes: routes["startingAirport"] + "-" + routes["destinationAirport"])
)

route_geo.head()

Unnamed: 0,startingAirport,destinationAirport,start_lat,start_lon,dest_lat,dest_lon,flight_count,median_price,avg_price,route
0,ATL,BOS,33.640444,-84.426944,42.364348,-71.005179,67815,308.59,325.157262,ATL-BOS
1,ATL,CLT,33.640444,-84.426944,35.214011,-80.943126,37417,339.1,322.12807,ATL-CLT
2,ATL,DEN,33.640444,-84.426944,39.858408,-104.667002,45813,428.6,433.128975,ATL-DEN
3,ATL,DFW,33.640444,-84.426944,32.895951,-97.0372,41610,271.6,295.187255,ATL-DFW
4,ATL,DTW,33.640444,-84.426944,42.212059,-83.348836,52270,297.61,315.294873,ATL-DTW


In [19]:
# ==================================================
# 10) Save
# ==================================================
# Write the route-level table without the index column and report how many
# routes were exported.
OUT_CSV = "out/route_geo_prices.csv"
route_geo.to_csv(path_or_buf=OUT_CSV, index=False)

total_routes = len(route_geo)
print("✅ Done. Total routes:", total_routes)
print("Saved:", OUT_CSV)

✅ Done. Total routes: 235
Saved: out/route_geo_prices.csv


In [21]:
# Download the exported CSV to the local machine when running on Google Colab.
# The google.colab package only exists on Colab; in any other environment the
# import is skipped and the file simply stays on disk, so the notebook can
# still be run top to bottom.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; file left at:", "out/route_geo_prices.csv")
else:
    files.download("out/route_geo_prices.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>