# ✈️ Flight Prices Cleaning Notebook

This notebook reads a CSV containing flight price data, cleans the 'price' column, converts durations to hours, filters out long and expensive flights, and sorts the data by price.

In [76]:
import re
import pandas as pd
import numpy as np

def duration_to_hours(duration_str):
    """Convert a textual duration such as ``'6 hr 30 min'`` to decimal hours.

    Parameters
    ----------
    duration_str : object
        Value to parse. Only ``str`` values are parsed; the first
        ``<digits> hr`` and the first ``<digits> min`` found in the text
        are used. Either component may be missing.

    Returns
    -------
    float or None
        Total hours rounded to two decimals, or ``None`` when the value is
        not a string or contains neither an ``hr`` nor a ``min`` component.
    """
    if not isinstance(duration_str, str):
        return None

    hr_match = re.search(r'(\d+)\s*hr', duration_str)
    min_match = re.search(r'(\d+)\s*min', duration_str)

    # Text without any recognisable component is "unknown", not "zero hours":
    # returning None lets a later dropna() discard the row instead of keeping
    # a bogus 0.0 duration (which would also break any division by duration).
    if hr_match is None and min_match is None:
        return None

    hours = int(hr_match.group(1)) if hr_match else 0
    minutes = int(min_match.group(1)) if min_match else 0

    # Guard: if minutes >= 60, carry the whole hours over
    carried_hours, minutes = divmod(minutes, 60)
    hours += carried_hours

    return round(hours + minutes / 60, 2)

## 📖 Read and clean the CSV file

In [77]:
# Replace with your actual file path if needed.
# Forward slashes are used so the same path works on Windows, macOS and Linux.
file_path = 'results/1747686091.5883362/results.csv'
df = pd.read_csv(file_path)

# Remove commas (thousands separators), convert to numeric, turn errors into NaN
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
# Drop rows with NaN values in 'price' column.
# .copy() makes the result an independent frame, so the column assignments
# below cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['price']).copy()

# Apply the duration_to_hours function to create a new column
df['duration_hours'] = df['duration'].apply(duration_to_hours)
# Drop rows with NaN values in 'duration_hours' column
df = df.dropna(subset=['duration_hours']).copy()

# Add a new column 'days': whole days between the outbound and inbound dates
df['outbound'] = pd.to_datetime(df['outbound'])
df['inbound'] = pd.to_datetime(df['inbound'])
df['days'] = (df['inbound'] - df['outbound']).dt.days

# Remove rows whose 'to' column contains 'RHO' (missing values are kept).
# NOTE(review): only 'to' is filtered here; add the same test on 'from' if
# departures from RHO should be excluded as well.
df = df[~df['to'].str.contains('RHO', na=False)].copy()

## 🔍 Keep only flights shorter than X hours

In [78]:
# Keep only rows whose total travel time is strictly below 7 hours
df = df.loc[df['duration_hours'].lt(7)]

## 🔍 Keep only flights cheaper than X euro

In [79]:
# Keep only rows whose price is strictly below 1000
df = df.loc[df['price'].lt(1000)]

## 📊 Sort by price ascending

In [80]:
# Cheapest first, then renumber the rows 0..n-1 (the old index is discarded)
df = df.sort_values('price').reset_index(drop=True)

# Final result
df

Unnamed: 0,timestamp,from,to,outbound,inbound,price,airline,time,duration,type,duration_hours,days
0,1.747686e+09,BLQ,AYT,2025-08-08,2025-08-23,428.0,"SWISS, SunExpressOperated by Helvetic",3:15PM 10:45PM,6 hr 30 min,1 stop,6.50,15
1,1.747686e+09,VCE,AYT,2025-08-07,2025-08-22,436.0,"SWISS, SunExpressOperated by Helvetic",2:50PM 10:45PM,6 hr 55 min,1 stop,6.92,15
2,1.747686e+09,BLQ,AYT,2025-08-08,2025-08-23,440.0,3 hr 5 min VIE,1:25PM 8:55PM,6 hr 30 min,1 stop,6.50,15
3,1.747686e+09,BLQ,AYT,2025-08-07,2025-08-22,440.0,3 hr 5 min VIE,9:50AM 5:20PM,6 hr 30 min,1 stop,6.50,15
4,1.747686e+09,BLQ,AYT,2025-08-07,2025-08-22,448.0,"Air Dolomiti, SunExpressLufthansa",1:25PM 8:55PM,6 hr 30 min,1 stop,6.50,15
...,...,...,...,...,...,...,...,...,...,...,...,...
156,1.747686e+09,VCE,BJV,2025-08-07,2025-08-22,977.0,1 hr 20 min IST,9:20AM 4:15PM,5 hr 55 min,1 stop,5.92,15
157,1.747686e+09,FCO,GZP,2025-08-09,2025-08-24,980.0,Turkish Airlines,7:00AM 2:55PM,6 hr 55 min,1 stop,6.92,15
158,1.747686e+09,FCO,DLM,2025-08-09,2025-08-24,982.0,Pegasus,7:00AM 1:35PM,5 hr 35 min,1 stop,5.58,15
159,1.747686e+09,NAP,BJV,2025-08-07,2025-08-22,989.0,"FCO, SAW",9:20AM 4:15PM,5 hr 55 min,1 stop,5.92,15


## 📊  Find the cheapest flight per route

In [81]:
# For every (from, to) pair, idxmin() yields the index label of its
# lowest-priced row; .loc then pulls those full rows out of df.
best_per_route = df.loc[
    df.groupby(['from', 'to'])['price'].idxmin()
]
best_per_route

Unnamed: 0,timestamp,from,to,outbound,inbound,price,airline,time,duration,type,duration_hours,days
56,1747687000.0,BGY,AYT,2025-08-04,2025-08-19,608.0,"AJetOperated by Turkish Airlines, Turkish Airl...",12:45PM 7:50PM,6 hr 5 min,1 stop,6.08,15
46,1747687000.0,BGY,BJV,2025-08-05,2025-08-20,567.0,AJetOperated by BBN Airlines,12:45PM 7:20PM,5 hr 35 min,1 stop,5.58,15
43,1747687000.0,BGY,DLM,2025-08-07,2025-08-22,567.0,AJetOperated by Turkish Airlines,12:45PM 8:35PM,6 hr 50 min,1 stop,6.83,15
0,1747686000.0,BLQ,AYT,2025-08-08,2025-08-23,428.0,"SWISS, SunExpressOperated by Helvetic",3:15PM 10:45PM,6 hr 30 min,1 stop,6.5,15
139,1747686000.0,BLQ,DLM,2025-08-04,2025-08-19,872.0,Turkish Airlines,2:55PM 10:00PM,6 hr 5 min,1 stop,6.08,15
23,1747686000.0,FCO,AYT,2025-08-06,2025-08-21,503.0,AJet,12:30PM 7:00PM,5 hr 30 min,1 stop,5.5,15
24,1747686000.0,FCO,BJV,2025-08-06,2025-08-21,503.0,AJetOperated by Turkish Airlines,12:30PM 7:10PM,5 hr 40 min,1 stop,5.67,15
52,1747686000.0,FCO,DLM,2025-08-08,2025-08-23,591.0,AJetOperated by Turkish Airlines,12:30PM 7:05PM,5 hr 35 min,1 stop,5.58,15
132,1747686000.0,FCO,GZP,2025-08-04,2025-08-19,870.0,3 hr 25 min CPH,7:00AM 2:55PM,6 hr 55 min,1 stop,6.92,15
8,1747686000.0,LIN,AYT,2025-08-06,2025-08-21,463.0,2 hr 55 min FRA,10:40AM 6:25PM,6 hr 45 min,1 stop,6.75,15


## 📊 Rank flights by price per hour of travel

In [82]:
# Price divided by travel time in hours, rounded to two decimals
df['price_per_hour'] = df['price'].div(df['duration_hours']).round(2)

# Lowest price-per-hour first
best_tradeoffs = df.sort_values('price_per_hour')
best_tradeoffs

Unnamed: 0,timestamp,from,to,outbound,inbound,price,airline,time,duration,type,duration_hours,days,price_per_hour
1,1.747686e+09,VCE,AYT,2025-08-07,2025-08-22,436.0,"SWISS, SunExpressOperated by Helvetic",2:50PM 10:45PM,6 hr 55 min,1 stop,6.92,15,63.01
0,1.747686e+09,BLQ,AYT,2025-08-08,2025-08-23,428.0,"SWISS, SunExpressOperated by Helvetic",3:15PM 10:45PM,6 hr 30 min,1 stop,6.50,15,65.85
6,1.747686e+09,NAP,AYT,2025-08-04,2025-08-19,457.0,"SWISS, SunExpressOperated by Air Baltic",2:50PM 10:45PM,6 hr 55 min,1 stop,6.92,15,66.04
5,1.747686e+09,NAP,AYT,2025-08-05,2025-08-20,457.0,"SWISS, SunExpressOperated by Helvetic",2:50PM 10:45PM,6 hr 55 min,1 stop,6.92,15,66.04
3,1.747686e+09,BLQ,AYT,2025-08-07,2025-08-22,440.0,3 hr 5 min VIE,9:50AM 5:20PM,6 hr 30 min,1 stop,6.50,15,67.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,1.747686e+09,VCE,BJV,2025-08-07,2025-08-22,896.0,2 hr IST,2:05PM 8:20PM,5 hr 15 min,1 stop,5.25,15,170.67
158,1.747686e+09,FCO,DLM,2025-08-09,2025-08-24,982.0,Pegasus,7:00AM 1:35PM,5 hr 35 min,1 stop,5.58,15,175.99
155,1.747686e+09,VCE,AYT,2025-08-04,2025-08-19,970.0,Pegasus,2:55PM 9:20PM,5 hr 25 min,1 stop,5.42,15,178.97
160,1.747686e+09,NAP,AYT,2025-08-07,2025-08-22,989.0,"SWISS, SunExpressOperated by Helvetic",9:20AM 3:45PM,5 hr 25 min,1 stop,5.42,15,182.47


## 💾 Save results to CSV file

In [83]:
# Drop the helper column before saving. errors='ignore' keeps this cell
# re-runnable: without it a second run raises KeyError because the column
# is already gone.
df = df.drop(columns=['duration_hours'], errors='ignore')

# The output name is the input path with '_cleaned.csv' appended
# (e.g. '...results.csv_cleaned.csv'); the row index is not written.
df.to_csv(f'{file_path}_cleaned.csv', index=False)
