# Flightdata Preprocessor

In [1]:
import pandas as pd
from datetime import datetime
from datetime import timedelta
import logging
import matplotlib.pyplot as plt
import sys

In [352]:
sys.path.append('./')

In [351]:
df = pd.read_csv('../assets/data.txt', sep="\t", encoding="utf-8")
del df['nix']

In [171]:
df.head()

Unnamed: 0,Flight,Date,Origin,Destination,Equipment,Flight Time,ETD,ATD,ETA,ATA
0,LH922,18-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AING),1:09,21:30,21:42,22:10,Landed 21:50
1,LH922,17-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AINL),1:03,21:30,22:04,22:10,Landed 22:08
2,LH922,16-Oct-20,Frankfurt (FRA),London (LHR),A320 (D-AIUY),1:07,21:30,21:41,22:10,Landed 21:47
3,LH922,15-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AIJB),1:08,21:30,21:39,22:10,Landed 21:48
4,LH922,14-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AINJ),1:06,21:30,21:34,22:10,Landed 21:40


In [172]:
def clean_data(df):
    logging.warning(f'Received df of len {len(df)}...')
    for _ in ['Origin', 'Destination', 'Equipment']:
        df = df[df[_].str.contains("\)")]
    for _ in ['Flight Time', 'ETD', 'ATD', 'ETA', 'ATA']:
        df = df[~(df[_].str.contains("-")) & ~(df[_].str.contains("—"))]
    df = df[(df['ATA'].str.contains("Landed"))]
    df['ATA'] = df['ATA'].str.replace('Landed ','')
    df['ATA'] = df['ATA'].str.replace(u'Landed\xa0','')
    logging.warning(f'Returned df of len {len(df)}...')
    return df


In [173]:
_df = clean_data(df)



In [174]:
def clean_codes(df):
    _df = df.copy()
    for _ in ['Origin', 'Destination']:
        _df[[f'{_}_City', f'{_}_Code']] = _df[_].str.split(u"\xa0\(", n = 1, expand = True)
    _df[['Equipment_Type', 'Equipment_Reg']] = _df['Equipment'].str.split(u"\xa0\(", n = 1, expand = True)
    for _ in ['Equipment_Reg', 'Origin_Code', 'Destination_Code']:
        _df[_] = _df[_].str.replace(")", "")
    return _df[['Flight', 'Date', 'Equipment_Type', 'Equipment_Reg', 'Origin_City', 'Origin_Code', 'Destination_City', 'Destination_Code', 'Flight Time', 'ETD', 'ETA', 'ATD', 'ATA']]

In [175]:
_df = clean_codes(_df)

In [201]:
def get_carrier_code(df):
    _df = df.copy()
    carrier = {}
    carrier_list = []
    for index, row in _df.iterrows():
        if row['Flight'][:2] not in carrier.keys():
            carrier[row['Flight'][:2]] = ""
        carrier_list.append(row['Flight'][:2])
    return carrier, carrier_list

In [202]:
carrier, carrier_list = get_carrier_code(_df)

In [214]:
def set_carrier_info(df, carrier, carrier_class, carrier_list):
    _df = df.copy()
    _df['Carrier_Name'] = carrier_list
    _df['Carrier_Name'] = _df['Carrier_Name'].map(carrier)
    _df['Carrier_Class'] = _df['Carrier_Name'].map(carrier_class)
    return _df

In [218]:
_df = set_carrier_info(_df, carrier, carrier_class, carrier_list)

In [208]:
carrier = {
    'LH': 'Lufthansa',
    'BA': 'British Airways',
    'LO': 'LOT',
    'SK': 'SAS',
    'UX': 'Air Europa',
    'IB': 'Iberia',
    'KL': 'KLM Royal Dutch Airlines',
    'DY': 'Norwegian',
    'ET': 'Ethiopian',
    'FR': 'Ryanair',
    'HV': 'Transavia',
    'VY': 'Vueling',
    'EI': 'Aer Lingus',
    'U2': 'EasyJet'
    }

In [217]:
carrier_class = {
    'Lufthansa': 0,
    'British Airways': 0,
    'LOT': 0,
    'SAS': 0,
    'Air Europa': 0,
    'Iberia': 0,
    'KLM Royal Dutch Airlines': 0,
    'Norwegian': 1,
    'Ethiopian': 0,
    'Ryanair': 1,
    'Transavia': 1,
    'Vueling': 1,
    'Aer Lingus': 0,
    'EasyJet': 1
    }

In [227]:
def detect_dates(df):
    _df = df.copy()
    date_list = []
    for index, row in _df.iterrows():
        date_list.append(datetime.strptime(row['Date'], '%d-%b-%y').date().strftime('%Y-%m-%d'))
        
    _df['Date'] = date_list
    return _df

In [228]:
_df = detect_dates(_df)

In [324]:
def get_time_delta(df):
    _df = df.copy()
    deltas = []
    for index, row in _df.iterrows():
        tdelta = datetime.strptime(row['ATA'], '%H:%M') - datetime.strptime(row['ETA'], '%H:%M')
        #if tdelta.days < 0:
        #tdelta = timedelta(seconds = tdelta.seconds, microseconds = tdelta.microseconds)
        if datetime.strptime(row['ATA'], '%H:%M') < datetime.strptime(row['ETA'], '%H:%M'):
            deltas.append(-1 * int((datetime.strptime('00:00', '%H:%M') - tdelta).time().minute))
        else:
            deltas.append(int((datetime.strptime('00:00', '%H:%M') + tdelta).time().minute))

    _df['Timedelta'] = deltas
    return _df

In [325]:
_df = get_time_delta(_df)

In [333]:
def get_departure_groups(df):
    _df = df.copy()
    dep_groups = []
    for index, row in _df.iterrows():
        if datetime.strptime(row['ETD'], '%H:%M') <= datetime.strptime('07:30', '%H:%M'):
            dep_groups.append('Red Eye')
        elif datetime.strptime(row['ETD'], '%H:%M') <= datetime.strptime('12:00', '%H:%M'):
            dep_groups.append('Morning')
        elif datetime.strptime(row['ETD'], '%H:%M') <= datetime.strptime('18:00', '%H:%M'):
            dep_groups.append('Afternoon')
        elif datetime.strptime(row['ETD'], '%H:%M') <= datetime.strptime('21:30', '%H:%M'):
            dep_groups.append('Evening')
        elif datetime.strptime(row['ETD'], '%H:%M') <= datetime.strptime('23:59', '%H:%M'):
            dep_groups.append('Night')
        else:
            dep_groups.append(None)
            logging.warning('invalid time?')
    _df['ETD_Group'] = dep_groups
    return _df

In [335]:
_df = get_departure_groups(_df)

In [337]:
_df.to_csv('../assets/data_prep.txt', sep="\t", encoding="utf-8", index=False)

In [3]:
df = pd.read_csv('../assets/data_prep.txt', sep="\t", encoding="utf-8")

In [None]:
def vote_majority(df):
    _df = df.copy()
    return _df

In [11]:
grp = df.groupby(['Flight', 'Origin_City', 'Destination_City'])

In [21]:
for key, values in grp.items():
    print(key)

AttributeError: 'DataFrameGroupBy' object has no attribute 'items'

In [27]:
grp.count().duplicated()

Flight  Origin_City  Destination_City
BA423   Amsterdam    London              False
BA430   London       Amsterdam           False
BA431   Amsterdam    London              False
BA440   London       Amsterdam           False
BA458   London       Madrid              False
                                         ...  
VY8300  Barcelona    Amsterdam            True
VY8301  Amsterdam    Barcelona            True
VY8720  Barcelona    Dublin               True
VY8721  Calgary      Barcelona            True
        Dublin       Barcelona            True
Length: 291, dtype: bool

In [30]:
grpx = grp.count()

In [43]:
group_test = grpx.reset_index()

In [44]:
group_test

Unnamed: 0,Flight,Origin_City,Destination_City,Date,Equipment_Type,Equipment_Reg,Origin_Code,Destination_Code,Flight Time,ETD,ETA,ATD,ATA,Carrier_Name,Carrier_Class,Timedelta,ETD_Group
0,BA423,Amsterdam,London,834,834,834,834,834,834,834,834,834,834,834,834,834,834
1,BA430,London,Amsterdam,973,973,973,973,973,973,973,973,973,973,973,973,973,973
2,BA431,Amsterdam,London,986,986,986,986,986,986,986,986,986,986,986,986,986,986
3,BA440,London,Amsterdam,913,913,913,913,913,913,913,913,913,913,913,913,913,913
4,BA458,London,Madrid,914,914,914,914,914,914,914,914,914,914,914,914,914,914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,VY8300,Barcelona,Amsterdam,917,917,917,917,917,917,917,917,917,917,917,917,917,917
287,VY8301,Amsterdam,Barcelona,918,918,918,918,918,918,918,918,918,918,918,918,918,918
288,VY8720,Barcelona,Dublin,689,689,689,689,689,689,689,689,689,689,689,689,689,689
289,VY8721,Calgary,Barcelona,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [None]:
for index, row in grpx.iterrows():
    