In [167]:
import logging
import re
from datetime import datetime

import pandas as pd

In [169]:
# Load the tab-separated export and discard its 'nix' column in one chained step.
df = pd.read_csv('../assets/data.txt', sep="\t", encoding="utf-8").drop(columns=['nix'])

In [171]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Flight,Date,Origin,Destination,Equipment,Flight Time,ETD,ATD,ETA,ATA
0,LH922,18-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AING),1:09,21:30,21:42,22:10,Landed 21:50
1,LH922,17-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AINL),1:03,21:30,22:04,22:10,Landed 22:08
2,LH922,16-Oct-20,Frankfurt (FRA),London (LHR),A320 (D-AIUY),1:07,21:30,21:41,22:10,Landed 21:47
3,LH922,15-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AIJB),1:08,21:30,21:39,22:10,Landed 21:48
4,LH922,14-Oct-20,Frankfurt (FRA),London (LHR),A20N (D-AINJ),1:06,21:30,21:34,22:10,Landed 21:40


In [172]:
def clean_data(df):
    """Drop incomplete rows and strip the ``Landed`` prefix from ``ATA``.

    A row is kept only when all of the following hold:
      * ``Origin``, ``Destination`` and ``Equipment`` each contain a ``)``;
      * none of ``Flight Time``, ``ETD``, ``ATD``, ``ETA``, ``ATA`` is missing
        or contains a hyphen or an em dash;
      * ``ATA`` contains the word ``Landed``.

    Args:
        df: DataFrame holding the string columns listed above.

    Returns:
        A new DataFrame (original index labels preserved) whose ``ATA`` values
        no longer carry ``Landed`` followed by a regular or non-breaking
        space. The input frame is not modified.
    """
    logging.warning(f'Received df of len {len(df)}...')
    for column in ['Origin', 'Destination', 'Equipment']:
        # Literal match, so ")" needs no regex escaping; na=False drops
        # missing values instead of producing a NaN mask that cannot index.
        df = df[df[column].str.contains(")", regex=False, na=False)]
    for column in ['Flight Time', 'ETD', 'ATD', 'ETA', 'ATA']:
        values = df[column].str
        # na=True: a missing time is as unusable as a dash placeholder.
        is_placeholder = (
            values.contains("-", regex=False, na=True)
            | values.contains("—", regex=False, na=True)
        )
        df = df[~is_placeholder]
    # Explicit copy so the assignment below never writes into a slice of the
    # caller's frame (avoids SettingWithCopyWarning).
    df = df[df['ATA'].str.contains("Landed", regex=False, na=False)].copy()
    df['ATA'] = (
        df['ATA']
        .str.replace('Landed ', '', regex=False)
        .str.replace('Landed\xa0', '', regex=False)
    )
    logging.warning(f'Returned df of len {len(df)}...')
    return df


In [173]:
# Filter the raw frame; the result is bound to `_df`, `df` itself is not reassigned.
_df = clean_data(df)



In [174]:
def clean_codes(df):
    """Split the combined ``Origin``, ``Destination`` and ``Equipment`` columns.

    Each of those columns is split once on a non-breaking space followed by
    ``(``. The left part becomes ``<col>_City`` (``Equipment_Type`` for the
    equipment column) and the right part, with every ``)`` removed, becomes
    ``<col>_Code`` (``Equipment_Reg`` for the equipment column).

    Args:
        df: DataFrame containing ``Flight``, ``Date``, ``Origin``,
            ``Destination``, ``Equipment``, ``Flight Time``, ``ETD``, ``ETA``,
            ``ATD`` and ``ATA``.

    Returns:
        A new DataFrame restricted to the flight, date, split-out and time
        columns, in a fixed order. The input frame is not modified.
    """
    _df = df.copy()
    # Regex pattern: non-breaking space, then an escaped literal "(".
    # The backslash is doubled so Python does not see an invalid "\(" escape.
    separator = "\xa0\\("
    for column in ['Origin', 'Destination']:
        _df[[f'{column}_City', f'{column}_Code']] = _df[column].str.split(separator, n=1, expand=True)
    _df[['Equipment_Type', 'Equipment_Reg']] = _df['Equipment'].str.split(separator, n=1, expand=True)
    for column in ['Equipment_Reg', 'Origin_Code', 'Destination_Code']:
        # regex=False: ")" is meant literally, independent of the pandas default.
        _df[column] = _df[column].str.replace(")", "", regex=False)
    return _df[['Flight', 'Date', 'Equipment_Type', 'Equipment_Reg', 'Origin_City', 'Origin_Code', 'Destination_City', 'Destination_Code', 'Flight Time', 'ETD', 'ETA', 'ATD', 'ATA']]

In [175]:
# Split the combined columns and keep only the column subset clean_codes returns.
# NOTE: rebinding `_df` makes this cell non-repeatable - the returned frame no
# longer has the 'Origin'/'Destination'/'Equipment' columns clean_codes reads.
_df = clean_codes(_df)

In [201]:
def get_carrier_code(df):
    """Collect the two-character prefixes of the ``Flight`` column.

    Args:
        df: DataFrame with a ``Flight`` column of strings.

    Returns:
        tuple: ``(carrier, carrier_list)`` where

        * ``carrier`` is a dict whose keys are the distinct prefixes in order
          of first appearance, each mapped to an empty-string placeholder;
        * ``carrier_list`` is the prefix of every row, in row order.
    """
    # Iterating the column directly avoids copying the whole frame and the
    # per-row Series construction of iterrows(); the result is the same.
    carrier_list = [flight[:2] for flight in df['Flight']]
    # dict.fromkeys keeps first-seen order and de-duplicates.
    carrier = dict.fromkeys(carrier_list, "")
    return carrier, carrier_list

In [202]:
# Build the placeholder prefix dict and the per-row prefix list from the Flight column.
carrier, carrier_list = get_carrier_code(_df)

In [214]:
def set_carrier_info(df, carrier, carrier_class, carrier_list):
    """Return a copy of ``df`` with ``Carrier_Name`` and ``Carrier_Class`` columns.

    Args:
        df: Source DataFrame; it is not modified.
        carrier: Mapping applied to the entries of ``carrier_list`` to obtain
            ``Carrier_Name``.
        carrier_class: Mapping applied to ``Carrier_Name`` to obtain
            ``Carrier_Class``.
        carrier_list: One entry per row of ``df``.

    Returns:
        A new DataFrame with the two columns set; values without a mapping
        entry become NaN.
    """
    with_codes = df.assign(Carrier_Name=carrier_list)
    names = with_codes['Carrier_Name'].map(carrier)
    return with_codes.assign(
        Carrier_Name=names,
        Carrier_Class=names.map(carrier_class),
    )

In [218]:
# NOTE(review): this call needs `carrier` (prefix -> name) and `carrier_class`
# (name -> class) as defined in cells further down this notebook. Read top to
# bottom, `carrier_class` is not yet defined here and `carrier` is still the
# placeholder dict returned by get_carrier_code - run those cells first.
_df = set_carrier_info(_df, carrier, carrier_class, carrier_list)

In [220]:
# Preview the frame after the carrier columns were added.
_df.head()

Unnamed: 0,Flight,Date,Equipment_Type,Equipment_Reg,Origin_City,Origin_Code,Destination_City,Destination_Code,Flight Time,ETD,ETA,ATD,ATA,Carrier_Name,Carrier_Class
0,LH922,18-Oct-20,A20N,D-AING,Frankfurt,FRA,London,LHR,1:09,21:30,22:10,21:42,21:50,Lufthansa,0
1,LH922,17-Oct-20,A20N,D-AINL,Frankfurt,FRA,London,LHR,1:03,21:30,22:10,22:04,22:08,Lufthansa,0
2,LH922,16-Oct-20,A320,D-AIUY,Frankfurt,FRA,London,LHR,1:07,21:30,22:10,21:41,21:47,Lufthansa,0
3,LH922,15-Oct-20,A20N,D-AIJB,Frankfurt,FRA,London,LHR,1:08,21:30,22:10,21:39,21:48,Lufthansa,0
4,LH922,14-Oct-20,A20N,D-AINJ,Frankfurt,FRA,London,LHR,1:06,21:30,22:10,21:34,21:40,Lufthansa,0


In [208]:
# Two-character flight-number prefix -> airline name.
# Rebinds `carrier`, replacing the placeholder dict returned by get_carrier_code.
carrier = {
    'LH': 'Lufthansa',
    'BA': 'British Airways',
    'LO': 'LOT',
    'SK': 'SAS',
    'UX': 'Air Europa',
    'IB': 'Iberia',
    'KL': 'KLM Royal Dutch Airlines',
    'DY': 'Norwegian',
    'ET': 'Ethiopian',
    'FR': 'Ryanair',
    'HV': 'Transavia',
    'VY': 'Vueling',
    'EI': 'Aer Lingus',
    'U2': 'EasyJet'
    }

In [217]:
# Airline name -> integer class; set_carrier_info maps it onto Carrier_Class.
# NOTE(review): 1 appears to mark low-cost carriers and 0 all others - confirm.
carrier_class = {
    'Lufthansa': 0,
    'British Airways': 0,
    'LOT': 0,
    'SAS': 0,
    'Air Europa': 0,
    'Iberia': 0,
    'KLM Royal Dutch Airlines': 0,
    'Norwegian': 1,
    'Ethiopian': 0,
    'Ryanair': 1,
    'Transavia': 1,
    'Vueling': 1,
    'Aer Lingus': 0,
    'EasyJet': 1
    }

In [None]:
def separate_number_chars(s):
    """Split a string into its numeric and non-numeric tokens.

    Numbers are optionally signed decimals (``-1.5``) or integers (``+3``).
    Everything between two numbers is returned as one token.

    Args:
        s: The string to tokenise; surrounding whitespace is ignored.

    Returns:
        list[str]: The tokens in order of appearance, each stripped of
        surrounding whitespace, with empty tokens removed.
    """
    # Raw string so "\d" and "\." reach the regex engine instead of being
    # treated as (invalid) Python string escapes.
    pieces = re.split(r'([-+]?\d+\.\d+)|([-+]?\d+)', s.strip())
    # re.split yields None for whichever capture group did not participate.
    return [piece.strip() for piece in pieces if piece is not None and piece.strip() != '']

In [227]:
def detect_dates(df):
    """Return a copy of ``df`` whose ``Date`` strings are rewritten as ``YYYY-MM-DD``.

    Args:
        df: DataFrame with a ``Date`` column of strings in ``%d-%b-%y`` form,
            e.g. ``18-Oct-20``. It is not modified.

    Returns:
        A new DataFrame with ``Date`` holding the reformatted strings.

    Raises:
        ValueError: If a value does not match the ``%d-%b-%y`` format.
    """
    iso_dates = [
        datetime.strptime(raw, '%d-%b-%y').strftime('%Y-%m-%d')
        for raw in df['Date']
    ]
    result = df.copy()
    result['Date'] = iso_dates
    return result

In [228]:
# Rewrite the Date column as YYYY-MM-DD strings.
_df = detect_dates(_df)

In [241]:
# Scratch check: subtracting two parsed times yields a datetime.timedelta.
datetime.strptime('11:18', '%H:%M') - datetime.strptime('1:18', '%H:%M')

AttributeError: 'datetime.timedelta' object has no attribute 'strftime'

In [254]:
from datetime import timedelta

In [324]:
def get_time_delta(df):
    """Add a ``Timedelta`` column: actual minus scheduled arrival, in minutes.

    Positive values mean the flight arrived after ``ETA``, negative values
    mean it arrived early.

    ``ATA`` and ``ETA`` carry only a time of day, so an arrival that crosses
    midnight (ETA ``23:50``, ATA ``00:10``) cannot be told apart from a
    near-24-hour deviation. The difference is therefore wrapped into the
    range -12 h .. +12 h, i.e. the smaller deviation is assumed.

    Args:
        df: DataFrame with ``ATA`` and ``ETA`` columns holding ``%H:%M``
            strings. It is not modified.

    Returns:
        A new DataFrame with an integer ``Timedelta`` column appended.

    Raises:
        ValueError: If a value does not match the ``%H:%M`` format.
    """
    minutes_per_day = 24 * 60
    half_day = minutes_per_day // 2

    _df = df.copy()
    deltas = []
    for actual, scheduled in zip(_df['ATA'], _df['ETA']):
        tdelta = datetime.strptime(actual, '%H:%M') - datetime.strptime(scheduled, '%H:%M')
        # total_seconds() keeps the hour part; reading only the minute field
        # of the difference would turn an 80-minute delay into 20.
        minutes = int(tdelta.total_seconds() // 60)
        # Undo the day wrap for arrivals on the other side of midnight.
        if minutes > half_day:
            minutes -= minutes_per_day
        elif minutes < -half_day:
            minutes += minutes_per_day
        deltas.append(minutes)

    _df['Timedelta'] = deltas
    return _df

In [325]:
# Append the Timedelta column derived from ATA and ETA.
_df = get_time_delta(_df)

In [296]:
# Scratch check: the time-of-day part of a parsed '00:00' is datetime.time(0, 0).
datetime.strptime('00:00', '%H:%M').time()

datetime.time(0, 0)

In [326]:
# Display the resulting frame (the notebook elides the middle rows).
_df

Unnamed: 0,Flight,Date,Equipment_Type,Equipment_Reg,Origin_City,Origin_Code,Destination_City,Destination_Code,Flight Time,ETD,ETA,ATD,ATA,Carrier_Name,Carrier_Class,Timedelta
0,LH922,2020-10-18,A20N,D-AING,Frankfurt,FRA,London,LHR,1:09,21:30,22:10,21:42,21:50,Lufthansa,0,-20
1,LH922,2020-10-17,A20N,D-AINL,Frankfurt,FRA,London,LHR,1:03,21:30,22:10,22:04,22:08,Lufthansa,0,-2
2,LH922,2020-10-16,A320,D-AIUY,Frankfurt,FRA,London,LHR,1:07,21:30,22:10,21:41,21:47,Lufthansa,0,-23
3,LH922,2020-10-15,A20N,D-AIJB,Frankfurt,FRA,London,LHR,1:08,21:30,22:10,21:39,21:48,Lufthansa,0,-22
4,LH922,2020-10-14,A20N,D-AINJ,Frankfurt,FRA,London,LHR,1:06,21:30,22:10,21:34,21:40,Lufthansa,0,-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186701,DY4251,2017-11-25,B738,LN-NID,Stockholm,ARN,Barcelona,BCN,3:37,6:30,9:55,6:58,10:36,Norwegian,1,41
186702,DY4251,2017-11-18,B738,LN-NHB,Stockholm,ARN,Barcelona,BCN,3:08,6:30,9:55,6:33,09:41,Norwegian,1,-14
186703,DY4251,2017-11-11,B738,LN-NHF,Stockholm,ARN,Barcelona,BCN,2:54,6:30,9:55,6:56,09:51,Norwegian,1,-4
186704,DY4251,2017-11-04,B738,LN-NHD,Stockholm,ARN,Barcelona,BCN,3:29,6:30,9:55,6:40,10:10,Norwegian,1,15
