# Flugpreis Vorhersage - Kaufen oder Warten?
## Projektarbeit Data Mining
___
### Wintersemester 2021/22
### Gruppe G:
Max Grundmann - s0559326
### Inhalte
1. Problemanalyse
2. Explorative Datenanalyse
3. Weitere Features
4. Praktische Überlegungen
___
## 1. Datenvorbereitung

In [193]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta

In [194]:
dirname = os.getcwd()
filename = os.path.join(dirname, '../Data/raw/train_set.csv')

raw_data = pd.read_csv(filename, index_col=0)



In [195]:
def calculate_delta_till_next_flight(df, forward=True):
    temp = df.copy()

    temp = temp.apply(lambda x: x['Flight_Date'] + timedelta(hours=x['Departure_hour']), axis=1)
    temp = pd.DataFrame(temp)
    temp = temp.sort_values(by=0, axis=0, ascending=forward)
    temp = pd.DataFrame(temp[0].unique()).reset_index(drop=True)

    if forward:
        temp['delta'] = temp[0] - temp[0].shift()
        column_name = 'last_departure'
    else:
        temp['delta'] = temp[0].shift() - temp[0]
        column_name = 'next_departure'

    temp['delta'] = temp['delta'].dt.seconds / 60 / 60
    temp['delta'] = temp['delta'].fillna(0)
    temp.rename(columns={ 0: 'Flight_Date_Time', 'delta' :column_name,}, inplace=True)

    df_temp = df.copy()
    df_temp['Flight_Date_Time'] = df_temp.apply(lambda x: x['Flight_Date'] + timedelta(hours=x['Departure_hour']), axis=1)
    
    return pd.merge(left=df_temp, right=temp, how='inner', on='Flight_Date_Time').drop(columns='Flight_Date_Time', axis=1)

In [196]:
def get_price_history(df):
    temp = df.copy()
    temp = temp.sort_values(by='Request_Date', axis=0, ascending=True)
    temp['Price_In_Eur'] = temp['Price_In_Eur'].astype(str)

    price_history = temp.groupby(['flight_unique_id'])['Price_In_Eur'].apply(','.join).reset_index()
    return price_history

In [197]:
def merge_price_history(df):
    prices = get_price_history(df)
    temp = df.copy()
    
    split_to_columns = prices
    split_to_columns['flight_unique_id'] = prices['flight_unique_id']

    return pd.merge(left=temp, right=split_to_columns, how='left', on='flight_unique_id')

In [198]:
def get_previous_requests(df):
    temp = df.copy()
    temp = temp.sort_values(['flight_unique_id', 'Request_Date'])
    unique_flights = temp['flight_unique_id'].unique()

    requests_counter = 0
    flight_id_index = 0
    current_flight = unique_flights[flight_id_index]
    number_of_requests_per_row = []

    for index, row in  temp.iterrows():
        if row['flight_unique_id'] != current_flight:       
            flight_id_index += 1
            current_flight = unique_flights[flight_id_index]
            requests_counter = 0
        number_of_requests_per_row.append(requests_counter)
        requests_counter += 1

    temp['previous_requests'] = number_of_requests_per_row
    return temp

In [199]:
def get_last_n_prices(df, n=10):
    last_n_requests = n
    Prices = []
    temp = df.copy()

    for row in temp.itertuples():
        row_prices = []
        for i in range(row.previous_requests):
            if len(row_prices) >= last_n_requests:
                row_prices.pop(0)    
            row_prices.append(getattr(row, 'Price_In_Eur_y').split(',')[i])
        Prices.append(','.join(row_prices))

    temp['Prices_cut'] = Prices
    split_to_columns = temp['Prices_cut'].str.split(',', expand=True)
    split_to_columns = split_to_columns.apply(pd.to_numeric)
    split_to_columns.columns = split_to_columns.columns.map(str)

    return pd.concat([temp.drop(['Prices_cut', 'Price_In_Eur_y'], 1), split_to_columns], axis=1).rename(columns={'Price_In_Eur_x' : 'Price_In_Eur'})

In [200]:
last_n_requests = 40

def prep_data(data):
    # Datentypen ändern
    data['Flight_Date'] = pd.to_datetime(data['Flight_Date'])
    data['Request_Date'] = pd.to_datetime(data['Request_Date'])
    
    # One Hot Encoding für Routen-Bezeichnungen
    data = pd.get_dummies(data,prefix=['route'], columns = ['route_abb'], drop_first=False)
    
    # Flag, wenn die Anfrage die letzte Anfrage vor dem Flug ist
    is_last_request = pd.DataFrame(data.groupby('flight_unique_id')['Request_Date'].max()).reset_index()
    is_last_request['is_last_request'] = 1

    data = data.merge(is_last_request, 
                      on=['flight_unique_id', 'Request_Date'], 
                      how='left')
    data['is_last_request'] = data['is_last_request'].fillna(0)
    data['is_last_request'] = data['is_last_request'].astype(int)
    
    # Abstand zu letztem und nächsten Flug berechnen
    forward = calculate_delta_till_next_flight(data, forward=True).reset_index(drop=True)
    backward = calculate_delta_till_next_flight(forward, forward=False)
    data = backward

    # Anzahl der bisherigen Requests als Feature hinzufügen
    data = get_previous_requests(data)
    
    # Preishistorie berechnen
    data = merge_price_history(data)
    data = get_last_n_prices(data, last_n_requests)

    # Datumsfelder in einzelne Bestandteile zerlegen
    data['flight_weekday'] = data['Flight_Date'].dt.weekday
    data['flight_day'] = data['Flight_Date'].dt.day
    data['flight_month'] = data['Flight_Date'].dt.month 
    data['flight_is_weekend'] = data['flight_weekday'] >= 5

    data['request_weekday'] = data['Request_Date'].dt.weekday
    data['request_day'] = data['Request_Date'].dt.day
    data['request_month'] = data['Request_Date'].dt.month
    data['request_is_weekend'] = data['request_weekday'] >= 5
    
    data['request_hour'] = data['Request_Date'].dt.hour
    
    # Cyclische Features in Sinus und Cosinus Repräsentation umwandeln
    # Quelle: https://www.mikulskibartosz.name/time-in-machine-learning/
    def encode(data, col, max_val):
        data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
        data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
        return data

    data = encode(data, 'request_weekday', 7)
    data = encode(data, 'request_month', 12)
    data = encode(data, 'request_day', 365)
    data = encode(data, 'request_hour', 24)

    data = encode(data, 'flight_weekday', 7)
    data = encode(data, 'flight_month', 12)
    data = encode(data, 'flight_day', 365)
    data = encode(data, 'Departure_hour', 24)
    
    # Tage bis zum Flug berechnen
    data['Request_Date_w/o_Time'] = pd.to_datetime(data['Request_Date']).dt.date
    data['days_remaining'] = (pd.to_datetime(data['Flight_Date']).dt.date - data['Request_Date_w/o_Time']).dt.days
    data.drop(['Request_Date_w/o_Time'],1, inplace=True)
    
    # Relevante Feiertage im Zeitraum der Daten, die in Berlin und oder Frankfurt gelten
    # sowie Public Holidays in Großbritannien. 
    feiertage = {
        '2019-06-09':'Pfingstsonntag',
        '2019-06-10':'Pfingstmontag',
        '2019-06-20':'Fronleichnam',
        '2019-06-20':'Schulferien Beginn',
        '2019-08-02':'Schulferien Ende',
        '2019-08-26':'Summer Bank Holidays',
        '2019-07-15':'School Summer Holidays Beginn',
        '2019-09-06':'School Summer Holidays End'}

    feiertage_df = pd.DataFrame(feiertage.items(), columns=['Datum_Feiertag', 'Feiertag_Bezeichnung'])
    feiertage_df['Datum_Feiertag'] = pd.to_datetime(feiertage_df['Datum_Feiertag'])
    
    from datetime import datetime

    day_diff_list = []
    for index, row in feiertage_df.iterrows():
        day_diff_list.append(abs((data['Flight_Date'] - row['Datum_Feiertag']).dt.days))
        
    feiertage_diff_df = pd.concat(day_diff_list, axis=1)
    feiertage_diff_df = feiertage_diff_df.min(axis=1)
    feiertage_diff_df = feiertage_diff_df.reset_index().drop('index', 1)
    feiertage_diff_df.columns = ['Days_Untill_Event']

    data = pd.concat([data, feiertage_diff_df], axis=1)

    # Features skalieren
    scaler = MinMaxScaler()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    dont_scale = ['buy', 'is_last_request']
    to_be_scaled = data.select_dtypes(include=numerics).drop(dont_scale, axis=1)
    data.drop(to_be_scaled.columns, axis=1, inplace=True)
    scaled = pd.DataFrame(scaler.fit_transform(to_be_scaled), columns=to_be_scaled.columns)
    data = pd.concat([data.reset_index(), scaled.reset_index()], axis=1)
    
    # Nicht mehr benötigte Spalten entfernen
    data.drop(['Request_Date', 
               'Flight_Date', 
               'min_future_price_in_Eur', 
               'index', 
               'Departure_hour', 
               'flight_weekday', 
               'flight_day', 
               'flight_month', 
               'request_weekday', 
               'request_day', 
               'request_month', 
            #    'days_remaining',
               'request_hour', 
            #    'Days_Untill_Event',
            #    'previous_requests',
               'flight_unique_id'], inplace=True, axis=1)
    
    # Boolean in Int umwandeln
    data['request_is_weekend'] = data['request_is_weekend'].astype(int)
    data['flight_is_weekend'] = data['flight_is_weekend'].astype(int)
    
    return data

In [201]:
data = prep_data(raw_data)

  return pd.concat([temp.drop(['Prices_cut', 'Price_In_Eur_y'], 1), split_to_columns], axis=1).rename(columns={'Price_In_Eur_x' : 'Price_In_Eur'})
  data.drop(['Request_Date_w/o_Time'],1, inplace=True)
  feiertage_diff_df = feiertage_diff_df.reset_index().drop('index', 1)


In [202]:
filename = os.path.join(dirname, f'../Data/prepped/train_set_n{last_n_requests}.csv')

if input('Datei speichern? y/n') == "y":
    data.to_csv(filename)
    print('Datei gespeichernt.')
else:
    print('Nicht gespeichert.')

Datei gespeichernt.
