In [None]:
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the training workbook, relative to the notebook's working directory.
train_data_path = 'data/Data_Train.xlsx'
# Read the Excel workbook into the DataFrame that the rest of the notebook works on.
data_df = pd.read_excel(train_data_path)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
data_df.info()

# Data cleaning
Deal with missing values

In [None]:
# Drop, in place, every row that contains at least one missing value
# (dropna defaults: row-wise, any NaN in the row).
data_df.dropna(inplace=True)

In [None]:
# Keep an independent copy of the cleaned frame; the cells below add and drop
# columns on data_df itself.
data_df_copy = data_df.copy()

In [None]:
# Preview the first three rows after the missing-value rows were removed.
data_df.head(3)

## Data preprocessing & derived-feature extraction

In [None]:
# change columns to DateTime
def change_to_datetime(col, df=None, **to_datetime_kwargs):
    """Convert one column of a DataFrame to datetime, writing the result back in place.

    Parameters
    ----------
    col : str
        Name of the column to convert.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``data_df`` is used,
        so existing ``change_to_datetime(col)`` calls behave as before.
    **to_datetime_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.to_datetime``
        (for example ``dayfirst=True`` or ``format='%d/%m/%Y'``) so the caller
        can remove ambiguity in how date strings are parsed.

    Returns
    -------
    None
        The column is replaced on the target frame.
    """
    # Look up the global only when no frame is passed, so the function can also
    # be used (and tested) on any other DataFrame.
    target = data_df if df is None else df
    target[col] = pd.to_datetime(target[col], **to_datetime_kwargs)

In [None]:
# Columns that are converted to datetime by change_to_datetime.
columns_to_change = ['Date_of_Journey', 'Dep_Time', 'Arrival_Time']

In [None]:
# Convert each listed column of data_df to datetime, one at a time.
# NOTE(review): pd.to_datetime is called without dayfirst/format here; if
# Date_of_Journey is stored day-first (e.g. 24/03/2019), ambiguous dates may be
# parsed month-first — confirm against the data.
for c in columns_to_change:
    change_to_datetime(c)

In [None]:
# Split the journey date into separate numeric calendar features.
journey_date = data_df['Date_of_Journey'].dt
for feature_name, date_part in (
    ('journey_day', 'day'),
    ('journey_month', 'month'),
    ('journey_year', 'year'),
):
    data_df[feature_name] = getattr(journey_date, date_part)

In [None]:
# extract hour and minute
def extract_hour_minute(df, col):
    """Add ``<col>_hour`` and ``<col>_minute`` columns to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[col]`` must support the ``.dt`` accessor.
    col : str
        Name of the datetime column to read the hour and minute from.

    Returns
    -------
    pandas.DataFrame
        The first three rows of the modified frame, as a quick preview.
    """
    clock = df[col].dt
    for unit in ('hour', 'minute'):
        df[f'{col}_{unit}'] = getattr(clock, unit)
    return df.head(3)

In [None]:
# Datetime columns whose hour and minute become separate numeric features.
cols_to_extract = ['Dep_Time', 'Arrival_Time']
for c in cols_to_extract:
    # Adds '<column>_hour' and '<column>_minute' to data_df; the returned preview is ignored.
    extract_hour_minute(data_df, c)
# Drop the source datetime columns now that their parts have been extracted.
# Note: re-running this cell fails because the columns no longer exist.
data_df.drop(cols_to_extract, axis=1, inplace=True)

# Data Analysis

In [None]:
def flight_departure_time(hour):
    """Map a departure hour to a named part of the day.

    Each bucket excludes its lower bound and includes its upper bound:
    (4, 8] 'Early Morning', (8, 12] 'Morning', (12, 16] 'Noon',
    (16, 20] 'Evening', (20, 24] 'Night'. Every other value, including
    hours 0 through 4, is labelled 'Late Night'.

    Parameters
    ----------
    hour : int or float
        Hour of departure.

    Returns
    -------
    str
        The label of the matching bucket.
    """
    buckets = (
        (4, 8, 'Early Morning'),
        (8, 12, 'Morning'),
        (12, 16, 'Noon'),
        (16, 20, 'Evening'),
        (20, 24, 'Night'),
    )
    for lower, upper, label in buckets:
        if lower < hour <= upper:
            return label
    # Nothing matched: hours 0-4 and anything outside the ranges above.
    return 'Late Night'

In [None]:
import plotly
import cufflinks as cf
from cufflinks.offline import go_offline
from plotly.offline import plot, iplot, init_notebook_mode, download_plotlyjs
# Initialise plotly's notebook integration (connected=True) and switch
# cufflinks to offline mode.
init_notebook_mode(connected=True)
cf.go_offline()

In [None]:
# Bar chart of how many flights depart in each part of the day.
(
    data_df['Dep_Time_hour']
    .map(flight_departure_time)
    .value_counts()
    .plot.bar()
)

In [None]:
# transforming duration
def preprocess_duration(duration):
    """Pad a duration string so it carries both an hours and a minutes part.

    A string without 'h' gets '0h ' prepended; otherwise, a string without
    'm' gets ' 0m' appended. A string containing both is returned unchanged.

    Parameters
    ----------
    duration : str
        Duration text such as '2h 50m', '19h' or '5m'.

    Returns
    -------
    str
        The padded duration text.
    """
    if 'h' not in duration:
        return f'0h {duration}'
    if 'm' not in duration:
        return f'{duration} 0m'
    return duration

In [None]:
# Pad every duration so it has both an hours and a minutes component.
data_df['Duration'] = data_df['Duration'].map(preprocess_duration)

In [None]:
# Durations are expected to read '<H>h <M>m' here: split on the space once,
# then drop the trailing unit letter of each token and convert the rest to int.
duration_tokens = data_df['Duration'].map(lambda text: text.split(' '))
data_df['Duration_hours'] = duration_tokens.map(lambda tokens: int(tokens[0][:-1]))
data_df['Duration_mins'] = duration_tokens.map(lambda tokens: int(tokens[1][:-1]))

In [None]:
def total_duration(hour, min):
    """Return the total number of minutes in ``hour`` hours plus ``min`` minutes.

    The parameter names are part of the interface because callers pass them as
    keywords; ``min`` therefore shadows the builtin inside this function.

    Parameters
    ----------
    hour : number
        Whole hours of the duration.
    min : number
        Remaining minutes of the duration.

    Returns
    -------
    number
        ``hour * 60 + min``.
    """
    hours_as_minutes = hour * 60
    return hours_as_minutes + min

# total_duration only uses `*` and `+`, so it works on whole columns at once.
# Passing the two Series directly gives the same per-row result as a row-wise
# DataFrame.apply(axis=1) without calling a Python function for every row.
data_df['Duration_total_mins'] = total_duration(
    hour=data_df['Duration_hours'],
    min=data_df['Duration_mins'],
)

In [None]:
# Display the derived total-duration column (hours * 60 + minutes).
data_df['Duration_total_mins']

### Does duration impact price?

In [None]:
# Price against total duration, with points coloured by the Total_Stops value.
sns.scatterplot(
    data=data_df,
    x='Duration_total_mins',
    y='Price',
    hue='Total_Stops',
)

In [None]:
# Same two variables drawn with seaborn's lmplot (scatter plus a fitted regression line).
sns.lmplot(
    data=data_df,
    x='Duration_total_mins',
    y='Price',
)

In [None]:
# Price distribution per airline. Rows are sorted by descending price before
# plotting; presumably this fixes the left-to-right airline order — confirm.
rows_by_price_desc = data_df.sort_values(by='Price', ascending=False)
sns.boxplot(data=rows_by_price_desc, x='Airline', y='Price')
# `plt` is the top-level matplotlib package in this notebook, hence `plt.pyplot`.
plt.pyplot.xticks(rotation='vertical')
plt.pyplot.show()