# Challenge

What is Preppin' Data? 

Preppin' Data is a website that posts weekly real-world data preparation challenges for data professionals to solve using Tableau Prep. However, to demonstrate my SQL and Python capabilities, I will be solving the challenges with these tools instead.

This week's challenge: https://preppindata.blogspot.com/2024/01/2024-week-2-average-price-analysis.html 

In [1]:
import pandas as pd

In [3]:
# Locations of the two input extracts: flow card holders first, non-holders second.
# NOTE(review): these are absolute, machine-specific paths - adjust them before running elsewhere.
path1, path2 = (
    '/Users/Mark1/Documents/Data Science/preppin_data/2024/week_2/data/input/PD 2024 Wk 1 Output Flow Card.csv',
    '/Users/Mark1/Documents/Data Science/preppin_data/2024/week_2/data/input/PD 2024 Wk 1 Output Non-Flow Card.csv',
)

In [None]:
# Read each input csv into its own DataFrame (flow card file first, non-flow card second)
df1, df2 = pd.read_csv(path1), pd.read_csv(path2)

# Plain-text preview of the first rows of both frames, one after the other
print(df1.head(), df2.head(), sep='\n')

         Date Flight Number      From        To            Class   Price  \
0  22/07/2024         PA010     Tokyo  New York          Economy  2380.0   
1  20/04/2024         PA002  New York    London          Economy  3490.0   
2  23/01/2024         PA010     Tokyo  New York  Premium Economy   825.0   
3  05/06/2024         PA006     Tokyo    London      First Class   618.0   
4  30/03/2024         PA004     Perth    London      First Class   446.0   

  Flow Card?  Bags Checked   Meal Type  
0        Yes             0    Egg Free  
1        Yes             1       Vegan  
2        Yes             1  Vegetarian  
3        Yes             3       Vegan  
4        Yes             1    Nut Free  
         Date Flight Number      From        To            Class   Price  \
0  28/09/2024         PA008     Perth  New York          Economy  1855.0   
1  01/10/2024         PA008     Perth  New York   Business Class   634.8   
2  04/03/2024         PA007  New York     Perth   Business Class   45

In [9]:
# Union the flow card and non-flow card rows into a single frame.
# ignore_index=True gives the combined rows a fresh 0..n-1 index; without it each
# input keeps its own 0-based labels, so the union would carry duplicate index
# labels and label-based lookups (e.g. df.loc[0]) would return several rows.
df = pd.concat([df1, df2], axis=0, ignore_index=True)
df.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,22/07/2024,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,20/04/2024,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
2,23/01/2024,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
3,05/06/2024,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan
4,30/03/2024,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free


In [11]:
# Convert the date field to a quarter number (1-4) and store it in a new 'Quarter' field.
# The dates are day-first strings such as 22/07/2024. An explicit format makes the
# parse strict: dayfirst=True is only a preference, so a malformed or month-first
# value could be silently reinterpreted instead of raising.
df['Quarter'] = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.quarter
df.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type,Quarter
0,22/07/2024,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free,3
1,20/04/2024,PA002,New York,London,Economy,3490.0,Yes,1,Vegan,2
2,23/01/2024,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian,1
3,05/06/2024,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan,2
4,30/03/2024,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free,1


In [88]:
# Summarise ticket prices for every quarter / flow card / class combination:
# the median, minimum and maximum price of each group.
group_keys = ['Quarter', 'Flow Card?', 'Class']
price_stats = {'Price': ['median', 'min', 'max']}
df_agg = (
    df.groupby(group_keys)
    .agg(price_stats)
    .reset_index()  # turn the grouping keys back into ordinary columns
    .copy()
)
df_agg.head()

Unnamed: 0_level_0,Quarter,Flow Card?,Class,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,median,min,max
0,1,No,Business Class,574.8,241.2,834.0
1,1,No,Economy,2340.0,1030.0,3455.0
2,1,No,First Class,438.0,204.0,699.0
3,1,No,Premium Economy,1075.0,515.0,1702.5
4,1,Yes,Business Class,523.2,249.6,840.0


In [89]:
# Overwrite the column labels with six flat names (three grouping keys followed by
# the three price statistics) so the columns are easier to reference later on.
key_names = ['Quarter', 'Flow Card?', 'Class']
stat_names = ['median', 'min', 'max']
df_agg.columns = key_names + stat_names
df_agg.head()

Unnamed: 0,Quarter,Flow Card?,Class,median,min,max
0,1,No,Business Class,574.8,241.2,834.0
1,1,No,Economy,2340.0,1030.0,3455.0
2,1,No,First Class,438.0,204.0,699.0
3,1,No,Premium Economy,1075.0,515.0,1702.5
4,1,Yes,Business Class,523.2,249.6,840.0


In [98]:
# Reshape to one row per quarter / flow card combination, with a separate column
# for each class holding that group's median price.
df_pivot = df_agg.pivot_table(
    index=['Quarter', 'Flow Card?'],
    columns='Class',
    values='median',
).copy()
df_pivot.head()

Unnamed: 0_level_0,Class,Business Class,Economy,First Class,Premium Economy
Quarter,Flow Card?,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,No,574.8,2340.0,438.0,1075.0
1,Yes,523.2,2325.0,447.5,1160.0
2,No,553.8,2325.0,445.0,1205.0
2,Yes,517.8,2290.0,459.0,1071.25
3,No,490.8,2285.0,487.0,1125.0


In [101]:
# The pricing is incorrect for the Class, so relabel each pivoted column with the
# class its prices actually belong to.
class_corrections = {
    'First Class': 'Economy',
    'Business Class': 'Premium Economy',
    'Premium Economy': 'Business Class',
    'Economy': 'First Class'}
column_order = ['Economy', 'Premium Economy', 'Business Class', 'First Class']

# The mapping is a pair of swaps, so applying it a second time would restore the
# wrong labels. A freshly pivoted frame lists its class columns alphabetically,
# which never equals column_order, so the column order doubles as an
# "already corrected" marker and makes this cell safe to re-run.
if list(df_pivot.columns) != column_order:
    # Store the reordered frame so df_pivot matches what is displayed below.
    df_pivot = df_pivot.rename(columns=class_corrections)[column_order]
df_pivot.head()

Unnamed: 0_level_0,Class,Economy,Premium Economy,Business Class,First Class
Quarter,Flow Card?,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,No,438.0,574.8,1075.0,2340.0
1,Yes,447.5,523.2,1160.0,2325.0
2,No,445.0,553.8,1205.0,2325.0
2,Yes,459.0,517.8,1071.25,2290.0
3,No,487.0,490.8,1125.0,2285.0
