# Challenge

What is Preppin' Data? 

Preppin' Data is a website that posts weekly real world data preparation challenges for data professionals to solve using Tableau Prep. However, to demonstrate my SQL and Python capabilities I will be solving the challenges with these tools. 

https://preppindata.blogspot.com/2024/01/2024-week-2-average-price-analysis.html 

# Install and Import

In [35]:
import pandas as pd
import pandasql as ps
import sqlite3

# Get Data

## Read Data

In [55]:
path1 = '/Users/Mark1/Documents/Data Science/preppin_data/2024/week_2/data/input/PD 2024 Wk 1 Output Flow Card.csv'
path2 = '/Users/Mark1/Documents/Data Science/preppin_data/2024/week_2/data/input/PD 2024 Wk 1 Output Non-Flow Card.csv'

In [None]:
# Connect to an SQLite database (in memory for temporary use)
conn = sqlite3.connect(':memory:')  # Use ':memory:' for a temporary database
cursor = conn.cursor()

In [41]:
# Read the CSV into a pandas DataFrame
df1 = pd.read_csv(path1)


In [39]:
df1.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,22/07/2024,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,20/04/2024,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
2,23/01/2024,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
3,05/06/2024,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan
4,30/03/2024,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free


In [38]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1883 entries, 0 to 1882
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           1883 non-null   object 
 1   Flight Number  1883 non-null   object 
 2   From           1883 non-null   object 
 3   To             1883 non-null   object 
 4   Class          1883 non-null   object 
 5   Price          1883 non-null   float64
 6   Flow Card?     1883 non-null   object 
 7   Bags Checked   1883 non-null   int64  
 8   Meal Type      1594 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 132.5+ KB


In [56]:
# Read the CSV into a pandas DataFrame
df2 = pd.read_csv(path2)

## Create and Load SQLite DB table

In [45]:
cursor.execute(
"""
CREATE TABLE table1 (
    "Date" TEXT,
    "Flight Number" TEXT,
    "From" TEXT,
    "To" TEXT,
    "Class" TEXT,
    "Price" REAL,
    "Flow Card?" TEXT,
    "Bags Checked" INTEGER,
    "Meal Type" TEXT
);
""")

OperationalError: table table1 already exists

In [46]:
# Load the DataFrame into the SQLite table
df1.to_sql('table1', conn, if_exists='replace', index=False)

1883

In [53]:
# Verify the schema using PRAGMA
query = """ 
PRAGMA table_info('table1')
"""

query_result = pd.read_sql_query(query, conn)
query_result

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,Date,TEXT,0,,0
1,1,Flight Number,TEXT,0,,0
2,2,From,TEXT,0,,0
3,3,To,TEXT,0,,0
4,4,Class,TEXT,0,,0
5,5,Price,REAL,0,,0
6,6,Flow Card?,TEXT,0,,0
7,7,Bags Checked,INTEGER,0,,0
8,8,Meal Type,TEXT,0,,0


In [None]:
query = """ 
SELECT * 
FROM table1 
LIMIT 5
"""

query_result = pd.read_sql_query(query, conn)
query_result

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,22/07/2024,PA010,Tokyo,New York,Economy,2380.0,Yes,0,Egg Free
1,20/04/2024,PA002,New York,London,Economy,3490.0,Yes,1,Vegan
2,23/01/2024,PA010,Tokyo,New York,Premium Economy,825.0,Yes,1,Vegetarian
3,05/06/2024,PA006,Tokyo,London,First Class,618.0,Yes,3,Vegan
4,30/03/2024,PA004,Perth,London,First Class,446.0,Yes,1,Nut Free


In [57]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1895 entries, 0 to 1894
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           1895 non-null   object 
 1   Flight Number  1895 non-null   object 
 2   From           1895 non-null   object 
 3   To             1895 non-null   object 
 4   Class          1895 non-null   object 
 5   Price          1895 non-null   float64
 6   Flow Card?     1895 non-null   object 
 7   Bags Checked   1895 non-null   int64  
 8   Meal Type      1595 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 133.4+ KB


In [58]:
df2.head()

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,28/09/2024,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian
1,01/10/2024,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian
2,04/03/2024,PA007,New York,Perth,Business Class,458.4,No,3,Nut Free
3,25/02/2024,PA010,Tokyo,New York,Premium Economy,1435.0,No,0,
4,29/03/2024,PA004,Perth,London,Economy,2730.0,No,2,Vegan


In [59]:
cursor.execute(
"""
CREATE TABLE table2 (
    "Date" TEXT,
    "Flight Number" TEXT,
    "From" TEXT,
    "To" TEXT,
    "Class" TEXT,
    "Price" REAL,
    "Flow Card?" TEXT,
    "Bags Checked" INTEGER,
    "Meal Type" TEXT
);
""")

<sqlite3.Cursor at 0x116cb1110>

In [60]:
# Load the DataFrame into the SQLite table
df2.to_sql('table2', conn, if_exists='replace', index=False)

1895

In [61]:
query = """ 
SELECT * 
FROM table2 
LIMIT 5
"""

query_result = pd.read_sql_query(query, conn)
query_result

Unnamed: 0,Date,Flight Number,From,To,Class,Price,Flow Card?,Bags Checked,Meal Type
0,28/09/2024,PA008,Perth,New York,Economy,1855.0,No,2,Vegetarian
1,01/10/2024,PA008,Perth,New York,Business Class,634.8,No,0,Vegetarian
2,04/03/2024,PA007,New York,Perth,Business Class,458.4,No,3,Nut Free
3,25/02/2024,PA010,Tokyo,New York,Premium Economy,1435.0,No,0,
4,29/03/2024,PA004,Perth,London,Economy,2730.0,No,2,Vegan


# Data Preparation

In [None]:
# Close the connection
conn.close()