In [1]:
# Import library
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Database connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are
# the original local-development values.
user = os.environ.get("AIRLINE_DB_USER", "root")
password = os.environ.get("AIRLINE_DB_PASSWORD", "")
# The port is used with an integer format specifier, so convert it explicitly.
port = int(os.environ.get("AIRLINE_DB_PORT", "3306"))
database = os.environ.get("AIRLINE_DB_NAME", "airline_quality")

In [3]:
# Create the SQLAlchemy engine used to query the database.
# The user name and password are percent-encoded before being placed in the
# URL: characters such as "@", ":" or "/" inside a credential would otherwise
# be read as URL delimiters and corrupt the connection string.
engine = create_engine(
    "mysql+mysqldb://%s:%s@localhost:%i/%s"
    % (quote_plus(user), quote_plus(password), port, database)
)

In [4]:
# Load every row of the passenger satisfaction table and preview the first rows
select_command = "SELECT * FROM airline_passenger_satisfaction"
passenger_satis = pd.read_sql_query(sql=select_command, con=engine)
passenger_satis.head()

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1,3,...,3,4,4,5,4,3,3,3,3,Satisfied


In [5]:
# Load every row of the airline comments table and preview the last rows
select_command = "SELECT * FROM airline_comments"
passenger_comment = pd.read_sql_query(sql=select_command, con=engine)
passenger_comment.tail()

Unnamed: 0,ID,Airline Name,Verified,Date Flown,Value For Money,Recommend
4100,4332,American Eagle,False,October 2015,1,no
4101,4333,American Eagle,False,September 2015,5,yes
4102,4334,American Eagle,False,October 2015,1,no
4103,4335,American Eagle,False,August 2015,4,yes
4104,4336,American Eagle,False,August 2015,4,yes


In [6]:
# Keep only the passengers whose ID lies within the range covered by the
# comments table. The cut-off is read from the data rather than hard-coded
# (it used to be the literal 4336), so the filter stays correct if the
# comments table changes.
# NOTE(review): assumes 4336 was the largest ID in passenger_comment — confirm.
# Passengers inside the range that have no comment row are still kept.
max_comment_id = passenger_comment["ID"].max()
selected_passenger_satis = passenger_satis[passenger_satis["ID"] <= max_comment_id]

In [7]:
# Left-join the comment columns onto the selected passengers, matching on "ID".
# Passengers without a matching comment keep their row; their comment columns
# are filled with missing values.
merge_df = selected_passenger_satis.merge(
    passenger_comment, on="ID", how="left"
).reset_index(drop=True)

In [8]:
# Encode "Customer Type" as a dimension key: 1 for "First-time", 2 for any other value
merge_df["customer_type"] = np.where(
    merge_df["Customer Type"].eq("First-time"), 1, 2
)

In [9]:
# Encode "Type of Travel" as a dimension key: 1 for "Business", 2 for any other value
merge_df["travel_type"] = np.where(
    merge_df["Type of Travel"].eq("Business"), 1, 2
)

In [10]:
# Encode "Class" as a dimension key:
#   1 = "Business", 2 = "Economy", 3 = any other value
merge_df["class_type"] = np.select(
    [merge_df["Class"].eq("Business"), merge_df["Class"].eq("Economy")],
    [1, 2],
    default=3,
)

In [11]:
# Encode "Satisfaction" as a dimension key:
#   1 = "Neutral or Dissatisfied", 2 = any other value
merge_df["satis_level"] = np.where(
    merge_df["Satisfaction"].eq("Neutral or Dissatisfied"), 1, 2
)

In [12]:
# Count the missing values in the "Value For Money" column
merge_df["Value For Money"].isna().sum()

231

In [13]:
# Replace the missing "Value For Money" entries with 0
merge_df["Value For Money"] = merge_df["Value For Money"].fillna(value=0)

In [14]:
# Re-count the missing "Value For Money" values to confirm the fill left none
merge_df["Value For Money"].isna().sum()

0

In [15]:
# Cast "Value For Money" to an integer dtype
# (presumably it was float because of the missing values — verify)
merge_df["Value For Money"] = merge_df["Value For Money"].astype(dtype=int)

In [16]:
# Build the table of distinct passenger/flight attribute combinations.
# The grouping is used only to de-duplicate: the result has one row per
# combination of the key columns that occurs in merge_df, ordered by those
# keys, and contains nothing but the key columns.
# The key list is declared once (it used to be written out twice), and size()
# replaces count() so that no per-column counts are computed just to be
# discarded by the final column selection.
# Rows with a missing value in any key column are left out, because groupby
# drops NaN keys by default.
group_keys = [
    "Date Flown",
    "Gender",
    "Age",
    "customer_type",
    "travel_type",
    "class_type",
    "satis_level",
    "Verified",
    "Recommend",
]
grouped_customer = merge_df.groupby(group_keys).size().reset_index()[group_keys]

In [17]:
# Write the grouped customer table to CSV, keeping the row index as the first column
grouped_customer.to_csv(
    path_or_buf="../../file/grouped_customer.csv",
    index=True,
)

In [18]:
# Select the delay figures and the ratings each passenger gave on the
# evaluation form, one row per row of merge_df.
score_columns = [
    "Departure Delay",
    "Arrival Delay",
    "Departure and Arrival Time Convenience",
    "Ease of Online Booking",
    "Check-in Service",
    "Online Boarding",
    "Gate Location",
    "On-board Service",
    "Seat Comfort",
    "Leg Room Service",
    "Cleanliness",
    "Food and Drink",
    "In-flight Service",
    "In-flight Wifi Service",
    "In-flight Entertainment",
    "Baggage Handling",
    "Value For Money",
]
evaluate_score = merge_df.loc[:, score_columns]

In [19]:
# Write the passengers' evaluation scores to CSV, keeping the row index as the first column
evaluate_score.to_csv(
    path_or_buf="../../file/evaluate_score.csv",
    index=True,
)