<img width="300" src="https://cdn.cnn.com/cnnnext/dam/assets/210810173434-4-pan-am-707-economy-meals-credit-anne-sweeney-full-169.jpg" align='left'>

# First Look
Here we take a first look at the dataset and its features to get an overview and to develop a strategy for
the cleaning process.

In [1]:
import pandas as pd
import numpy as np
import yaml
import sys
import os

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Put the `../src` directory (resolved against the current working directory)
# at the front of the module search path so the local `mylib` package below
# can be imported.
sys.path.insert(0, os.path.abspath('../src'))
import mylib.cleaning as clean

In [3]:
# Never truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)

#### Load configs from yaml file

In [4]:
# Load the project parameters (the raw-data path used below lives under
# config['data']['raw']). The encoding is pinned to UTF-8 so that parsing does
# not depend on the platform's locale default.
with open('../params.yaml', encoding='utf-8') as file:
    config = yaml.safe_load(file)

#### Read dataset

In [5]:
# The location of the raw CSV is taken from the loaded configuration.
raw_csv_path = config['data']['raw']
data = pd.read_csv(raw_csv_path)

In [6]:
# Peek at the first three rows to see the raw column names and value formats.
data.head(3)

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,5,3,5,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,1,3,1,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,5,5,5,5,4,3,4,4,4,5,0,0.0,satisfied


#### Normalize column names

In [7]:
# Replace the raw headers with the names produced by the project helper and
# print the resulting column index so the outcome can be checked.
# NOTE(review): this rebinds `data`, so re-running the cell passes the already
# normalized frame back into the helper — presumably harmless if the helper is
# idempotent; confirm in mylib.cleaning.
data = clean.normalize_column_names(data)
print(data.columns)

Index(['id', 'gender', 'customer_type', 'age', 'type_of_travel', 'class',
       'flight_distance', 'inflight_wifi_service',
       'departure_arrival_time_convenient', 'ease_of_online_booking',
       'gate_location', 'food_and_drink', 'online_boarding', 'seat_comfort',
       'inflight_entertainment', 'on_board_service', 'leg_room_service',
       'baggage_handling', 'checkin_service', 'inflight_service',
       'cleanliness', 'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'satisfaction'],
      dtype='object')


#### Get dataset information

In [8]:
# Summarize dtype and non-null count per column to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 129880 non-null  int64  
 1   gender                             129880 non-null  object 
 2   customer_type                      129880 non-null  object 
 3   age                                129880 non-null  int64  
 4   type_of_travel                     129880 non-null  object 
 5   class                              129880 non-null  object 
 6   flight_distance                    129880 non-null  int64  
 7   inflight_wifi_service              129880 non-null  int64  
 8   departure_arrival_time_convenient  129880 non-null  int64  
 9   ease_of_online_booking             129880 non-null  int64  
 10  gate_location                      129880 non-null  int64  
 11  food_and_drink                     1298

Luckily, only the column `arrival_delay_in_minutes` contains NaN values ...

#### Take a look at the categoricals

In [9]:
# Summary (count, unique, top, freq) of the object-dtype columns, transposed so
# that each column becomes one row.
data.describe(include=object).T

Unnamed: 0,count,unique,top,freq
gender,129880,2,Female,65899
customer_type,129880,2,Loyal Customer,106100
type_of_travel,129880,2,Business travel,89693
class,129880,3,Business,62160
satisfaction,129880,2,neutral or dissatisfied,73452


#### Column `gender`
- Rename 'Female' to 'F'
- Rename 'Male' to 'M'

In [10]:
# Check which labels occur (and how often) before mapping them to 'F' / 'M'.
data['gender'].value_counts()

Female    65899
Male      63981
Name: gender, dtype: int64

#### Column `customer_type`
- Rename 'Loyal Customer' to 'Loyal'
- Rename 'disloyal Customer' to 'Disloyal'

In [11]:
# Check the exact spelling of the labels (note the lower-case 'disloyal')
# before renaming them.
data['customer_type'].value_counts()

Loyal Customer       106100
disloyal Customer     23780
Name: customer_type, dtype: int64

#### Column `type_of_travel`
- Rename 'Business travel' to 'Business'
- Rename 'Personal Travel' to 'Private'

In [12]:
# Check the exact spelling of the labels (capitalization of 'travel' differs
# between the two) before renaming them.
data['type_of_travel'].value_counts()

Business travel    89693
Personal Travel    40187
Name: type_of_travel, dtype: int64

#### Column `class`

In [13]:
# List the travel classes and how many rows fall into each.
data['class'].value_counts()

Business    62160
Eco         58309
Eco Plus     9411
Name: class, dtype: int64

### This is our dependent variable
#### Column `satisfaction`

- Rename column to 'satisfied'
- Change 'neutral or dissatisfied' to False
- Change 'satisfied' to True
- Change column type to boolean

In [14]:
# Check the two target labels and their balance before encoding the column as
# a boolean.
data['satisfaction'].value_counts()

neutral or dissatisfied    73452
satisfied                  56428
Name: satisfaction, dtype: int64

#### Take a look at the numericals

In [15]:
# Descriptive statistics of the numeric columns, transposed so that each
# column becomes one row.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,129880.0,64940.5,37493.270818,1.0,32470.75,64940.5,97410.25,129880.0
age,129880.0,39.427957,15.11936,7.0,27.0,40.0,51.0,85.0
flight_distance,129880.0,1190.316392,997.452477,31.0,414.0,844.0,1744.0,4983.0
inflight_wifi_service,129880.0,2.728696,1.32934,0.0,2.0,3.0,4.0,5.0
departure_arrival_time_convenient,129880.0,3.057599,1.526741,0.0,2.0,3.0,4.0,5.0
ease_of_online_booking,129880.0,2.756876,1.40174,0.0,2.0,3.0,4.0,5.0
gate_location,129880.0,2.976925,1.27852,0.0,2.0,3.0,4.0,5.0
food_and_drink,129880.0,3.204774,1.329933,0.0,2.0,3.0,4.0,5.0
online_boarding,129880.0,3.252633,1.350719,0.0,2.0,3.0,4.0,5.0
seat_comfort,129880.0,3.441361,1.319289,0.0,2.0,4.0,5.0,5.0


#### Let's take a look at the nan values in 'arrival_delay_in_minutes'

In [16]:
# Share of rows with a missing arrival delay, in percent: the mean of the
# boolean mask is the fraction of True entries.
arrival_delay_missing = data['arrival_delay_in_minutes'].isna()
percent_nan = arrival_delay_missing.mean() * 100
print(f"Percent nan: {round(percent_nan, 2)} %")

Percent nan: 0.3 %


In [17]:
# Count the rows with a missing arrival delay by summing the boolean mask
# instead of materializing the filtered frame.
n_nan = int(data['arrival_delay_in_minutes'].isna().sum())
print(f'Number of nan rows: {n_nan}')

Number of nan rows: 393


In [18]:
# Target distribution among the rows whose arrival delay is missing, selected
# in a single .loc step (row mask + column label).
rows_without_arrival_delay = data['arrival_delay_in_minutes'].isna()
data.loc[rows_without_arrival_delay, 'satisfaction'].value_counts()

neutral or dissatisfied    227
satisfied                  166
Name: satisfaction, dtype: int64

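Only 393 rows (0.3%) contain NaN values in column 'arrival_delay_in_minutes'.<br>
Their satisfaction split (227 vs. 166, i.e. about 58% 'neutral or dissatisfied') is close to that of the full dataset (about 57%), so removing them should not bias the target.<br>
So we will drop all these rows ...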