In [32]:
## Import
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report


In [2]:
def get_invalid_values(dataframe: pd.DataFrame):
    """
    Take a pandas df as argument, looks for the items 
    in an invalid list. returns a pd df with
    the columns: column, nulls, invalids, 
    and the unique values.
    
    Args:
        df(pd.DataFrame): a pdDataFrame.
    """
    # Running validation on the argument recieved
    assert type(dataframe) == pd.DataFrame, f'{dataframe}, is not a pandas df.'
    df = dataframe
    
    invalid_list =\
    [np.nan, None, [], {}, 'NaN', 'Null','NULL'\
     ,'None','NA','?','-', '--','.','', ' ', '   ']
    
    invalids = []
    uniques = []
    result = pd.DataFrame({
        'nulls': df.isnull().sum(),
    })
    for c in df.columns:
        invalids.append(df[c].isin(invalid_list).sum())
        uniques.append(df[c].unique())
    result['invalids'] = invalids
    result['unique_item'] = uniques
    return(result.head(len(df.columns)))

In [3]:
def miss_df(dataframe: pd.DataFrame):

    """
    Take a pandas df as argument, returns another one
    with  basic information about missing data
    Args:
        df(pd.DataFrame): a pdDataFrame.
    """

    # Running validation on the argument recieved
    assert type(dataframe) == pd.DataFrame, f'{dataframe}, is not a pandas df.'
    df = dataframe.copy()
    total_missing = df.isnull().sum().sort_values(ascending=False)
    percent_missing = (df.isnull().sum() / df.isnull().count()) * 100
    missing_data = pd.concat([total_missing, percent_missing], axis=1, keys=['Total', 'Percent'])
    return missing_data.head(len(df.columns))

In [4]:
## Reading the data
df = pd.read_csv("Invistico_Airline.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129880 non-null  object 
 1   Customer Type                      129880 non-null  object 
 2   Age                                129880 non-null  int64  
 3   Type of Travel                     129880 non-null  object 
 4   Class                              129880 non-null  object 
 5   Flight Distance                    129880 non-null  int64  
 6   Seat comfort                       129880 non-null  int64  
 7   Departure/Arrival time convenient  129880 non-null  int64  
 8   Food and drink                     129880 non-null  int64  
 9   Gate location                      129880 non-null  int64  
 10  Inflight wifi service              129880 non-null  int64  
 11  Inflight entertainment             1298

In [153]:
miss_df(df)

Unnamed: 0,Total,Percent
Arrival Delay in Minutes,393,0.302587
Customer Type,0,0.0
Departure Delay in Minutes,0,0.0
Online boarding,0,0.0
Cleanliness,0,0.0
Checkin service,0,0.0
Baggage handling,0,0.0
Leg room service,0,0.0
On-board service,0,0.0
Ease of Online booking,0,0.0


In [6]:
## Droping missing values that are a very small pct.
df.dropna(inplace=True)

In [7]:
## Obtaining multiple information like
## 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129487 entries, 0 to 129879
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   satisfaction                       129487 non-null  object 
 1   Customer Type                      129487 non-null  object 
 2   Age                                129487 non-null  int64  
 3   Type of Travel                     129487 non-null  object 
 4   Class                              129487 non-null  object 
 5   Flight Distance                    129487 non-null  int64  
 6   Seat comfort                       129487 non-null  int64  
 7   Departure/Arrival time convenient  129487 non-null  int64  
 8   Food and drink                     129487 non-null  int64  
 9   Gate location                      129487 non-null  int64  
 10  Inflight wifi service              129487 non-null  int64  
 11  Inflight entertainment             129487 no

In [8]:
## 
def observe_data_type(dataframe: pd.DataFrame, max_class):

    """
    Args: 
        df(pd.DataFrame): a pdDataFrame.
        max_class: max number to be considered a categorical feature.
        Each case is different and EDA will tell.

    Returns:
        A new dataset with information related to unique values
        per column
    
    """
    # Running validation on the argument recieved
    assert type(dataframe) == pd.DataFrame, f'{dataframe}, is not a pandas df.'
    num_uniques = []
    column_name = []
    check = []
    resu = {}
    result = pd.DataFrame({})
    
    for c in df.columns:
        
        num_uniques.append(len(df[c].unique()))
        column_name.append(c)
        if 2 < len(df[c].unique()) < max_class:
            check.append('yes')
        else: check.append('no')
            
    
    result['columns_name'] = column_name
    result['Num_uniques'] = num_uniques
    result['possible_categorical'] = check
    return(result) 
    

In [9]:
observe_data_type(df, 10)

Unnamed: 0,columns_name,Num_uniques,possible_categorical
0,satisfaction,2,no
1,Customer Type,2,no
2,Age,75,no
3,Type of Travel,2,no
4,Class,3,yes
5,Flight Distance,5397,no
6,Seat comfort,6,yes
7,Departure/Arrival time convenient,6,yes
8,Food and drink,6,yes
9,Gate location,6,yes


In [158]:
## Data types in sets for transformations:

## Target needs to be transformed to binary
target = df['satisfaction']

## All numerical numerical features 
num_features = df.copy().select_dtypes(['int64', 'float64']).columns
cat_features = df.copy().select_dtypes(['object']).columns
## Categorical strings, 
## which contains binary and ordinal data
cat_str = ['Customer Type', 'Type of Travel', 'Class']

## Ordinal
ordinal = ['Seat comfort','Departure/Arrival time convenient', 'Food and drink', 'Gate location',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding']

In [159]:
cat_features
for i in cat_str:
    print(df[i].unique())

['Loyal Customer' 'disloyal Customer']
['Personal Travel' 'Business travel']
['Eco' 'Business' 'Eco Plus']


In [21]:
## Convert Strings from a categorical feature to numerical.

def str_encoder(dataframe, str_to_encode):
    """
    The str_encoder function will convert these columns 
    to numerical ordinal or binary numerical columns and 
    then return a new DataFrame with the transformed columns 
    plus the original columns that were not transformed.
    
    Arguments:
        df: A Pandas Data frame.
        str_to_encode: a list of columns to tranform 
        from the same data set.
    Returns:
        A new data set made of the transformed columns
        plus the original with out the previously transformed
        data.
    """
    ## make a copy 
    df_o = dataframe.copy()
    df_e = df_o[str_to_encode].copy()
    
    le = LabelEncoder()
    str_encoded = df_e.apply(le.fit_transform)

    ## Drop strings categories
    df_o_e = df_o.drop(str_to_encode, axis=1)

    ## Add transformed data
    full = df_o_e.join(str_encoded)
    
    return full

cat_str = ['satisfaction','Customer Type','Type of Travel', 'Class']

encoded_string = str_encoder(df, cat_str)


In [23]:
encoded_string.head()

Unnamed: 0,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Customer Type,Type of Travel,Class
0,65,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0,1,0,1,1
1,47,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0,1,0,1,0
2,15,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0,1,0,1,1
3,60,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0,1,0,1,1
4,70,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0,1,0,1,1


In [12]:
for i in cat_str:
    print(encoded_string[i].unique())

[1 0]
[0 1]
[1 0]
[1 0 2]


In [26]:
miss_df(encoded_string)

Unnamed: 0,Total,Percent
Age,0,0.0
Flight Distance,0,0.0
Type of Travel,0,0.0
Customer Type,0,0.0
satisfaction,0,0.0
Arrival Delay in Minutes,0,0.0
Departure Delay in Minutes,0,0.0
Online boarding,0,0.0
Cleanliness,0,0.0
Checkin service,0,0.0


In [25]:
get_invalid_values(encoded_string)

Unnamed: 0,nulls,invalids,unique_item
Age,0,0,"[65, 47, 15, 60, 70, 30, 66, 10, 56, 22, 58, 3..."
Flight Distance,0,0,"[265, 2464, 2138, 623, 354, 1894, 227, 1812, 7..."
Seat comfort,0,0,"[0, 1, 4, 5, 2, 3]"
Departure/Arrival time convenient,0,0,"[0, 1, 2, 3, 4, 5]"
Food and drink,0,0,"[0, 1, 2, 3, 4, 5]"
Gate location,0,0,"[2, 3, 4, 1, 5, 0]"
Inflight wifi service,0,0,"[2, 0, 3, 4, 5, 1]"
Inflight entertainment,0,0,"[4, 2, 0, 3, 5, 1]"
Online support,0,0,"[2, 3, 4, 5, 1, 0]"
Ease of Online booking,0,0,"[3, 2, 1, 5, 4, 0]"


In [27]:
info = observe_data_type(encoded_string, 10)
info

Unnamed: 0,columns_name,Num_uniques,possible_categorical
0,satisfaction,2,no
1,Customer Type,2,no
2,Age,75,no
3,Type of Travel,2,no
4,Class,3,yes
5,Flight Distance,5397,no
6,Seat comfort,6,yes
7,Departure/Arrival time convenient,6,yes
8,Food and drink,6,yes
9,Gate location,6,yes


In [28]:
## filtering the columns that need OneHoteEncode
## make sure to take the target out :)

to_hot_encode = info.loc[info['possible_categorical'] == 'yes', :]['columns_name'].tolist()
to_hot_encode

['Class',
 'Seat comfort',
 'Departure/Arrival time convenient',
 'Food and drink',
 'Gate location',
 'Inflight wifi service',
 'Inflight entertainment',
 'Online support',
 'Ease of Online booking',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Cleanliness',
 'Online boarding']

In [35]:
for i in to_hot_encode:
    print(encoded_string[i].unique())

[1 0 2]
[0 1 4 5 2 3]
[0 1 2 3 4 5]
[0 1 2 3 4 5]
[2 3 4 1 5 0]
[2 0 3 4 5 1]
[4 2 0 3 5 1]
[2 3 4 5 1 0]
[3 2 1 5 4 0]
[3 4 1 2 5 0]
[0 4 3 2 5 1]
[3 4 1 2 5]
[5 2 4 3 1 0]
[3 4 1 2 5 0]
[2 3 5 4 1 0]


In [64]:
def hot_encoder(dataframe, ordinal_to_hot_encode):
    
    """
    The hot_encoder function will convert the ordinal_to_hot_encode
    columns from numerical ordinal and then return a new DataFrame 
    with the transformed columns plus the original columns that were not transformed.
    
    Arguments:
        df: A Pandas Data frame.
        str_to_encode: a list of columns to tranform 
        from the same data set.
    Returns:
        A new data set made of the transformed columns
        plus the original with out the previously transformed
        data.
    """
    ## Original set
    original = dataframe.copy()
    
    ## Set minus the Transformed columns
    nums = original.drop(ordinal_to_hot_encode, axis=1)
    
    ## this transfomr to str
    str_encoded = original[ordinal_to_hot_encode].astype(str)

    ## Using get_dummies
    encoded_set = pd.get_dummies(str_encoded, dtype=int)

    
    ## Using join with the nums data set and encoded_set
    full = nums.join(encoded_set)
    
    return full


In [65]:
final = hot_encoder(encoded_string, to_hot_encode)
miss_df(final)

Unnamed: 0,Total,Percent
Age,0,0.0
On-board service_1,0,0.0
Leg room service_4,0,0.0
Leg room service_3,0,0.0
Leg room service_2,0,0.0
Leg room service_1,0,0.0
Leg room service_0,0,0.0
On-board service_5,0,0.0
On-board service_4,0,0.0
On-board service_3,0,0.0


In [66]:
final.tail()

Unnamed: 0,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Customer Type,Type of Travel,Class_0,Class_1,Class_2,Seat comfort_0,Seat comfort_1,Seat comfort_2,Seat comfort_3,Seat comfort_4,Seat comfort_5,Departure/Arrival time convenient_0,Departure/Arrival time convenient_1,Departure/Arrival time convenient_2,Departure/Arrival time convenient_3,Departure/Arrival time convenient_4,Departure/Arrival time convenient_5,Food and drink_0,Food and drink_1,Food and drink_2,Food and drink_3,Food and drink_4,Food and drink_5,Gate location_0,Gate location_1,Gate location_2,Gate location_3,Gate location_4,Gate location_5,Inflight wifi service_0,Inflight wifi service_1,Inflight wifi service_2,Inflight wifi service_3,Inflight wifi service_4,Inflight wifi service_5,Inflight entertainment_0,Inflight entertainment_1,Inflight entertainment_2,Inflight entertainment_3,Inflight entertainment_4,Inflight entertainment_5,Online support_0,Online support_1,Online support_2,Online support_3,Online support_4,Online support_5,Ease of Online booking_0,Ease of Online booking_1,Ease of Online booking_2,Ease of Online booking_3,Ease of Online booking_4,Ease of Online booking_5,On-board service_0,On-board service_1,On-board service_2,On-board service_3,On-board service_4,On-board service_5,Leg room service_0,Leg room service_1,Leg room service_2,Leg room service_3,Leg room service_4,Leg room service_5,Baggage handling_1,Baggage handling_2,Baggage handling_3,Baggage handling_4,Baggage handling_5,Checkin service_0,Checkin service_1,Checkin service_2,Checkin service_3,Checkin service_4,Checkin service_5,Cleanliness_0,Cleanliness_1,Cleanliness_2,Cleanliness_3,Cleanliness_4,Cleanliness_5,Online boarding_0,Online boarding_1,Online boarding_2,Online boarding_3,Online boarding_4,Online boarding_5
129875,29,1731,0,0.0,1,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
129876,63,2087,174,172.0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
129877,69,2320,155,163.0,0,1,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
129878,66,2450,193,205.0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
129879,38,4307,185,186.0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
