In [1]:
import pandas as pd
import numpy as np

In [1]:
from sklearn.preprocessing import OneHotEncoder

from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
from pycaret.classification import *

In [6]:
train = pd.read_csv("data/train.csv")

test = pd.read_csv("data/test.csv")

# sample = pd.read_csv("data/sample_submission.csv")


In [19]:
train.columns

Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
       'Food and drink', 'Gate location', 'Inflight wifi service',
       'Inflight entertainment', 'Online support', 'Ease of Online booking',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'target'],
      dtype='object')

In [41]:
setup_clf = setup(data = train, target='target', n_jobs = -1)

Unnamed: 0,Description,Value
0,session_id,1667
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(3000, 24)"
5,Missing Values,0
6,Numeric Features,5
7,Categorical Features,18
8,Ordinal Features,0
9,High Cardinality Features,0


In [None]:
top5 = compare_models(sort='Accuracy', n_select=10)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9262,0.982,0.9331,0.9358,0.9343,0.85,0.8503,0.018
et,Extra Trees Classifier,0.9195,0.9785,0.9288,0.9283,0.9284,0.8364,0.8368,0.064
gbc,Gradient Boosting Classifier,0.9104,0.9728,0.9212,0.9203,0.9204,0.8179,0.8186,0.095
rf,Random Forest Classifier,0.91,0.9756,0.9195,0.9208,0.9199,0.8171,0.8177,0.05
lr,Logistic Regression,0.8942,0.9613,0.9017,0.9099,0.9055,0.7854,0.7861,0.117
lda,Linear Discriminant Analysis,0.8923,0.96,0.8983,0.9094,0.9036,0.7816,0.7821,0.008
ridge,Ridge Classifier,0.8914,0.0,0.8992,0.9072,0.903,0.7796,0.78,0.004
ada,Ada Boost Classifier,0.8885,0.9564,0.8932,0.9077,0.9,0.774,0.7749,0.033
dt,Decision Tree Classifier,0.8614,0.8585,0.8814,0.8738,0.8771,0.7181,0.7191,0.005
nb,Naive Bayes,0.8523,0.9244,0.8407,0.8913,0.8649,0.7023,0.7043,0.003


In [53]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8381,0.9095,0.8305,0.875,0.8522,0.6735,0.6746
1,0.8667,0.9395,0.8305,0.9245,0.875,0.733,0.7379
2,0.8619,0.9367,0.8559,0.8938,0.8745,0.7212,0.722
3,0.881,0.9592,0.8644,0.9189,0.8908,0.7602,0.7619
4,0.8381,0.9256,0.8729,0.8443,0.8583,0.6696,0.6701
5,0.8381,0.9054,0.822,0.8818,0.8509,0.6743,0.6763
6,0.8571,0.9343,0.8475,0.8929,0.8696,0.7119,0.7131
7,0.8381,0.9083,0.8559,0.8559,0.8559,0.6711,0.6711
8,0.8238,0.9158,0.8559,0.8347,0.8452,0.6408,0.6411
9,0.8852,0.9451,0.8814,0.9123,0.8966,0.7676,0.7682


In [45]:
blender_top5 = blend_models(estimator_list=tuned_top5, method='soft', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.919,0.9825,0.9068,0.9469,0.9264,0.8366,0.8375
1,0.919,0.9727,0.9153,0.9391,0.927,0.8362,0.8365
2,0.9095,0.9773,0.9407,0.9024,0.9212,0.8151,0.8161
3,0.9286,0.9856,0.9492,0.9256,0.9372,0.8544,0.8548
4,0.9333,0.9792,0.9492,0.9333,0.9412,0.8643,0.8644
5,0.8952,0.9767,0.8898,0.9211,0.9052,0.7882,0.7888
6,0.8905,0.9669,0.8983,0.906,0.9021,0.7778,0.7778
7,0.9095,0.9753,0.9068,0.9304,0.9185,0.8169,0.8172
8,0.8857,0.9675,0.9237,0.879,0.9008,0.7662,0.7675
9,0.9187,0.9722,0.9237,0.9316,0.9277,0.8348,0.8348


In [46]:
stack_top5 = stack_models(estimator_list=tuned_top5, method='auto', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9333,0.9834,0.9153,0.9643,0.9391,0.8656,0.867
1,0.9238,0.9783,0.9322,0.9322,0.9322,0.8452,0.8452
2,0.919,0.9785,0.9407,0.9174,0.9289,0.835,0.8353
3,0.919,0.9852,0.9492,0.9106,0.9295,0.8346,0.8356
4,0.9333,0.9749,0.9407,0.9407,0.9407,0.8646,0.8646
5,0.9,0.9748,0.9068,0.9145,0.9106,0.7971,0.7972
6,0.9048,0.9692,0.9153,0.9153,0.9153,0.8066,0.8066
7,0.9381,0.9638,0.9322,0.9565,0.9442,0.8747,0.8751
8,0.9048,0.9744,0.9407,0.8952,0.9174,0.8052,0.8065
9,0.9282,0.9818,0.9322,0.9402,0.9362,0.8542,0.8542


In [49]:
stack_top7 = stack_models(estimator_list=tuned_top5, method='auto', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9429,0.9843,0.9237,0.9732,0.9478,0.8848,0.8862
1,0.9238,0.9769,0.9322,0.9322,0.9322,0.8452,0.8452
2,0.9238,0.9796,0.9407,0.925,0.9328,0.8449,0.845
3,0.9286,0.9858,0.9492,0.9256,0.9372,0.8544,0.8548
4,0.9429,0.9746,0.9492,0.9492,0.9492,0.8839,0.8839
5,0.9095,0.9733,0.9237,0.916,0.9198,0.816,0.816
6,0.919,0.97,0.9322,0.9244,0.9283,0.8354,0.8354
7,0.9381,0.969,0.9237,0.9646,0.9437,0.875,0.876
8,0.919,0.9772,0.9492,0.9106,0.9295,0.8346,0.8356
9,0.9426,0.9828,0.9492,0.9492,0.9492,0.8832,0.8832


In [54]:
stack_top10 = stack_models(estimator_list=tuned_top5, method='auto', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9333,0.9843,0.9153,0.9643,0.9391,0.8656,0.867
1,0.9333,0.9772,0.9322,0.9483,0.9402,0.8649,0.8651
2,0.9286,0.9779,0.9492,0.9256,0.9372,0.8544,0.8548
3,0.9286,0.9858,0.9492,0.9256,0.9372,0.8544,0.8548
4,0.9381,0.9754,0.9492,0.9412,0.9451,0.8741,0.8742
5,0.9048,0.9734,0.9153,0.9153,0.9153,0.8066,0.8066
6,0.9143,0.9704,0.9237,0.9237,0.9237,0.8259,0.8259
7,0.9381,0.9686,0.9237,0.9646,0.9437,0.875,0.876
8,0.9143,0.976,0.9407,0.9098,0.925,0.8251,0.8257
9,0.9474,0.9827,0.9576,0.9496,0.9536,0.8928,0.8929


In [56]:
final_model = finalize_model(stack_top7)

In [57]:
prediction = predict_model(final_model, data=test)

In [32]:
prediction

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Label,Score
0,1,Female,Loyal Customer,61,Personal Travel,Eco,2037,1,1,1,...,5,5,5,3,5,3,51,58.0,1,0.9675
1,2,Female,disloyal Customer,27,Business travel,Business,1846,1,1,1,...,3,4,5,4,4,1,0,0.0,0,0.9716
2,3,Female,Loyal Customer,52,Business travel,Business,1622,4,4,4,...,5,5,5,4,5,3,0,0.0,1,0.9189
3,4,Male,Loyal Customer,54,Business travel,Business,3534,4,4,4,...,2,2,2,5,2,1,0,0.0,1,0.8625
4,5,Female,Loyal Customer,41,Business travel,Eco,1471,4,3,3,...,3,1,2,5,4,4,0,0.0,1,0.8965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Female,Loyal Customer,62,Business travel,Eco Plus,54,3,1,1,...,3,3,3,3,3,1,0,0.0,0,0.8669
1996,1997,Female,Loyal Customer,33,Business travel,Business,1937,4,4,4,...,5,5,5,1,5,3,95,99.0,1,0.9891
1997,1998,Male,Loyal Customer,29,Business travel,Eco,2398,3,5,4,...,1,2,3,4,2,3,0,3.0,0,0.9747
1998,1999,Female,Loyal Customer,51,Personal Travel,Business,406,5,2,5,...,2,5,2,2,2,2,87,83.0,1,0.9494


In [33]:
pd.DataFrame(prediction['Label'])

Unnamed: 0,Label
0,1
1,0
2,1
3,1
4,1
...,...
1995,0
1996,1
1997,0
1998,1


In [58]:
# pd.DataFrame(predict_test)

# pd.DataFrame(sample['id'])

output=pd.concat([pd.DataFrame(sample['id']), pd.DataFrame(prediction['Label'])],  axis = 1)

output.rename(columns = {'Label' : 'target'}, inplace = True)
# 컬럼 이름 변경

output.to_csv('data/output_last.csv', index=False)

In [48]:
category_col = train.columns.difference(['id', 'Age','Flight Distrance','Departure Delay in Minutes', 'Arrival Delay in Minutes'])

for i in train.columns.difference(['id', 'Age','target','Flight Distrance','Departure Delay in Minutes', 'Arrival Delay in Minutes']):
    train['{}'.format(i)] = train['{}'.format(i)].astype('category')

for i in test.columns.difference(['id', 'Age','target','Flight Distrance','Departure Delay in Minutes', 'Arrival Delay in Minutes']):
    test['{}'.format(i)] = test['{}'.format(i)].astype('category')

In [50]:
train.info()
#타입 변형 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   id                                 3000 non-null   int64   
 1   Gender                             3000 non-null   category
 2   Customer Type                      3000 non-null   category
 3   Age                                3000 non-null   int64   
 4   Type of Travel                     3000 non-null   category
 5   Class                              3000 non-null   category
 6   Flight Distance                    3000 non-null   category
 7   Seat comfort                       3000 non-null   category
 8   Departure/Arrival time convenient  3000 non-null   category
 9   Food and drink                     3000 non-null   category
 10  Gate location                      3000 non-null   category
 11  Inflight wifi service              3000 non

In [125]:
# train, valid 데이터 전처리

In [51]:
x=train[train.columns.difference(['id','target'])]

y = train['target']

x_test=test[test.columns.difference(['id','target'])]

x_train, x_valid, y_train, y_valid = train_test_split(x,y, test_size = 0.2, random_state = 42)

x_train,  y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [105]:
pd.DataFrame(predict_test)

pd.DataFrame(sample['id'])

output=pd.concat([pd.DataFrame(sample['id']), pd.DataFrame(predict_test)],  axis = 1)

output.rename(columns = {0 : 'target'}, inplace = True)
# 컬럼 이름 변경

output.to_csv('data/output.csv', index=False)