In [1]:
import pandas as pd
import numpy as np

In [1]:
from sklearn.preprocessing import OneHotEncoder

from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
from pycaret.classification import *

In [6]:
train = pd.read_csv("data/train.csv")

test = pd.read_csv("data/test.csv")

# sample = pd.read_csv("data/sample_submission.csv")


In [19]:
train.columns

Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
       'Food and drink', 'Gate location', 'Inflight wifi service',
       'Inflight entertainment', 'Online support', 'Ease of Online booking',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Cleanliness', 'Online boarding',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'target'],
      dtype='object')

In [25]:
setup_clf = setup(data = train, target='target', n_jobs = -1)

Unnamed: 0,Description,Value
0,session_id,6019
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(3000, 24)"
5,Missing Values,0
6,Numeric Features,5
7,Categorical Features,18
8,Ordinal Features,0
9,High Cardinality Features,0


In [27]:
top5 = compare_models(sort='Accuracy', n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9276,0.9835,0.9298,0.9384,0.9339,0.8538,0.8541,0.018
et,Extra Trees Classifier,0.9204,0.978,0.9272,0.9288,0.9278,0.8392,0.8396,0.06
gbc,Gradient Boosting Classifier,0.9152,0.9768,0.9194,0.926,0.9226,0.8288,0.829,0.1
rf,Random Forest Classifier,0.9147,0.9747,0.9219,0.9237,0.9225,0.8276,0.8283,0.05
lr,Logistic Regression,0.8971,0.9618,0.9064,0.9068,0.9064,0.7921,0.7925,0.118
ada,Ada Boost Classifier,0.8961,0.9578,0.9029,0.9077,0.9051,0.7903,0.7907,0.033
ridge,Ridge Classifier,0.8895,0.0,0.8969,0.9016,0.8991,0.7769,0.7772,0.004
lda,Linear Discriminant Analysis,0.889,0.9606,0.8969,0.9009,0.8987,0.7759,0.7762,0.008
dt,Decision Tree Classifier,0.8718,0.8697,0.8917,0.8786,0.8847,0.7405,0.7415,0.005
nb,Naive Bayes,0.8542,0.9261,0.8301,0.8973,0.8622,0.7079,0.7106,0.004


In [28]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8857,0.9753,0.8957,0.8957,0.8957,0.7693,0.7693
1,0.919,0.9813,0.9304,0.9224,0.9264,0.8365,0.8365
2,0.9048,0.9685,0.9304,0.8992,0.9145,0.8071,0.8077
3,0.9571,0.9815,0.9565,0.9649,0.9607,0.9136,0.9136
4,0.9286,0.9876,0.9043,0.963,0.9327,0.8568,0.8587
5,0.919,0.9786,0.9224,0.9304,0.9264,0.8365,0.8365
6,0.9524,0.9847,0.9397,0.9732,0.9561,0.9041,0.9048
7,0.9143,0.9747,0.9138,0.9298,0.9217,0.827,0.8272
8,0.919,0.9841,0.931,0.9231,0.927,0.8361,0.8362
9,0.9091,0.9699,0.8957,0.9364,0.9156,0.8172,0.8182


In [29]:
blender_top5 = blend_models(estimator_list=tuned_top5, method='soft', choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9095,0.9746,0.913,0.9211,0.917,0.8176,0.8176
1,0.919,0.982,0.9304,0.9224,0.9264,0.8365,0.8365
2,0.9048,0.972,0.9304,0.8992,0.9145,0.8071,0.8077
3,0.9476,0.9845,0.9565,0.9483,0.9524,0.8942,0.8942
4,0.9476,0.9912,0.9217,0.9815,0.9507,0.895,0.897
5,0.9143,0.9808,0.931,0.9153,0.9231,0.8263,0.8265
6,0.9476,0.9842,0.9397,0.9646,0.952,0.8944,0.8948
7,0.9333,0.9777,0.931,0.9474,0.9391,0.8655,0.8656
8,0.9333,0.981,0.9483,0.9322,0.9402,0.8649,0.8651
9,0.8947,0.971,0.8957,0.9115,0.9035,0.7877,0.7879


In [30]:
final_model = finalize_model(blender_top5)

In [31]:
prediction = predict_model(final_model, data=test)

In [32]:
prediction

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Label,Score
0,1,Female,Loyal Customer,61,Personal Travel,Eco,2037,1,1,1,...,5,5,5,3,5,3,51,58.0,1,0.9675
1,2,Female,disloyal Customer,27,Business travel,Business,1846,1,1,1,...,3,4,5,4,4,1,0,0.0,0,0.9716
2,3,Female,Loyal Customer,52,Business travel,Business,1622,4,4,4,...,5,5,5,4,5,3,0,0.0,1,0.9189
3,4,Male,Loyal Customer,54,Business travel,Business,3534,4,4,4,...,2,2,2,5,2,1,0,0.0,1,0.8625
4,5,Female,Loyal Customer,41,Business travel,Eco,1471,4,3,3,...,3,1,2,5,4,4,0,0.0,1,0.8965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,Female,Loyal Customer,62,Business travel,Eco Plus,54,3,1,1,...,3,3,3,3,3,1,0,0.0,0,0.8669
1996,1997,Female,Loyal Customer,33,Business travel,Business,1937,4,4,4,...,5,5,5,1,5,3,95,99.0,1,0.9891
1997,1998,Male,Loyal Customer,29,Business travel,Eco,2398,3,5,4,...,1,2,3,4,2,3,0,3.0,0,0.9747
1998,1999,Female,Loyal Customer,51,Personal Travel,Business,406,5,2,5,...,2,5,2,2,2,2,87,83.0,1,0.9494


In [33]:
pd.DataFrame(prediction['Label'])

Unnamed: 0,Label
0,1
1,0
2,1
3,1
4,1
...,...
1995,0
1996,1
1997,0
1998,1


In [34]:
# pd.DataFrame(predict_test)

# pd.DataFrame(sample['id'])

output=pd.concat([pd.DataFrame(sample['id']), pd.DataFrame(prediction['Label'])],  axis = 1)

output.rename(columns = {'Label' : 'target'}, inplace = True)
# 컬럼 이름 변경

output.to_csv('data/output1.csv', index=False)

In [48]:
category_col = train.columns.difference(['id', 'Age','Flight Distrance','Departure Delay in Minutes', 'Arrival Delay in Minutes'])

for i in train.columns.difference(['id', 'Age','target','Flight Distrance','Departure Delay in Minutes', 'Arrival Delay in Minutes']):
    train['{}'.format(i)] = train['{}'.format(i)].astype('category')

for i in test.columns.difference(['id', 'Age','target','Flight Distrance','Departure Delay in Minutes', 'Arrival Delay in Minutes']):
    test['{}'.format(i)] = test['{}'.format(i)].astype('category')

In [50]:
train.info()
#타입 변형 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   id                                 3000 non-null   int64   
 1   Gender                             3000 non-null   category
 2   Customer Type                      3000 non-null   category
 3   Age                                3000 non-null   int64   
 4   Type of Travel                     3000 non-null   category
 5   Class                              3000 non-null   category
 6   Flight Distance                    3000 non-null   category
 7   Seat comfort                       3000 non-null   category
 8   Departure/Arrival time convenient  3000 non-null   category
 9   Food and drink                     3000 non-null   category
 10  Gate location                      3000 non-null   category
 11  Inflight wifi service              3000 non

In [125]:
# train, valid 데이터 전처리

In [51]:
x=train[train.columns.difference(['id','target'])]

y = train['target']

x_test=test[test.columns.difference(['id','target'])]

x_train, x_valid, y_train, y_valid = train_test_split(x,y, test_size = 0.2, random_state = 42)

x_train,  y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)

In [105]:
pd.DataFrame(predict_test)

pd.DataFrame(sample['id'])

output=pd.concat([pd.DataFrame(sample['id']), pd.DataFrame(predict_test)],  axis = 1)

output.rename(columns = {0 : 'target'}, inplace = True)
# 컬럼 이름 변경

output.to_csv('data/output.csv', index=False)