# Pycaret with cleaned data

## Virtual environment (to prevent version conflict)

In [1]:
!pip install pycaret



## Data

In [2]:
import pandas as pd

In [3]:
train_df=pd.read_csv("../data/cleaned_train.csv")
test_df=pd.read_csv("../data/cleaned_test.csv")

train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit,CODE
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0,F-13899202500.0Commercial associate
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,0,0,1,Laborers,3.0,5,1.0,F-11380247500.0Commercial associate
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,19087,4434,0,1,0,Managers,2.0,22,2.0,M-19087450000.0Working
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0,F-15088202500.0Commercial associate
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0,F-15037157500.0State servant


In [4]:
train_df.isnull().sum()

Unnamed: 0       0
gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
days_birth       0
days_employed    0
work_phone       0
home_phone       0
email            0
occup_type       0
family_size      0
begin_month      0
credit           0
CODE             0
dtype: int64

In [5]:
train_df.drop(['Unnamed: 0'], axis=1, inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24817 entries, 0 to 24816
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         24817 non-null  object 
 1   car            24817 non-null  object 
 2   reality        24817 non-null  object 
 3   child_num      24817 non-null  int64  
 4   income_total   24817 non-null  float64
 5   income_type    24817 non-null  object 
 6   edu_type       24817 non-null  object 
 7   family_type    24817 non-null  object 
 8   house_type     24817 non-null  object 
 9   days_birth     24817 non-null  int64  
 10  days_employed  24817 non-null  int64  
 11  work_phone     24817 non-null  int64  
 12  home_phone     24817 non-null  int64  
 13  email          24817 non-null  int64  
 14  occup_type     24817 non-null  object 
 15  family_size    24817 non-null  float64
 16  begin_month    24817 non-null  int64  
 17  credit         24817 non-null  float64
 18  CODE  

In [6]:
def to_category(df, colnames):
    for colname in colnames:
        df[colname] = df[colname].astype('category')

In [7]:
colnames = ['gender','car','reality','income_type','edu_type','family_type','house_type','occup_type','work_phone','home_phone','email','CODE']

to_category(train_df, colnames)

In [8]:
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24817 entries, 0 to 24816
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   gender         24817 non-null  category
 1   car            24817 non-null  category
 2   reality        24817 non-null  category
 3   child_num      24817 non-null  int64   
 4   income_total   24817 non-null  float64 
 5   income_type    24817 non-null  category
 6   edu_type       24817 non-null  category
 7   family_type    24817 non-null  category
 8   house_type     24817 non-null  category
 9   days_birth     24817 non-null  int64   
 10  days_employed  24817 non-null  int64   
 11  work_phone     24817 non-null  category
 12  home_phone     24817 non-null  category
 13  email          24817 non-null  category
 14  occup_type     24817 non-null  category
 15  family_size    24817 non-null  float64 
 16  begin_month    24817 non-null  int64   
 17  credit         24817 non-null  

## Numerical Data: Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

scaling_cols = ['child_num','income_total','days_birth','days_employed','family_size','begin_month']

std_scaler = StandardScaler()
std_scaler.fit(train_df[scaling_cols])

train_scaled = std_scaler.transform(train_df[scaling_cols])
test_scaled = std_scaler.transform(test_df[scaling_cols]) # Apply to the test set using the scaler that was fitted with train data!!

train_scaled = pd.DataFrame(train_scaled, columns=scaling_cols)
test_scaled = pd.DataFrame(test_scaled, columns=scaling_cols)

train_scaled.shape, test_scaled.shape

((24817, 6), (10000, 6))

## Categorical Data: One-hot Encoding

## Merge scaled numerical data with encoded categorical data

In [10]:
Train = pd.concat([train_scaled,train_df[colnames]],axis=1) # 원래 train_encoded
Train = pd.concat([Train,train_df['credit']],axis=1)

Test = pd.concat([test_scaled,test_df[colnames]],axis=1)

Train.shape, Test.shape

((24817, 19), (10000, 18))

In [11]:
Train.columns

Index(['child_num', 'income_total', 'days_birth', 'days_employed',
       'family_size', 'begin_month', 'gender', 'car', 'reality', 'income_type',
       'edu_type', 'family_type', 'house_type', 'occup_type', 'work_phone',
       'home_phone', 'email', 'CODE', 'credit'],
      dtype='object')

## Create validation data

In [12]:
x_data = Train.drop(['credit'], axis=1)
y_data = Train['credit']

In [13]:
#from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=777)

## Pycaret

In [14]:
#!pip uninstall scikit-learn
#!pip install scikit-learn==0.23.2
#!pip uninstall numpy
#!pip install numpy==1.20.0


In [15]:
#!pip install scikit-learn==0.23.2

#sklearn.__version__

In [16]:
#!pip uninstall sklearn
#!pip install scikit-learn==0.23.2
#!pip install imbalanced-learn==0.7.0

In [17]:
#from pycaret.utils import enable_colab
#enable_colab()
#!pip install pycaret

In [18]:
from pycaret.utils import enable_colab
from pycaret.classification import *

# setup
pycaret_model = setup(Train, target = 'credit', train_size = 0.8, fold = 5, fold_shuffle=True)

Unnamed: 0,Description,Value
0,session_id,6801
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(24817, 19)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,12
8,Ordinal Features,False
9,High Cardinality Features,False


In [19]:
# Add metric
from sklearn.metrics import log_loss
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target="pred_proba")

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function                   <function log_loss at 0x7fbecd7bac10>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [20]:
# Performance Comparison
best5 = compare_models(fold = 5, n_select = 5, sort = 'LogLoss')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
svm,SVM - Linear Kernel,0.6726,0.0,0.4495,0.6501,0.6196,0.2251,0.2651,0.0,10.318
ridge,Ridge Classifier,0.6706,0.0,0.4979,0.6442,0.644,0.2836,0.2971,0.0,5.106
lr,Logistic Regression,0.6736,0.7194,0.4404,0.6516,0.6171,0.2213,0.2626,0.7789,63.672
lightgbm,Light Gradient Boosting Machine,0.6934,0.6872,0.4259,0.6871,0.617,0.2327,0.3147,0.7801,1.77
gbc,Gradient Boosting Classifier,0.6905,0.6478,0.4128,0.7194,0.6048,0.2106,0.3083,0.7995,63.522
dummy,Dummy Classifier,0.6378,0.5,0.3333,0.4068,0.4967,0.0,0.0,0.8869,0.612
ada,Ada Boost Classifier,0.6909,0.6188,0.4155,0.7027,0.6066,0.2116,0.3114,1.0766,6.7
rf,Random Forest Classifier,0.6736,0.7263,0.516,0.6512,0.658,0.3259,0.3305,1.2213,18.062
et,Extra Trees Classifier,0.6623,0.6736,0.4983,0.6367,0.6434,0.2927,0.2989,3.0123,31.184
knn,K Neighbors Classifier,0.6145,0.651,0.4518,0.5873,0.5969,0.2028,0.206,3.8301,125.838


## Parameter Tuning

In [21]:
lgbm = create_model('lightgbm')
lgbm_tuned = tune_model(lgbm, optimize = 'LogLoss')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.6968,0.685,0.4191,0.7446,0.6129,0.2315,0.329,0.7824
1,0.687,0.6763,0.4054,0.7449,0.5971,0.1946,0.2989,0.7948
2,0.692,0.6749,0.4124,0.707,0.6058,0.213,0.3138,0.791
3,0.6849,0.6583,0.4042,0.6843,0.5966,0.1966,0.2858,0.8021
4,0.6866,0.6779,0.4053,0.7367,0.5974,0.1966,0.2952,0.7952
Mean,0.6895,0.6745,0.4093,0.7235,0.602,0.2065,0.3045,0.7931
Std,0.0044,0.0088,0.0057,0.024,0.0064,0.0142,0.0152,0.0065


In [22]:
lgbm_tuned

LGBMClassifier(bagging_fraction=0.5, bagging_freq=0, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=41, min_child_weight=0.001, min_split_gain=0.6,
               n_estimators=120, n_jobs=-1, num_leaves=60, objective=None,
               random_state=6801, reg_alpha=0.0001, reg_lambda=0.0001,
               silent='warn', subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

