# Pycaret with cleaned data

## Virtual environment (to prevent version conflict)

In [1]:
!pip install pycaret



## Data

In [2]:
import pandas as pd

In [3]:
train_df=pd.read_csv("../data/cleaned_train.csv")
test_df=pd.read_csv("../data/cleaned_test.csv")

train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0
1,1,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0
2,2,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0
3,3,F,N,Y,2,270000.0,Working,Secondary / secondary special,Married,House / apartment,13413,4996,0,0,1,High skill tech staff,4.0,18,1.0
4,4,F,N,N,0,315000.0,Working,Secondary / secondary special,Separated,House / apartment,17570,1978,0,0,1,Core staff,1.0,41,2.0


In [4]:
train_df.isnull().sum()

Unnamed: 0       0
gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
days_birth       0
days_employed    0
work_phone       0
home_phone       0
email            0
occup_type       0
family_size      0
begin_month      0
credit           0
dtype: int64

In [5]:
train_df.drop(['Unnamed: 0'], axis=1, inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23392 entries, 0 to 23391
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         23392 non-null  object 
 1   car            23392 non-null  object 
 2   reality        23392 non-null  object 
 3   child_num      23392 non-null  int64  
 4   income_total   23392 non-null  float64
 5   income_type    23392 non-null  object 
 6   edu_type       23392 non-null  object 
 7   family_type    23392 non-null  object 
 8   house_type     23392 non-null  object 
 9   days_birth     23392 non-null  int64  
 10  days_employed  23392 non-null  int64  
 11  work_phone     23392 non-null  int64  
 12  home_phone     23392 non-null  int64  
 13  email          23392 non-null  int64  
 14  occup_type     23392 non-null  object 
 15  family_size    23392 non-null  float64
 16  begin_month    23392 non-null  int64  
 17  credit         23392 non-null  float64
dtypes: flo

In [6]:
train_df['credit'] = train_df['credit'].astype('category')

In [7]:
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23392 entries, 0 to 23391
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   gender         23392 non-null  object  
 1   car            23392 non-null  object  
 2   reality        23392 non-null  object  
 3   child_num      23392 non-null  int64   
 4   income_total   23392 non-null  float64 
 5   income_type    23392 non-null  object  
 6   edu_type       23392 non-null  object  
 7   family_type    23392 non-null  object  
 8   house_type     23392 non-null  object  
 9   days_birth     23392 non-null  int64   
 10  days_employed  23392 non-null  int64   
 11  work_phone     23392 non-null  int64   
 12  home_phone     23392 non-null  int64   
 13  email          23392 non-null  int64   
 14  occup_type     23392 non-null  object  
 15  family_size    23392 non-null  float64 
 16  begin_month    23392 non-null  int64   
 17  credit         23392 non-null  

## Numerical Data: Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

scaling_cols = ['child_num','income_total','days_birth','days_employed','family_size','begin_month']

std_scaler = StandardScaler()
std_scaler.fit(train_df[scaling_cols])

train_scaled = std_scaler.transform(train_df[scaling_cols])
test_scaled = std_scaler.transform(test_df[scaling_cols]) # Apply to the test set using the scaler that was fitted with train data!!

train_scaled = pd.DataFrame(train_scaled, columns=scaling_cols)
test_scaled = pd.DataFrame(test_scaled, columns=scaling_cols)

train_scaled.shape, test_scaled.shape

((23392, 6), (10000, 6))

## Categorical Data: One-hot Encoding

In [9]:
onehot_cols = ['gender','car','reality','income_type','edu_type','family_type','house_type','occup_type','work_phone','home_phone','email']
data = pd.concat([train_df[onehot_cols], test_df[onehot_cols]]) # One-hot encoding using the combination of train and test data

In [10]:
data = pd.get_dummies(data)
data.head()

Unnamed: 0,work_phone,home_phone,email,gender_F,gender_M,car_N,car_Y,reality_N,reality_Y,income_type_Commercial associate,...,occup_type_Low-skill Laborers,occup_type_Managers,occup_type_Medicine staff,occup_type_No job,occup_type_Private service staff,occup_type_Realty agents,occup_type_Sales staff,occup_type_Secretaries,occup_type_Security staff,occup_type_Waiters/barmen staff
0,0,0,0,1,0,1,0,1,0,1,...,0,0,0,1,0,0,0,0,0,0
1,0,1,0,1,0,1,0,0,1,1,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# data separation
data = data.reset_index(drop=True)
train_encoded = data.loc[:23391]
test_encoded = data.loc[23392:]
test_encoded = test_encoded.reset_index(drop=True)
train_encoded.shape, test_encoded.shape

((23392, 49), (10000, 49))

## Merge scaled numerical data with encoded categorical data

In [12]:
Train = pd.concat([train_scaled,train_encoded],axis=1)
Train = pd.concat([Train,train_df['credit']],axis=1)

Test = pd.concat([test_scaled,test_encoded],axis=1)

Train.shape, Test.shape

((23392, 56), (10000, 55))

In [13]:
Train.columns

Index(['child_num', 'income_total', 'days_birth', 'days_employed',
       'family_size', 'begin_month', 'work_phone', 'home_phone', 'email',
       'gender_F', 'gender_M', 'car_N', 'car_Y', 'reality_N', 'reality_Y',
       'income_type_Commercial associate', 'income_type_Pensioner',
       'income_type_State servant', 'income_type_Student',
       'income_type_Working', 'edu_type_Academic degree',
       'edu_type_Higher education', 'edu_type_Incomplete higher',
       'edu_type_Lower secondary', 'edu_type_Secondary / secondary special',
       'family_type_Civil marriage', 'family_type_Married',
       'family_type_Separated', 'family_type_Single / not married',
       'family_type_Widow', 'house_type_Co-op apartment',
       'house_type_House / apartment', 'house_type_Municipal apartment',
       'house_type_Office apartment', 'house_type_Rented apartment',
       'house_type_With parents', 'occup_type_Accountants',
       'occup_type_Cleaning staff', 'occup_type_Cooking staff',
    

## Create validation data

In [14]:
x_data = Train.drop(['credit'], axis=1)
y_data = Train['credit']

In [15]:
#from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=777)

## Pycaret

In [16]:
#!pip uninstall scikit-learn
#!pip install scikit-learn==0.23.2
#!pip uninstall numpy
#!pip install numpy==1.20.0


In [17]:
#!pip install scikit-learn==0.23.2

#sklearn.__version__

In [18]:
#!pip uninstall sklearn
#!pip install scikit-learn==0.23.2
#!pip install imbalanced-learn==0.7.0

In [19]:
#from pycaret.utils import enable_colab
#enable_colab()
#!pip install pycaret

In [20]:
from pycaret.utils import enable_colab
from pycaret.classification import *

# setup
pycaret_model = setup(Train, target = 'credit', train_size = 0.8, fold = 5, fold_shuffle=True)

Unnamed: 0,Description,Value
0,session_id,2073
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,"0.0: 0, 1.0: 1, 2.0: 2"
4,Original Data,"(23392, 56)"
5,Missing Values,False
6,Numeric Features,52
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,False


In [21]:
# Add metric
from sklearn.metrics import log_loss
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target="pred_proba")

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function                   <function log_loss at 0x7fefbe9430d0>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [24]:
# Performance Comparison
best5 = compare_models(fold = 5, n_select = 5, sort = 'LogLoss')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
svm,SVM - Linear Kernel,0.6513,0.0,0.3333,0.4242,0.5138,-0.0,-0.0004,0.0,0.25
ridge,Ridge Classifier,0.6522,0.0,0.3362,0.5518,0.5191,0.0084,0.0422,0.0,0.036
lightgbm,Light Gradient Boosting Machine,0.7082,0.7018,0.43,0.698,0.6355,0.2492,0.3288,0.7536,0.214
gbc,Gradient Boosting Classifier,0.7016,0.6588,0.4131,0.6573,0.6204,0.2202,0.3068,0.7799,2.128
lr,Logistic Regression,0.653,0.6126,0.3379,0.5496,0.522,0.0139,0.0558,0.8477,0.662
lda,Linear Discriminant Analysis,0.6517,0.6123,0.3381,0.56,0.5228,0.0142,0.0477,0.8479,0.096
dummy,Dummy Classifier,0.6513,0.5,0.3333,0.4242,0.5138,0.0,0.0,0.8686,0.016
rf,Random Forest Classifier,0.7084,0.747,0.5269,0.6829,0.6876,0.3648,0.3743,0.8735,0.65
ada,Ada Boost Classifier,0.7014,0.6312,0.4084,0.6361,0.6162,0.2111,0.3083,1.0778,0.236
et,Extra Trees Classifier,0.6782,0.7097,0.5042,0.6527,0.6595,0.3068,0.3132,1.4008,0.61


## Parameter Tuning

In [25]:
lgbm = create_model('lightgbm')
lgbm_tuned = tune_model(lgbm, optimize = 'LogLoss')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.7075,0.6924,0.4223,0.7,0.6296,0.2403,0.3268,0.7661
1,0.7091,0.6849,0.4243,0.7296,0.6315,0.2395,0.3339,0.7637
2,0.7013,0.6915,0.4119,0.6596,0.6195,0.2198,0.3054,0.7649
3,0.7084,0.7028,0.4203,0.6744,0.6288,0.2393,0.3312,0.7572
4,0.7012,0.6869,0.4122,0.727,0.6185,0.2137,0.3073,0.768
Mean,0.7055,0.6917,0.4182,0.6981,0.6256,0.2305,0.3209,0.764
Std,0.0035,0.0062,0.0052,0.0278,0.0054,0.0114,0.0121,0.0037


In [26]:
lgbm_tuned

LGBMClassifier(bagging_fraction=0.7, bagging_freq=5, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.4,
               importance_type='split', learning_rate=0.05, max_depth=-1,
               min_child_samples=46, min_child_weight=0.001, min_split_gain=0.2,
               n_estimators=110, n_jobs=-1, num_leaves=80, objective=None,
               random_state=2073, reg_alpha=0.001, reg_lambda=0.15,
               silent='warn', subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

In [27]:
lgbm_tuned.predict(Test)

ValueError: Number of features of the model must match the input. Model n_features_ is 52 and input n_features is 55