# Pycaret with cleaned data

## Virtual environment (to prevent version conflict)

In [1]:
!pip install pycaret



## Data

In [2]:
import pandas as pd

In [3]:
train_df=pd.read_csv("../data/cleaned_train.csv")
test_df=pd.read_csv("../data/cleaned_test.csv")

train_df.head()

Unnamed: 0.1,Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0
1,1,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0
2,2,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0
3,3,F,N,Y,2,270000.0,Working,Secondary / secondary special,Married,House / apartment,13413,4996,0,0,1,High skill tech staff,4.0,18,1.0
4,4,F,N,N,0,315000.0,Working,Secondary / secondary special,Separated,House / apartment,17570,1978,0,0,1,Core staff,1.0,41,2.0


In [4]:
train_df.isnull().sum()

Unnamed: 0       0
gender           0
car              0
reality          0
child_num        0
income_total     0
income_type      0
edu_type         0
family_type      0
house_type       0
days_birth       0
days_employed    0
work_phone       0
home_phone       0
email            0
occup_type       0
family_size      0
begin_month      0
credit           0
dtype: int64

In [5]:
train_df.drop(['Unnamed: 0'], axis=1, inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23392 entries, 0 to 23391
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         23392 non-null  object 
 1   car            23392 non-null  object 
 2   reality        23392 non-null  object 
 3   child_num      23392 non-null  int64  
 4   income_total   23392 non-null  float64
 5   income_type    23392 non-null  object 
 6   edu_type       23392 non-null  object 
 7   family_type    23392 non-null  object 
 8   house_type     23392 non-null  object 
 9   days_birth     23392 non-null  int64  
 10  days_employed  23392 non-null  int64  
 11  work_phone     23392 non-null  int64  
 12  home_phone     23392 non-null  int64  
 13  email          23392 non-null  int64  
 14  occup_type     23392 non-null  object 
 15  family_size    23392 non-null  float64
 16  begin_month    23392 non-null  int64  
 17  credit         23392 non-null  float64
dtypes: flo

In [6]:
def to_category(df, colnames):
    for colname in colnames:
        df[colname] = df[colname].astype('category')

In [7]:
colnames = ['gender','car','reality','income_type','edu_type','family_type','house_type','occup_type','work_phone','home_phone','email','CODE']

to_category(train_df, colnames)

KeyError: 'CODE'

In [None]:
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()
train_df.info()

## Numerical Data: Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaling_cols = ['child_num','income_total','days_birth','days_employed','family_size','begin_month']

std_scaler = StandardScaler()
std_scaler.fit(train_df[scaling_cols])

train_scaled = std_scaler.transform(train_df[scaling_cols])
test_scaled = std_scaler.transform(test_df[scaling_cols]) # Apply to the test set using the scaler that was fitted with train data!!

train_scaled = pd.DataFrame(train_scaled, columns=scaling_cols)
test_scaled = pd.DataFrame(test_scaled, columns=scaling_cols)

train_scaled.shape, test_scaled.shape

## Categorical Data: One-hot Encoding

## Merge scaled numerical data with encoded categorical data

In [None]:
Train = pd.concat([train_scaled,train_df[colnames]],axis=1) # 원래 train_encoded
Train = pd.concat([Train,train_df['credit']],axis=1)

Test = pd.concat([test_scaled,test_df[colnames]],axis=1)

Train.shape, Test.shape

In [None]:
Train.columns

## Create validation data

In [None]:
x_data = Train.drop(['credit'], axis=1)
y_data = Train['credit']

In [None]:
#from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=777)

## Pycaret

In [None]:
#!pip uninstall scikit-learn
#!pip install scikit-learn==0.23.2
#!pip uninstall numpy
#!pip install numpy==1.20.0


In [None]:
#!pip install scikit-learn==0.23.2

#sklearn.__version__

In [None]:
#!pip uninstall sklearn
#!pip install scikit-learn==0.23.2
#!pip install imbalanced-learn==0.7.0

In [None]:
#from pycaret.utils import enable_colab
#enable_colab()
#!pip install pycaret

In [None]:
from pycaret.utils import enable_colab
from pycaret.classification import *

# setup
pycaret_model = setup(Train, target = 'credit', train_size = 0.8, fold = 5, fold_shuffle=True)

In [None]:
# Add metric
from sklearn.metrics import log_loss
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target="pred_proba")

In [None]:
# Performance Comparison
best5 = compare_models(fold = 5, n_select = 5, sort = 'LogLoss')

## Parameter Tuning

In [None]:
lgbm = create_model('lightgbm')
lgbm_tuned = tune_model(lgbm, optimize = 'LogLoss')

In [None]:
lgbm_tuned

In [None]:
lgbm_tuned.predict(Test)