# (연구&재인) MBTI(정리) – TabularPredictor 실험

신록예찬  
2023-12-21

# 1. Imports

**라이브러리**

In [1]:
import pandas as pd
import sklearn.model_selection
import sklearn.metrics
#from autogluon.multimodal import MultiModalPredictor
from autogluon.tabular import TabularPredictor
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm

**함수들**

In [2]:
def report(predictor, df_test):
    """Build a per-model, per-label table of one-vs-rest classification metrics.

    For each model name in a fixed list, class probabilities are obtained via
    ``predictor.predict_proba(df_test, model=name)`` and the predicted label is
    the column with the highest probability.  Every MBTI label is then scored
    as a binary "this label vs. the rest" problem.

    Parameters
    ----------
    predictor
        Object exposing ``predict_proba(df, model=name)`` that returns a
        DataFrame with one probability column per label.  In this file the
        caller passes a fitted ``TabularPredictor``.
    df_test : pandas.DataFrame
        Evaluation data; must contain the ground-truth column ``'type'``.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by ``(model_name, metric)`` with ``metric`` one of
        ``'PRE'``, ``'REC'``, ``'F1'``, ``'AUC'``; columns are the 16 labels.
        Labels that never occur in ``df_test['type']`` keep the initial 0.0.
    """
    model_names = [
        'KNeighborsUnif',
        'KNeighborsDist',
        'NeuralNetFastAI',
        'LightGBMXT',
        'LightGBM',
        'RandomForestGini',
        'RandomForestEntr',
        'CatBoost',
        'ExtraTreesGini',
        'ExtraTreesEntr',
        'XGBoost',
        'NeuralNetTorch',
        'LightGBMLarge',
        'WeightedEnsemble_L2'
    ]
    labels = ['ENTP', 'ESFP', 'ISFJ', 'INTJ', 'ISFP', 'ESTP', 'INTP', 'INFJ', 'ESTJ', 'ENFP', 'ISTJ', 'ENTJ', 'INFP', 'ISTP', 'ESFJ', 'ENFJ']

    # Zero-filled result table: one (metric x label) frame per model, stacked
    # into a (model_name, metric) MultiIndex.
    df_report = pd.concat({
        model_name: pd.DataFrame({
            label: {'PRE': 0.0, 'REC': 0.0, 'F1': 0.0, 'AUC': 0.0}
            for label in labels
        })
        for model_name in model_names
    })

    y = df_test['type']
    # Build the membership set once instead of once per (model, label) pair.
    labels_present = set(y)

    for model_name in model_names:
        proba = predictor.predict_proba(df_test, model=model_name)
        yhat = proba.idxmax(axis=1)
        for label in labels:
            if label not in labels_present:
                # No positive examples for this label: keep the 0.0 defaults.
                continue
            y_true = (y == label).astype(int)
            y_pred = (yhat == label).astype(int)
            y_score = proba[label]
            # Write with a single .loc call.  The previous chained form
            # df_report[label][model_name][metric] = ... assigns into an
            # intermediate object and is silently lost when pandas
            # Copy-on-Write is active.
            df_report.loc[(model_name, 'PRE'), label] = sklearn.metrics.precision_score(y_true, y_pred)
            df_report.loc[(model_name, 'REC'), label] = sklearn.metrics.recall_score(y_true, y_pred)
            df_report.loc[(model_name, 'F1'), label] = sklearn.metrics.f1_score(y_true, y_pred)
            df_report.loc[(model_name, 'AUC'), label] = sklearn.metrics.roc_auc_score(y_true, y_score)
    return df_report

In [3]:
def fit_predict_save(path, experiments_index):
    """Fit AutoGluon models for one experiment, evaluate them, and save a CSV.

    The train/test frames are looked up in the module-level dictionaries
    ``df_trains_dct`` and ``df_tests_dct`` under ``experiments_index``.  The
    metric table produced by ``report`` is written to
    ``f"{path}{experiments_index}.csv"`` and the elapsed time is printed.

    Parameters
    ----------
    path : str
        Prefix of the output file; it is concatenated directly with
        ``experiments_index``, so a directory prefix must end with ``/``.
    experiments_index : str
        Key into ``df_trains_dct`` / ``df_tests_dct``, e.g. ``'실험2/시나리오0a'``.
        Also used as the sub-path for the AutoGluon model directory and for
        the CSV file name.

    Returns
    -------
    None
    """
    import os  # local import: only needed for the output-directory check

    t1 = time.time()
    df_train = df_trains_dct[experiments_index]
    df_test = df_tests_dct[experiments_index]

    # Make sure the destination directory exists *before* the long-running
    # fit, so a missing directory cannot discard the results at the very end.
    csv_path = f"{path}{experiments_index}.csv"
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    predictor = TabularPredictor(
        label='type',
        eval_metric='acc',
        path=f"AutogluonModels/{experiments_index}",
        verbosity=0,  # integer level; equivalent to the former verbosity=False
    )
    predictor.fit(
        df_train,
        hyperparameters={
            'NN_TORCH': {},
            'GBM': [
                {'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
                {},
                'GBMLarge',
            ],
            'CAT': {},
            'XGB': {},
            'FASTAI': {},
            'RF': [
                {'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
                {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
                {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}},
            ],
            'XT': [
                {'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
                {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
                {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}},
            ],
            'KNN': [
                {'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}},
                {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}},
            ],
        },
    )
    df_report = report(predictor, df_test)
    df_report.to_csv(csv_path)
    t2 = time.time()
    print(f"{experiments_index} -- 완료 (걸린시간 = {(t2-t1)/60:.4f} 분)")

# 2. Data

In [4]:
# Load the pre-built experiment dictionaries (experiment key -> DataFrame).
# NOTE: pickle executes arbitrary code on load; only use files produced by
# this project's own setup step.
with open('실험셋업/df_trains_dct.pickle', 'rb') as train_fh:
    df_trains_dct = pickle.load(train_fh)
with open('실험셋업/df_tests_dct.pickle', 'rb') as test_fh:
    df_tests_dct = pickle.load(test_fh)

# 3. 실험

In [5]:
# Display the experiment/scenario keys available in the loaded training dict;
# these are the values accepted as `experiments_index` by fit_predict_save.
df_trains_dct.keys()

dict_keys(['실험1/시나리오1', '실험1/시나리오2', '실험2/시나리오0a', '실험2/시나리오0b', '실험2/시나리오0c', '실험2/시나리오1a', '실험2/시나리오1b', '실험2/시나리오1c', '실험2/시나리오2a', '실험2/시나리오2b', '실험2/시나리오2c', '실험2/시나리오3a', '실험2/시나리오3b', '실험2/시나리오3c', '실험2/시나리오4a', '실험2/시나리오4b', '실험2/시나리오4c', '실험3/시나리오0a', '실험3/시나리오0b', '실험3/시나리오1a', '실험3/시나리오1b', '실험3/시나리오2a', '실험3/시나리오2b', '실험3/시나리오3a', '실험3/시나리오3b', '실험3/시나리오4a', '실험3/시나리오4b', '실험3/시나리오5a', '실험3/시나리오5b', '실험3/시나리오6a', '실험3/시나리오6b', '실험3/시나리오7a', '실험3/시나리오7b'])

## A. 실험1

In [13]:
# ![ -d "실험결과/실험1" ] || mkdir "실험결과/실험1"
# fit_predict_save(path='실험결과/',experiments_index='실험1/시나리오1')
# fit_predict_save(path='실험결과/',experiments_index='실험1/시나리오2')

## B. 실험2

In [None]:
![ -d "실험셋업/실험2" ] || mkdir "실험셋업/실험2"
fit_predict_save('실험결과/','실험2/시나리오0a')
fit_predict_save('실험결과/','실험2/시나리오1a')
fit_predict_save('실험결과/','실험2/시나리오1b')
fit_predict_save('실험결과/','실험2/시나리오1c')
fit_predict_save('실험결과/','실험2/시나리오2a')
fit_predict_save('실험결과/','실험2/시나리오2b')
fit_predict_save('실험결과/','실험2/시나리오2c')
fit_predict_save('실험결과/','실험2/시나리오3a')
fit_predict_save('실험결과/','실험2/시나리오3b')
fit_predict_save('실험결과/','실험2/시나리오3c')
fit_predict_save('실험결과/','실험2/시나리오4a')
fit_predict_save('실험결과/','실험2/시나리오4b')
fit_predict_save('실험결과/','실험2/시나리오4c')

실험2/시나리오0a -- 완료 (걸린시간 = 14.1333 분)
실험2/시나리오1a -- 완료 (걸린시간 = 16.1246 분)
실험2/시나리오1b -- 완료 (걸린시간 = 15.6699 분)
실험2/시나리오1c -- 완료 (걸린시간 = 15.3118 분)
실험2/시나리오2a -- 완료 (걸린시간 = 15.6923 분)
실험2/시나리오2b -- 완료 (걸린시간 = 14.2966 분)
실험2/시나리오2c -- 완료 (걸린시간 = 15.3765 분)
실험2/시나리오3a -- 완료 (걸린시간 = 17.0679 분)
실험2/시나리오3b -- 완료 (걸린시간 = 15.2641 분)
실험2/시나리오3c -- 완료 (걸린시간 = 13.3913 분)
실험2/시나리오4a -- 완료 (걸린시간 = 15.5517 분)

## C. 실험3

In [5]:
![ -d "실험결과/실험3" ] || mkdir "실험결과/실험3"
fit_predict_save('실험결과/','실험3/시나리오0a')
fit_predict_save('실험결과/','실험3/시나리오1a')
fit_predict_save('실험결과/','실험3/시나리오1b')
fit_predict_save('실험결과/','실험3/시나리오2a')
fit_predict_save('실험결과/','실험3/시나리오2b')
fit_predict_save('실험결과/','실험3/시나리오3a')
fit_predict_save('실험결과/','실험3/시나리오3b')
fit_predict_save('실험결과/','실험3/시나리오4a')
fit_predict_save('실험결과/','실험3/시나리오4b')
fit_predict_save('실험결과/','실험3/시나리오5a')
fit_predict_save('실험결과/','실험3/시나리오5b')
fit_predict_save('실험결과/','실험3/시나리오6a')
fit_predict_save('실험결과/','실험3/시나리오6b')
fit_predict_save('실험결과/','실험3/시나리오7a')
fit_predict_save('실험결과/','실험3/시나리오7b')

실험3/시나리오0a -- 완료 (걸린시간 = 4.0562 분)
실험3/시나리오1a -- 완료 (걸린시간 = 4.4870 분)
실험3/시나리오1b -- 완료 (걸린시간 = 4.6460 분)
실험3/시나리오2a -- 완료 (걸린시간 = 4.2561 분)
실험3/시나리오2b -- 완료 (걸린시간 = 3.9312 분)
실험3/시나리오3a -- 완료 (걸린시간 = 4.5184 분)
실험3/시나리오3b -- 완료 (걸린시간 = 4.0638 분)
실험3/시나리오4a -- 완료 (걸린시간 = 4.5813 분)
실험3/시나리오4b -- 완료 (걸린시간 = 4.0858 분)
실험3/시나리오5a -- 완료 (걸린시간 = 4.4831 분)
실험3/시나리오5b -- 완료 (걸린시간 = 4.2321 분)
실험3/시나리오6a -- 완료 (걸린시간 = 4.5741 분)
실험3/시나리오6b -- 완료 (걸린시간 = 4.3771 분)
실험3/시나리오7a -- 완료 (걸린시간 = 4.5339 분)
실험3/시나리오7b -- 완료 (걸린시간 = 4.1424 분)