In [33]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from preprocessing import *
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix
from autogluon.tabular import TabularPredictor as tb

In [34]:
def load_config(path:str):
    with open(path) as f:
        config = json.load(f)
    return config

def load_data(path:str):
    data = pd.read_excel(path, index_col=0)
    x = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    return x, y

def changed_columns_mapping(extra_metrics, prefix):
    res = {
        'score_test': f'{prefix}_loss'
    }
    for metric in extra_metrics:
        res.update({metric : f'{prefix}_{metric}'})
    return res

def get_res(res, useful_columns, prefix):
    extra_metrics = useful_columns[2:]
    return res.loc[:,useful_columns].rename(columns=changed_columns_mapping(extra_metrics, prefix)).set_index(res.iloc[:,0].values).drop(columns=['model'])

def split_dataset(x, y, test_size=0.2, random_state=0, fold_num=3):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    data_test = pd.concat((x_test,y_test),axis=1)
    skf = KFold(n_splits=fold_num, shuffle=True, random_state=random_state)
    dataset = []
    for i, (train_idx, val_idx) in enumerate(skf.split(x_train, y_train)):
        train_data = pd.concat((x_train.iloc[train_idx],y_train.iloc[train_idx]),axis=1)
        val_data = pd.concat((x_train.iloc[val_idx],y_train.iloc[val_idx]),axis=1)
        dataset.append([train_data,val_data])
    return dataset, data_test

def fit_and_test_model(models_config, x, y, hyperparameters, extra_metrics):
    dataset, data_test = split_dataset(x, y)
    r = []
    for data_train, data_val in dataset:
        models = tb(**models_config)
        models.fit(data_train,hyperparameters=hyperparameters)
        train_res = models.leaderboard(data_train,silent=True,extra_metrics=extra_metrics)
        cv_res = models.leaderboard(data_val,silent=True,extra_metrics=extra_metrics)
        test_res = models.leaderboard(data_test,silent=True,extra_metrics=extra_metrics)
        useful_columns = ['model','score_test']
        useful_columns.extend(extra_metrics)
        train_res = get_res(train_res, useful_columns, 'train')
        cv_res = get_res(cv_res, useful_columns, 'cv')
        test_res = get_res(test_res, useful_columns, 'test')
        r.append(pd.concat((train_res,cv_res,test_res), axis=1))
    return (r[0] + r[1] + r[2]) / 3

# def train_re(x, y):
#     hyperparameters={
#         'KNN':[{'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
#         'XGB': {},
#         'NN_TORCH':{},
#         'LR':{}
#     }
#     extra_metrics=['r2']
#     models_config = {
#         'label' : 'Sensory score',
#         'problem_type' : 'regression',
#         'eval_metric' : 'root_mean_squared_error',
#         'path' : r'.\models\reg',
#         'verbosity' : 1
#     }
#     return fit_and_test_model(models_config, x, y, hyperparameters, extra_metrics)

# def train_clf(x,y):
#     hyperparameters={
#         'KNN':[{'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
#         'FASTAI':{},
#         'XGB': {},
#         'LR':{}
#     }
#     extra_metrics=['accuracy']
#     models_config = {
#         'label' : 'class',
#         'problem_type' : 'multiclass',
#         'eval_metric' : 'log_loss',
#         'path' : r'.\models\clf',
#         'verbosity' : 1
#     }
#     return fit_and_test_model(models_config, x, y, hyperparameters, extra_metrics)

# def train_bil_clf(x,y):
#     hyperparameters={
#         'KNN':[{'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
#         'FASTAI':{},
#         'XGB': {},
#         'LR':{}
#     }
#     extra_metrics=['accuracy','precision','recall','f1','roc_auc']
#     models_config = {
#         'label' : 'label',
#         'problem_type' : 'binary',
#         'eval_metric' : 'log_loss',
#         'path' : r'.\models\bil_clf',
#         'verbosity' : 1
#     }
#     return fit_and_test_model(models_config, x, y, hyperparameters, extra_metrics)

def run(x, y, trainer, model_name):
    res = {}
    print('raw')
    res.update({'raw':trainer(x, y)})
    for method in [SG,WT,MMS,SS,SNV,D1,D2]:
        print(method.__name__)
        x_temp = pd.DataFrame(method(x.values))
        res.update({method.__name__:trainer(x_temp, y)})
    for method1 in [SG,WT]:
        for method2 in [MMS,SS,SNV,D1,D2]:
            print(method1.__name__,'-',method2.__name__)
            x_temp = pd.DataFrame(method2(method1(x.values)))
            res.update({f'{method1.__name__}-{method2.__name__}':trainer(x_temp, y)})
    for method1 in [SG,WT]:
        for method2 in [MMS,SS]:
            for method3 in [SNV,D1,D2]:
                print(method1.__name__,'-',method2.__name__,'-',method3.__name__)
                x_temp = pd.DataFrame(method3(method2(method1(x.values))))
                res.update({f'{method1.__name__}-{method2.__name__}-{method3.__name__}':trainer(x_temp, y)})
    excel = None
    for key,she in res.items():
        temp = pd.concat((she.loc[model_name,:],pd.DataFrame([key]*4, index=model_name, columns=['method'])), axis=1)
        excel = temp if excel is None else pd.concat((excel,temp))
    return excel

In [35]:
# data = pd.read_excel(r'.\dataset\花椒麻度判别.xlsx', index_col=0)
# x = data.iloc[:,:-1]
# y = data.iloc[:,-1]
# excel = run(x,y,train_re, ['KNeighborsDist', 'LinearModel', 'NeuralNetTorch', 'XGBoost'])
# excel.to_excel(r'.\result\麻度判别.xlsx')


# excel = run(x,y,train_clf, ['KNeighborsDist','LinearModel','NeuralNetFastAI','XGBoost'])
# excel.to_excel(r'.\result\产地区分.xlsx')

# data = pd.read_excel(r'.\dataset\花椒掺假检验.xlsx', index_col=0)
# ratio = [round(0.2 + i * 0.15, 2) for i in range(5)]
# for r in ratio:
#     data = pd.concat((data.loc[data.loc[:,'ratio'] == 1], data.loc[data.loc[:,'ratio'] == r]),axis=0).drop(columns='ratio')
#     x = data.iloc[:,:-1]
#     y = data.iloc[:,-1]
#     excel = run(x,y,train_bil_clf,['KNeighborsDist','LinearModel','NeuralNetFastAI','XGBoost'])
#     excel.to_excel(rf'.\result\掺假检验-{r*100}%.xlsx')

In [36]:
# data = pd.read_excel(r'.\dataset\花椒麻度判别.xlsx', index_col=0)
# x = data.iloc[:,:-1]
# y = data.iloc[:,-1]
# hyperparameters={
#     'KNN':[{'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
#     'XGB': {},
#     'NN_TORCH':{},
#     'LR':{}
# }
# extra_metrics=['r2']
# models_config = {
#     'label' : 'Sensory score',
#     'problem_type' : 'regression',
#     'eval_metric' : 'mean_squared_error',
#     'path' : r'.\models\reg',
#     'verbosity' : 2
# }
# res = fit_and_test_model(models_config, x, y, hyperparameters, extra_metrics)
# # models = tb(label='Sensory score', problem_type='regression', eval_metric='mean_squared_error', path='./', verbosity=2)
# # models.fit(data)

['KNeighborsDist', 'LinearModel', 'NeuralNetFastAI', 'XGBoost']

In [41]:
for method, config in load_config('config.json').items():
    print(f'run：{method}')
    hyperparameters = dict(config['models_choose'])
    extra_metrics = list(config['extra_metrics'])
    models_config = dict(config['models_config'])
    preserve_models = list(config['preserve_models'])
    data_path = config["data_path"]
    output_path = config['output_path']
    x, y = load_data(data_path)
    trainer = lambda  x, y : fit_and_test_model(models_config, x, y, hyperparameters, extra_metrics)
    excel = run(x, y, trainer, preserve_models)
    excel.to_excel(output_path)

run：pepper_ma




raw




SG




WT




MMS




SS




SNV




D1




D2




SG - MMS




SG - SS




SG - SNV




SG - D1




SG - D2




WT - MMS




WT - SS




WT - SNV




WT - D1




WT - D2




SG - MMS - SNV




SG - MMS - D1


Available Memory: 501 MB
Estimated XGB model size: 16 MB
Available Memory: 485 MB
Estimated XGB model size: 6 MB


SG - MMS - D2


Available Memory: 474 MB
Estimated XGB model size: 8 MB
Available Memory: 502 MB
Estimated XGB model size: 9 MB
Available Memory: 502 MB
Estimated XGB model size: 16 MB


SG - SS - SNV




SG - SS - D1




SG - SS - D2




WT - MMS - SNV




WT - MMS - D1




WT - MMS - D2




WT - SS - SNV




WT - SS - D1




WT - SS - D2




run：pepper_origin




raw




SG




WT




MMS




SS




SNV




D1




D2




SG - MMS




SG - SS




SG - SNV




SG - D1




SG - D2




WT - MMS




WT - SS




WT - SNV




WT - D1




WT - D2




SG - MMS - SNV




SG - MMS - D1




SG - MMS - D2




SG - SS - SNV




SG - SS - D1




SG - SS - D2




WT - MMS - SNV




WT - MMS - D1




WT - MMS - D2




WT - SS - SNV




WT - SS - D1




WT - SS - D2




KeyError: "['XGBoost'] not in index"

In [39]:
extra_metrics

{'r': '2'}