In [1]:
import numpy as np
import pandas as pd

import joblib
import yaml
import json

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Path to the project configuration file, relative to the notebook's working directory.
config_path = '../config/params.yaml'

# Read the config inside a context manager so the file handle is closed
# deterministically instead of staying open until garbage collection.
# NOTE(review): FullLoader is kept for compatibility with the existing config;
# if the file only holds plain scalars/lists/dicts, yaml.safe_load would be safer.
with open(config_path) as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [3]:
# Load the raw test dataset from the CSV path stored in the config.
data_test = pd.read_csv(config['preprocessing']['raw_test_path'])
# Display the first rows as a quick sanity check of the loaded columns.
data_test.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,DoctorInCharge
0,5202,69,1,0,3,38.573252,1,5.88607,2.273392,7.289339,...,1.273793,0,0,2.014531,0,0,0,0,1,XXXConfid
1,6831,72,0,0,2,17.598953,0,3.324806,9.079688,5.610519,...,7.747168,0,0,3.170876,0,0,0,0,0,XXXConfid
2,6407,80,0,0,1,34.802944,0,3.78166,5.897372,4.951744,...,0.980757,1,0,5.116213,0,0,0,0,1,XXXConfid
3,5821,71,0,0,1,29.582268,0,3.566242,1.346447,4.096637,...,3.61024,0,0,1.48302,0,1,0,0,0,XXXConfid
4,5581,61,1,1,1,31.445223,1,15.501095,7.863065,3.128334,...,1.31087,1,1,4.240209,1,0,0,0,0,XXXConfid


# Preprocessing

In [4]:
def check_columns_evaluate(data: pd.DataFrame, unique_values_path: str) -> pd.DataFrame:
    """
    Check that the test data has exactly the train features and reorder them as in train.

    The expected feature names are the top-level keys of the JSON document stored at
    ``unique_values_path``; their order in that file defines the output column order.

    :param data: test dataset
    :param unique_values_path: path to the JSON file whose keys are the train feature names
    :return: test dataset with its columns ordered as in the JSON file
    :raises AssertionError: if the set of columns in ``data`` differs from the expected set
    """
    with open(unique_values_path) as json_file:
        unique_values = json.load(json_file)

    # Materialise the keys as a list: a plain list is the safest list-like
    # to use for DataFrame column selection (a dict_keys view is not).
    column_sequence = list(unique_values)

    expected_columns = set(column_sequence)
    actual_columns = set(data.columns)
    if expected_columns != actual_columns:
        # Raised explicitly instead of via `assert` so the validation is not
        # stripped when Python runs with -O; the exception type is unchanged.
        missing = sorted(expected_columns - actual_columns, key=str)
        unexpected = sorted(actual_columns - expected_columns, key=str)
        raise AssertionError(
            f"Разные признаки: missing={missing}, unexpected={unexpected}")

    return data[column_sequence]

In [5]:
def test_preprocess(data: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Preprocessing pipeline for the test data.

    Steps: drop the configured columns, apply the configured value replacements,
    validate and reorder the columns against the train feature list, then apply
    the fitted column transformer loaded from disk.

    :param data: raw dataset; the passed frame itself is not modified
    :param kwargs: configuration mapping; the function reads
        ``kwargs['preprocessing']`` keys ``drop_columns``, ``map_change_columns``
        and ``unique_values_path``, and ``kwargs['train']['col_transform_path']``
    :return: preprocessed dataset whose columns are named by
        ``column_transformer.get_feature_names_out()``
    """
    preprocessing_cfg = kwargs['preprocessing']

    # Drop unneeded features; errors="ignore" tolerates columns that are
    # already absent from the test data.
    data = data.drop(preprocessing_cfg["drop_columns"],
                     axis=1,
                     errors="ignore")

    # Replace values according to the configured mapping (non-inplace, so the
    # result is an explicit new frame rather than a hidden mutation).
    data = data.replace(preprocessing_cfg['map_change_columns'])

    # Check that the dataset has the same features as train
    # and order the features the same way as in train.
    data = check_columns_evaluate(
        data=data,
        unique_values_path=preprocessing_cfg["unique_values_path"])

    # Column transformation with the fitted transformer saved during training.
    # NOTE(review): joblib.load unpickles arbitrary objects — only load
    # artifacts that come from a trusted source.
    column_transformer = joblib.load(kwargs['train']['col_transform_path'])

    data_transformed_raw = column_transformer.transform(data)

    # A transformer may return a sparse matrix; pd.DataFrame cannot combine
    # that with `columns=`, so densify anything exposing `toarray` first.
    if hasattr(data_transformed_raw, "toarray"):
        data_transformed_raw = data_transformed_raw.toarray()

    data_new = pd.DataFrame(data_transformed_raw,
                            columns=column_transformer.get_feature_names_out())

    return data_new

In [6]:
# Run the preprocessing pipeline on the raw test data; the top-level config
# sections are passed through as keyword arguments.
data_test_proc = test_preprocess(data_test, **config)

In [7]:
# Preview the first rows of the preprocessed test data.
data_test_proc.head()

Unnamed: 0,Ethnicity_Asian,Ethnicity_Caucasian,Ethnicity_Other,ADL,Age,CholesterolHDL,CholesterolLDL,FunctionalAssessment,MMSE,SleepQuality,...,CardiovascularDisease,Confusion,Depression,DifficultyCompletingTasks,FamilyHistoryAlzheimers,HeadInjury,Hypertension,MemoryComplaints,PersonalityChanges,Smoking
0,0.0,1.0,0.0,-0.99932,-0.664487,1.197004,-1.118082,-1.308888,-0.095844,1.240398,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,-0.60561,-0.331193,1.138226,-0.70395,0.910494,-1.175104,0.526177,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.056735,0.55759,0.306133,0.445035,-1.409355,0.25358,-1.539144,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,-1.180288,-0.442291,1.469594,0.591253,-0.507842,1.076148,1.554384,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,-0.241525,-1.55327,1.66794,-1.515077,-1.296176,-0.477494,-0.505878,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0


# Evaluate

In [8]:
# Load the trained model from the path stored in the config.
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
# that come from a trusted source.
model = joblib.load(config['train']['model_path'])

# Predict on the feature columns only. Dropping an existing 'predict' column
# first keeps this cell idempotent: on a re-run the previous predictions are
# not fed back into the model as an extra feature.
data_test_proc['predict'] = model.predict(
    data_test_proc.drop(columns=['predict'], errors='ignore'))

In [9]:
# Preview the first rows, now including the 'predict' column.
data_test_proc.head()

Unnamed: 0,Ethnicity_Asian,Ethnicity_Caucasian,Ethnicity_Other,ADL,Age,CholesterolHDL,CholesterolLDL,FunctionalAssessment,MMSE,SleepQuality,...,Confusion,Depression,DifficultyCompletingTasks,FamilyHistoryAlzheimers,HeadInjury,Hypertension,MemoryComplaints,PersonalityChanges,Smoking,predict
0,0.0,1.0,0.0,-0.99932,-0.664487,1.197004,-1.118082,-1.308888,-0.095844,1.240398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,0.0,1.0,0.0,-0.60561,-0.331193,1.138226,-0.70395,0.910494,-1.175104,0.526177,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,1.0,0.0,0.056735,0.55759,0.306133,0.445035,-1.409355,0.25358,-1.539144,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
3,0.0,1.0,0.0,-1.180288,-0.442291,1.469594,0.591253,-0.507842,1.076148,1.554384,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,-0.241525,-1.55327,1.66794,-1.515077,-1.296176,-0.477494,-0.505878,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1
