# **Simulações**


In [1]:
# Bibliotecas
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import joblib

# Verificando quais alunos evadiram
def verifica_evadidos(row):
    """Tell whether a student row counts as a dropout.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'PONTO_VIRADA_2020', 'PONTO_VIRADA_2021' and 'PONTO_VIRADA_2022'.

    Returns:
        bool: True when the 2022 value is missing, or when 2020 and 2022 are
        present but 2021 is missing; False otherwise.
    """
    # No 2022 record: dropout, regardless of the earlier years.
    if pd.isna(row['PONTO_VIRADA_2022']):
        return True
    # 2022 present but 2020 missing: not a dropout (2021 is not inspected).
    if pd.isna(row['PONTO_VIRADA_2020']):
        return False
    # 2020 and 2022 both present: dropout only when the 2021 record is missing.
    return True if pd.isna(row['PONTO_VIRADA_2021']) else False

# Função para identificar o último ano de ponto de virada antes de NaN
def ultimo_ano(row):
    """Return the last year with a recorded turning-point answer for a dropout.

    Args:
        row: Mapping-like record with the keys 'EVADIU', 'PONTO_VIRADA_2020',
            'PONTO_VIRADA_2021' and 'PONTO_VIRADA_2022'.

    Returns:
        2020, 2021 or 2022 for rows flagged as dropouts that match one of the
        patterns below; ``pd.NA`` for non-dropouts and for unmatched rows.
    """
    # Only dropouts get a "last year"; everyone else is left as missing.
    if not row['EVADIU']:
        return pd.NA

    respostas_validas = ('Sim', 'Não')

    # Answered in 2020 and absent in 2021.
    if row['PONTO_VIRADA_2020'] in respostas_validas and pd.isna(row['PONTO_VIRADA_2021']):
        return 2020
    # Answered in 2021 and absent in 2022.
    if row['PONTO_VIRADA_2021'] in respostas_validas and pd.isna(row['PONTO_VIRADA_2022']):
        return 2021
    # Answered in 2022 (reached only when the two patterns above did not match).
    if row['PONTO_VIRADA_2022'] in respostas_validas:
        return 2022
    return pd.NA

def _generate_df_evasao(df):
    """Build the dropout table from the yearly turning-point columns.

    Args:
        df: DataFrame containing at least 'NOME', 'PONTO_VIRADA_2020',
            'PONTO_VIRADA_2021' and 'PONTO_VIRADA_2022'.

    Returns:
        A new DataFrame with those four columns plus:
        - 'EVADIU': result of ``verifica_evadidos`` for each row;
        - 'ULTIMO_ANO': result of ``ultimo_ano`` for each row.
        The input frame is not modified.
    """
    indicacao_evasao = ['NOME', 'PONTO_VIRADA_2020', 'PONTO_VIRADA_2021', 'PONTO_VIRADA_2022']
    # Explicit copy: adding columns to a bare column-subset of `df` triggers
    # SettingWithCopyWarning and leaves it unclear whether `df` is affected.
    df_evasao = df[indicacao_evasao].copy()
    df_evasao['EVADIU'] = df_evasao.apply(verifica_evadidos, axis=1)
    # ULTIMO_ANO reads the EVADIU column, so it must be computed second.
    df_evasao['ULTIMO_ANO'] = df_evasao.apply(ultimo_ano, axis=1)
    return df_evasao

def _df_passos_magicos(path='data/raw/PEDE_PASSOS_DATASET_FIAP.csv'):
    """Load the PEDE dataset and attach the dropout indicators.

    Args:
        path: Location of the ';'-separated CSV file. Defaults to the path
            that was previously hard-coded, so existing calls are unaffected.

    Returns:
        DataFrame with the CSV columns plus 'EVADIU' and 'ULTIMO_ANO', with
        the row whose 'INDE_2020' equals 'D980' removed and the '#NULO!'
        placeholder in 'INDE_2021' replaced by NaN.
    """
    df = pd.read_csv(path, sep=';')
    df_evasao = _generate_df_evasao(df)
    # Attach the dropout flags by student name.
    # NOTE(review): duplicate NOME values would multiply rows in this merge —
    # confirm that NOME is unique in the source file.
    df = df.merge(df_evasao[['NOME', 'EVADIU', 'ULTIMO_ANO']], on='NOME', how='left')
    # Drop the malformed row; copy so the assignment below writes to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df.loc[~(df['INDE_2020'] == 'D980')].copy()
    # Replace the spreadsheet-style null marker with a real missing value.
    df.loc[df['INDE_2021'] == '#NULO!', 'INDE_2021'] = np.nan

    return df

def _get_saresp():
    return pd.read_csv('../data/processed/saresp.csv')

def _rename(df, cols_not_rename=['YEAR'], posfixo='_Y-1'):
    for col in cols_not_rename:
        df[col] = df[col].astype(str)
    for col in list(set(df.columns) - set(cols_not_rename)):
        df = df.rename(columns={col:col+posfixo})
    return df

def _get_xgb_model(path='models/xgb_model.pkl'):
    """Load the persisted model object from disk.

    Args:
        path: Location of the file written with joblib.

    Returns:
        Whatever object was stored in the file.
    """
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files from a trusted source.
    modelo = joblib.load(path)
    return modelo


In [2]:
# Load the persisted model; the relative path is resolved against the kernel's
# working directory.
model = _get_xgb_model(path='../models/xgb_model.pkl')

In [3]:
def _test_df_cols(df):
    cols_in_last_year =  ['IDADE_Y-1', 'IAA_Y-1', 'IAN_Y-1', 'IDA_Y-1', 'IEG_Y-1', 'INDE_Y-1', 'IPP_Y-1', 'IPS_Y-1', 'IPV_Y-1']
    cols_in_current_year =  ['IDADE_Y', 'IAA_Y', 'IAN_Y', 'IDA_Y', 'IEG_Y', 'INDE_Y', 'IPP_Y', 'IPS_Y', 'IPV_Y']
    cols_identity = ['YEAR','NOME']
    cols_diff = set(cols_in_current_year + cols_in_last_year + cols_identity).difference(df.columns)
    return {
          'cols_diff':cols_diff
        , 'status_ok':True if len(cols_diff)<1 else False
        , 'df':df
    }

In [4]:
def _load_new_data(file_path):
    """Load the simulation spreadsheet and enrich it with SARESP aggregates.

    Args:
        file_path: Path to the Excel file with the students to simulate.

    Returns:
        dict with the keys 'cols_diff', 'status_ok' and 'df'.
        - When required columns are missing, this is the result of
          ``_test_df_cols`` (``status_ok`` False, ``df`` is the raw frame).
        - Otherwise ``status_ok`` is True, ``cols_diff`` is an empty list and
          ``df`` is the input merged with the SARESP table twice: once with
          '_Y-1'-suffixed columns and once with '_Y'-suffixed columns.
    """
    df = pd.read_excel(file_path)

    # Validate once and reuse the result for the early return.
    validacao = _test_df_cols(df)
    if not validacao['status_ok']:
        return validacao

    df_saresp = _get_saresp()

    # The same join runs for the previous-year ('_Y-1') and current-year ('_Y')
    # views; only the suffix differs.
    # presumably the SARESP table has an IDADE column that becomes the
    # suffixed join key after _rename — verify against saresp.csv.
    for posfixo in ('_Y-1', '_Y'):
        chaves = ['YEAR', 'IDADE' + posfixo]
        df_saresp_sufixo = _rename(df_saresp, posfixo=posfixo)
        # Both sides need integer keys for the merge to match.
        for col in chaves:
            df_saresp_sufixo[col] = df_saresp_sufixo[col].astype(int)
            df[col] = df[col].astype(int)
        # NOTE(review): default inner merge — rows without a matching
        # (YEAR, age) pair in SARESP are dropped silently. Confirm intended.
        df = df.merge(df_saresp_sufixo, on=chaves)

    return {
          'cols_diff':[]
        , 'status_ok':True
        , 'df':df
    }


In [5]:
# Load and validate the simulation input. Fail fast when required columns are
# missing: every later cell needs `df_new_data`, so continuing after a failed
# validation would only surface as a NameError further down.
response_new_data = _load_new_data('../data/processed/dado_simulacao.xlsx')
if not response_new_data['status_ok']:
    raise ValueError(
        "Simulation input is missing required columns: "
        f"{sorted(response_new_data['cols_diff'])}"
    )
df_new_data = response_new_data['df']

## Explainer dashboard

In [6]:
# Index the rows by student name so the per-student plots can look rows up by
# NOME. The guard keeps the cell re-runnable: once NOME has been moved into the
# index, a second run is a no-op instead of raising KeyError.
if 'NOME' in df_new_data.columns:
    df_new_data = df_new_data.set_index('NOME')

In [21]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top so dependencies are visible in one place.
from explainerdashboard import ClassifierExplainer
# Build the explainer over the simulation features.
# NOTE(review): the third argument is the model's own predictions, not observed
# labels — presumably it is used as the target `y`, in which case metric plots
# (e.g. ROC AUC) compare the model with itself. Confirm this is intended.
explainer = ClassifierExplainer(model, 
                                df_new_data, model.predict(df_new_data))
# Use class 1 as the positive label for the explainer's plots.
explainer.pos_label = 1

splitting pipeline...
Detected sklearn/imblearn Pipeline and succesfully extracted final output dataframe with column names and final model...
Note: model_output=='probability'. For GradientBoostingClassifier shap values normally get calculated against X_background, but paramater X_background=None, so using X instead
Generating self.shap_explainer = shap.TreeExplainer(model, X, model_output='probability', feature_perturbation='interventional')...
Note: Shap interaction values will not be available. If shap values in probability space are not necessary you can pass model_output='logodds' to get shap values in logodds without the need for a background dataset and also working shap interaction values...


In [8]:
# Student inspected by the per-student plots; it is passed to the explainer as
# an index label, so presumably it must match a NOME value — verify.
_aluno = 'ALUNO-1'

In [25]:
# ROC curve computed by the explainer.
# NOTE(review): the explainer was constructed with model.predict(...) as its
# third argument, so this curve presumably scores the model against its own
# predictions rather than observed outcomes — confirm before interpreting.
explainer.plot_roc_auc()

Calculating roc auc curves...
Calculating prediction probabilities...


In [9]:
# Plot the model's prediction for the selected student.
explainer.plot_prediction_result(_aluno)

In [10]:
# Plot the per-feature contributions to the selected student's prediction.
# NOTE(review): the recorded output shows an empty contributions DataFrame
# (columns col/contribution/value, no rows) — check that the student lookup and
# the SHAP values are producing data.
explainer.plot_contributions(_aluno)

Calculating shap values...
Empty DataFrame
Columns: [col, contribution, value]
Index: []


In [11]:
# Inspect the column names of the simulation frame given to the explainer.
df_new_data.columns

Index(['YEAR', 'IDADE_Y-1', 'IAA_Y-1', 'IAN_Y-1', 'IDA_Y-1', 'IEG_Y-1',
       'INDE_Y-1', 'IPP_Y-1', 'IPS_Y-1', 'IPV_Y-1', 'IDADE_Y', 'IAA_Y',
       'IAN_Y', 'IDA_Y', 'IEG_Y', 'INDE_Y', 'IPP_Y', 'IPS_Y', 'IPV_Y',
       'TOTAL_PONTOS_LP_MEAN_Y-1', 'TOTAL_PONTOS_LP_MEDIAN_Y-1',
       'TOTAL_PONTOS_LP_STD_Y-1', 'TOTAL_PONTOS_LP_MIN_Y-1',
       'TOTAL_PONTOS_LP_MAX_Y-1', 'PORC_PONTOS_LP_MEAN_Y-1',
       'PORC_PONTOS_LP_MEDIAN_Y-1', 'PORC_PONTOS_LP_STD_Y-1',
       'PORC_PONTOS_LP_MIN_Y-1', 'PORC_PONTOS_LP_MAX_Y-1',
       'TOTAL_PONTO_MAT_MEAN_Y-1', 'TOTAL_PONTO_MAT_MEDIAN_Y-1',
       'TOTAL_PONTO_MAT_STD_Y-1', 'TOTAL_PONTO_MAT_MIN_Y-1',
       'TOTAL_PONTO_MAT_MAX_Y-1', 'PORC_PONTOS_MAT_MEAN_Y-1',
       'PORC_PONTOS_MAT_MEDIAN_Y-1', 'PORC_PONTOS_MAT_STD_Y-1',
       'PORC_PONTOS_MAT_MIN_Y-1', 'PORC_PONTOS_MAT_MAX_Y-1',
       'TOTAL_PONTOS_LP_MEAN_Y', 'TOTAL_PONTOS_LP_MEDIAN_Y',
       'TOTAL_PONTOS_LP_STD_Y', 'TOTAL_PONTOS_LP_MIN_Y',
       'TOTAL_PONTOS_LP_MAX_Y', 'PORC_PONTOS_L

In [12]:
# Dependence plot for IAA_Y-1, colored by IDADE_Y, highlighting the selected student.
# The 'num__' prefix is presumably added by the pipeline's preprocessing step —
# confirm against the trained pipeline's output feature names.
explainer.plot_dependence(col='num__IAA_Y-1', color_col='num__IDADE_Y', highlight_index=_aluno)

In [15]:
# Partial dependence plot for IAA_Y-1, with the selected student as the reference row.
# NOTE(review): the recorded output repeats the warning "X has feature names,
# but GradientBoostingClassifier was fitted without feature names" many times;
# consider capturing or filtering it so the output stays readable.
explainer.plot_pdp(col='num__IAA_Y-1', index=_aluno)


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingClassifier was fitted without feature names


X has feature names, but GradientBoostingC