In [1]:
import pandas as pd
import statsmodels.api as sm 
from statstests.process import stepwise

# Read the salaries dataset from the CSV file and preview its first five rows.
df_salarios = pd.read_csv(filepath_or_buffer='data/salarios.csv')
df_salarios.head(n=5)

Unnamed: 0,nome,salario,rh1,econometria1,rh2,econometria2
0,Patricia,2525.34082,3.0,9.0,9.0,4.5
1,Luiz,2498.574951,7.0,8.0,8.0,4.0
2,Dalila,2364.686768,10.0,9.5,9.5,4.75
3,Leonor,1981.349854,10.0,7.0,7.0,3.5
4,Ovidio,1892.807983,7.0,6.0,6.0,3.1


In [2]:
# Model 1 aux1: OLS regression of 'salario' on 'rh1' alone.
modelo1_aux1 = sm.OLS.from_formula(
    formula='salario ~ rh1',
    data=df_salarios,
).fit()

# Print the plain-text regression summary.
print(modelo1_aux1.summary())

                            OLS Regression Results                            
Dep. Variable:                salario   R-squared:                       0.075
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     1.048
Date:                Wed, 28 Aug 2024   Prob (F-statistic):              0.325
Time:                        20:50:35   Log-Likelihood:                -106.35
No. Observations:                  15   AIC:                             216.7
Df Residuals:                      13   BIC:                             218.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2225.5645    205.346     10.838      0.0



**Modelo 1**

In [3]:
# Model 1: OLS regression of 'salario' on both 'rh1' and 'econometria1'.
modelo1 = sm.OLS.from_formula(
    formula='salario ~ rh1 + econometria1',
    data=df_salarios,
).fit()

# Print the plain-text regression summary.
print(modelo1.summary())

                            OLS Regression Results                            
Dep. Variable:                salario   R-squared:                       0.827
Model:                            OLS   Adj. R-squared:                  0.799
Method:                 Least Squares   F-statistic:                     28.77
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           2.64e-05
Time:                        20:50:35   Log-Likelihood:                -93.759
No. Observations:                  15   AIC:                             193.5
Df Residuals:                      12   BIC:                             195.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1246.6172    163.783      7.611   



In [4]:
# Run the stepwise procedure on 'modelo1', using 0.05 as the p-value limit.
modelo1_step = stepwise(
    modelo1,
    pvalue_limit=0.05,
)

Regression type: OLS 

Estimating model...: 
 salario ~ Q('rh1') + Q('econometria1')

 Discarding atribute "Q('rh1')" with p-value equal to 0.7851743044424164 

Estimating model...: 
 salario ~ Q('econometria1')

 No more atributes with p-value higher than 0.05

 Atributes discarded on the process...: 

{'atribute': "Q('rh1')", 'p-value': 0.7851743044424164}

 Model after stepwise process...: 
 salario ~ Q('econometria1') 

                            OLS Regression Results                            
Dep. Variable:                salario   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.813
Method:                 Least Squares   F-statistic:                     61.85
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           2.69e-06
Time:                        20:50:35   Log-Likelihood:                -93.807
No. Observations:                  15   AIC:                             191.6
Df Residuals:      



In [5]:
# Model 1 aux2: OLS regression of 'salario' with 'econometria1' as the only predictor.
modelo1_aux2 = sm.OLS.from_formula(
    formula='salario ~ econometria1',
    data=df_salarios,
).fit()

# Same output as the stepwise procedure above.
print(modelo1_aux2.summary())

                            OLS Regression Results                            
Dep. Variable:                salario   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.813
Method:                 Least Squares   F-statistic:                     61.85
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           2.69e-06
Time:                        20:50:35   Log-Likelihood:                -93.807
No. Observations:                  15   AIC:                             191.6
Df Residuals:                      13   BIC:                             193.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1213.7997    109.752     11.060   



Observe que o R-squared diminui 0.001 (de 0.827 para 0.826); entretanto, para comparar modelos com quantidades diferentes de variáveis preditoras, é para o Adj. R-squared que devemos olhar! E este aumenta (de 0.799 para 0.813) mesmo com uma coluna a menos - o que torna o modelo reduzido mais vantajoso.

In [6]:
# Model 1 aux3: auxiliary regression of 'rh1' on 'econometria1', used to
# diagnose how weakly the two predictors relate to each other.
modelo1_aux3 = sm.OLS.from_formula(
    formula='rh1 ~ econometria1',
    data=df_salarios,
).fit()

# Print the plain-text regression summary.
print(modelo1_aux3.summary())

                            OLS Regression Results                            
Dep. Variable:                    rh1   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9825
Date:                Wed, 28 Aug 2024   Prob (F-statistic):              0.340
Time:                        20:50:35   Log-Likelihood:                -37.523
No. Observations:                  15   AIC:                             79.05
Df Residuals:                      13   BIC:                             80.46
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        9.6001      2.575      3.728   



In [7]:
# Multicollinearity diagnosis.
# Tolerance (ranges from 1 down to 0): one minus the R-squared of the
# auxiliary regression 'modelo1_aux3'.
tolerance1 = 1.0 - modelo1_aux3.rsquared
tolerance1

0.9297346300260828

In [8]:
# VIF (Variance Inflation Factor, ranging from 1 to infinity) is the
# reciprocal of the Tolerance.
#   - VIF close to 1: no multicollinearity.
#   - High VIF: multicollinearity is present; a VIF above 5 already
#     corresponds to an R-squared of 80%.
VIF1 = 1.0 / tolerance1
VIF1

1.0755757263467167

Diagnóstico: Ausência de multicolinearidade no modelo 1! Com VIF ≈ 1.08, as variáveis preditoras praticamente não se relacionam.

**Modelo 2**

In [10]:
# Model 2 aux1: auxiliary regression of 'rh2' on 'econometria2'.
modelo2_aux1 = sm.OLS.from_formula(
    formula='rh2 ~ econometria2',
    data=df_salarios,
).fit()

# Print the plain-text regression summary.
print(modelo2_aux1.summary())

                            OLS Regression Results                            
Dep. Variable:                    rh2   R-squared:                       0.988
Model:                            OLS   Adj. R-squared:                  0.987
Method:                 Least Squares   F-statistic:                     1054.
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           7.89e-14
Time:                        20:50:45   Log-Likelihood:                0.31565
No. Observations:                  15   AIC:                             3.369
Df Residuals:                      13   BIC:                             4.785
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.3405      0.218     -1.563   



In [11]:
# Multicollinearity diagnosis for model 2.

# Tolerance: one minus the R-squared of the auxiliary regression 'modelo2_aux1'.
tolerance2 = 1 - modelo2_aux1.rsquared
# Only the last expression of a cell is rendered, so a bare `tolerance2` in the
# middle of the cell is silently discarded; print it explicitly instead.
print(f"Tolerance: {tolerance2}")

# VIF (Variance Inflation Factor): reciprocal of the Tolerance.
VIF2 = 1/tolerance2
VIF2

82.06145751840752

In [12]:
# Model 2: OLS regression of 'salario' on 'rh2' and 'econometria2'.
modelo2 = sm.OLS.from_formula(
    formula='salario ~ rh2 + econometria2',
    data=df_salarios,
).fit()

# Run the stepwise procedure on 'modelo2' with a 0.05 p-value limit (good news!).
modelo2_step = stepwise(
    modelo2,
    pvalue_limit=0.05,
)

Regression type: OLS 

Estimating model...: 
 salario ~ Q('rh2') + Q('econometria2')

 Discarding atribute "Q('econometria2')" with p-value equal to 0.6691198047642124 

Estimating model...: 
 salario ~ Q('rh2')

 No more atributes with p-value higher than 0.05

 Atributes discarded on the process...: 

{'atribute': "Q('econometria2')", 'p-value': 0.6691198047642124}

 Model after stepwise process...: 
 salario ~ Q('rh2') 

                            OLS Regression Results                            
Dep. Variable:                salario   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.813
Method:                 Least Squares   F-statistic:                     61.85
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           2.69e-06
Time:                        20:51:04   Log-Likelihood:                -93.807
No. Observations:                  15   AIC:                             191.6
Df Residuals:      



In [14]:
# Model 2 aux2: OLS regression of 'salario' with 'econometria2' as the only predictor.
modelo2_aux2 = sm.OLS.from_formula(
    formula='salario ~ econometria2',
    data=df_salarios,
).fit()

# Print the plain-text regression summary.
print(modelo2_aux2.summary())

                            OLS Regression Results                            
Dep. Variable:                salario   R-squared:                       0.806
Model:                            OLS   Adj. R-squared:                  0.791
Method:                 Least Squares   F-statistic:                     53.96
Date:                Wed, 28 Aug 2024   Prob (F-statistic):           5.62e-06
Time:                        20:54:44   Log-Likelihood:                -94.642
No. Observations:                  15   AIC:                             193.3
Df Residuals:                      13   BIC:                             194.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1175.7672    122.281      9.615   

