In [2]:
import pandas as pd
import psycopg2
import math
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats

import statsmodels.api as sm

import holoviews as hv
# Render HoloViews elements through the Bokeh backend.
hv.extension('bokeh')

# Shared option values, so the notebook-wide defaults below stay in sync:
# every sized element uses the same width, and both reference-line elements
# (Slope, HLine) share one semi-transparent dashed black style.
_ancho_px = 500
_estilo_guia = dict(color='k', alpha=0.5, line_dash='dashed')

# Register the defaults once; they apply to every plot created afterwards.
hv.opts.defaults(
    hv.opts.Curve(width=_ancho_px),
    hv.opts.Scatter(width=_ancho_px, size=4),
    hv.opts.Histogram(width=_ancho_px),
    hv.opts.Slope(**_estilo_guia),
    hv.opts.HLine(**_estilo_guia),
)
from bokeh.plotting import show

# Cargando datos

In [3]:
# Read the dataset from a CSV file; the path is relative, so the file must sit
# in the notebook's working directory.
datos = pd.read_csv(filepath_or_buffer="DatosBAIN081-2.csv")

# Bare last expression: lets the notebook render the frame as a rich table.
datos

Unnamed: 0,BAIN081-14 promedio,BAIN081-14 online,PSP 1 semestre anterior,PSP 2 semestres anteriores,PGA,¿Congelo?,BAIN065-14 intentos,BAIN065-14 promedio,BAIN065-14 online,BAIN067-14 intentos,BAIN067-14 promedio,BAIN067-14 online,BAIN073-14 intentos,BAIN073-14 promedio,BAIN073-14 online,BAIN075-14 intentos,BAIN075-14 promedio,BAIN075-14 online,Carrera,Año de ingreso
0,4.2,0,3.91,4.28,4.54,0,1,4.0,0,1,4.9,0,2,4.0,0,2,4.1,0,1779,2015
1,4.7,0,5.65,5.99,5.81,0,1,5.9,0,1,6.3,0,1,5.4,0,1,5.4,0,1779,2016
2,4.1,0,4.49,5.38,4.90,0,1,5.0,0,1,5.6,0,1,4.1,0,1,4.3,0,1736,2016
3,4.2,0,2.88,4.38,3.63,0,1,4.1,0,1,4.1,0,1,2.1,0,1,4.1,0,1736,2016
4,4.0,0,4.23,5.39,4.78,0,1,4.9,0,1,5.8,0,1,4.9,0,1,4.3,0,1779,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,3.0,0,4.43,3.07,4.20,0,1,4.1,1,1,4.1,1,3,4.2,0,3,4.0,0,1704,2021
862,3.0,0,4.23,4.89,4.54,0,1,5.0,0,1,5.0,0,1,4.3,0,1,4.1,0,1812,2022
863,1.6,0,3.47,3.59,4.04,0,1,4.1,1,1,4.5,1,3,4.7,0,3,4.8,0,1704,2021
864,6.1,0,6.35,6.64,6.49,0,1,6.5,0,1,6.6,0,1,6.3,0,1,6.4,0,1708,2022


# Estandarización

In [4]:
# Start from a copy so the raw frame `datos` is left untouched and this cell
# can be re-run safely.
datos_st = datos.copy()

# Names of the columns that hold categorical variables.
columnas_categoricas = ['Carrera', 'Año de ingreso']

# One-hot encode the categorical variables as integer 0/1 indicators.
# drop_first=True drops the first level of each variable, which then acts as
# the baseline category.
datos_st = pd.get_dummies(
    datos_st,
    columns=columnas_categoricas,
    drop_first=True,
    dtype=int,
)

# Columns that get standardized (z-scored) below.
columnas_continuas = [
    'PSP 1 semestre anterior',
    'PSP 2 semestres anteriores',
    'PGA',
    'BAIN065-14 intentos',
    'BAIN065-14 promedio',
    'BAIN067-14 intentos',
    'BAIN067-14 promedio',
    'BAIN073-14 intentos',
    'BAIN073-14 promedio',
    'BAIN075-14 intentos',
    'BAIN075-14 promedio',
]

# z-score: subtract each column's mean and divide by its standard deviation
# (pandas' default `.std()`, i.e. the sample standard deviation).
media = datos_st[columnas_continuas].mean()
desviacion = datos_st[columnas_continuas].std()
datos_st[columnas_continuas] = datos_st[columnas_continuas].sub(media).div(desviacion)

# Snapshot of the standardized frame, kept under a second name.
datos_st2 = datos_st.copy()

# Summary statistics: the standardized columns should show mean ~0 and std 1.
datos_st.describe()

Unnamed: 0,BAIN081-14 promedio,BAIN081-14 online,PSP 1 semestre anterior,PSP 2 semestres anteriores,PGA,¿Congelo?,BAIN065-14 intentos,BAIN065-14 promedio,BAIN065-14 online,BAIN067-14 intentos,...,Carrera_1779,Carrera_1807,Carrera_1812,Año de ingreso_2016,Año de ingreso_2017,Año de ingreso_2018,Año de ingreso_2019,Año de ingreso_2020,Año de ingreso_2021,Año de ingreso_2022
count,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,...,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0
mean,3.769746,0.498845,4.635758e-16,-8.204881e-18,-1.173298e-15,0.033487,5.1280510000000005e-17,9.68176e-16,0.362587,9.025370000000001e-17,...,0.183603,0.016166,0.003464,0.051963,0.092379,0.202079,0.227483,0.192841,0.159353,0.054273
std,1.039524,0.500288,1.0,1.0,1.0,0.180009,1.0,1.0,0.481025,1.0,...,0.387384,0.126188,0.058789,0.222081,0.289727,0.401783,0.419449,0.394757,0.366217,0.226686
min,1.1,0.0,-4.634252,-4.19215,-3.064411,0.0,-0.4739727,-2.13982,0.0,-0.478793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.2,0.0,-0.4439172,-0.6210161,-0.6556159,0.0,-0.4739727,-0.7630858,0.0,-0.478793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.1263651,0.1121675,-0.1110189,0.0,-0.4739727,-0.3041745,0.0,-0.478793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.4,1.0,0.6222628,0.7003871,0.6220926,0.0,-0.4739727,0.6136481,1.0,-0.478793,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.8,1.0,2.23393,2.141665,3.303186,1.0,5.768771,3.520086,1.0,5.732963,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Regresión Lineal Múltiple

In [5]:
# Full model: regress the first column of the standardized frame (the response,
# 'BAIN081-14 promedio' in the recorded output) on every remaining column.
Y = datos_st.iloc[:, 0]

# statsmodels' OLS does not add an intercept on its own, so a constant column
# is prepended to the design matrix.
X = sm.add_constant(datos_st.iloc[:, 1:])

model = sm.OLS(endog=Y, exog=X).fit()

# Bare last expression: renders the regression summary tables.
model.summary()

0,1,2,3
Dep. Variable:,BAIN081-14 promedio,R-squared:,0.323
Model:,OLS,Adj. R-squared:,0.297
Method:,Least Squares,F-statistic:,12.4
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,3.23e-51
Time:,22:13:31,Log-Likelihood:,-1093.2
No. Observations:,866,AIC:,2252.0
Df Residuals:,833,BIC:,2410.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.9023,0.243,16.044,0.000,3.425,4.380
BAIN081-14 online,0.2513,0.100,2.506,0.012,0.054,0.448
PSP 1 semestre anterior,0.1073,0.061,1.763,0.078,-0.012,0.227
PSP 2 semestres anteriores,0.0233,0.057,0.411,0.681,-0.088,0.134
PGA,0.2328,0.093,2.515,0.012,0.051,0.415
¿Congelo?,-0.2970,0.172,-1.727,0.085,-0.635,0.041
BAIN065-14 intentos,-0.0683,0.045,-1.511,0.131,-0.157,0.020
BAIN065-14 promedio,0.0124,0.048,0.258,0.796,-0.082,0.107
BAIN065-14 online,-0.1075,0.254,-0.423,0.672,-0.606,0.391

0,1,2,3
Omnibus:,50.363,Durbin-Watson:,1.8
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62.245
Skew:,-0.546,Prob(JB):,3.05e-14
Kurtosis:,3.73,Cond. No.,54.3


In [6]:
# Reduced model: same response as the full model, but only a hand-picked
# subset of predictors.
# NOTE(review): Carrera_1812 is not in this list, while the nested-model ANOVA
# further down fits its reduced model with every career dummy — confirm which
# of the two specifications is the intended one.
predictores_reducidos = [
    'PGA',
    'BAIN067-14 online',
    'BAIN075-14 promedio',
    'BAIN075-14 intentos',
    'PSP 1 semestre anterior',
    'Carrera_1708',
    'Carrera_1730',
    'Carrera_1736',
    'Carrera_1737',
    'Carrera_1740',
    'Carrera_1779',
    'Carrera_1807',
]

Y = datos_st.iloc[:, 0]

# Prepend the intercept column; OLS does not add one by itself.
X = sm.add_constant(datos_st[predictores_reducidos])

model = sm.OLS(endog=Y, exog=X).fit()

# Bare last expression: renders the regression summary tables.
model.summary()

0,1,2,3
Dep. Variable:,BAIN081-14 promedio,R-squared:,0.291
Model:,OLS,Adj. R-squared:,0.281
Method:,Least Squares,F-statistic:,29.23
Date:,"Mon, 27 Nov 2023",Prob (F-statistic):,4.150000000000001e-56
Time:,22:14:25,Log-Likelihood:,-1112.7
No. Observations:,866,AIC:,2251.0
Df Residuals:,853,BIC:,2313.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.9853,0.070,56.800,0.000,3.848,4.123
PGA,0.3027,0.056,5.371,0.000,0.192,0.413
BAIN067-14 online,-0.3051,0.066,-4.607,0.000,-0.435,-0.175
BAIN075-14 promedio,0.1237,0.036,3.446,0.001,0.053,0.194
BAIN075-14 intentos,-0.1023,0.033,-3.135,0.002,-0.166,-0.038
PSP 1 semestre anterior,0.1525,0.052,2.940,0.003,0.051,0.254
Carrera_1708,-0.2063,0.095,-2.174,0.030,-0.393,-0.020
Carrera_1730,-0.1380,0.131,-1.056,0.291,-0.394,0.118
Carrera_1736,0.0903,0.109,0.832,0.405,-0.123,0.303

0,1,2,3
Omnibus:,59.526,Durbin-Watson:,1.721
Prob(Omnibus):,0.0,Jarque-Bera (JB):,72.198
Skew:,-0.634,Prob(JB):,2.1e-16
Kurtosis:,3.627,Cond. No.,13.0


In [12]:
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

# Nested-model F test: does the full model (all predictors) fit significantly
# better than the reduced model?
df_new = datos_st2.copy()

# The real column names contain spaces, hyphens and punctuation, so they are
# not usable as bare identifiers in a formula. Every column therefore gets a
# positional alias x0, x1, ... The alias table is kept so that the formulas
# below are derived from the real names instead of hand-typed indices, which
# would silently point at different variables if the column order or the set
# of dummy columns ever changed.
nombres_originales = list(df_new.columns)
df_new.columns = [f"x{i}" for i in range(len(nombres_originales))]
alias = dict(zip(nombres_originales, df_new.columns))

respuesta = 'BAIN081-14 promedio'

# Full model: every column except the response, in frame order.
predictores_completos = [n for n in nombres_originales if n != respuesta]

# Reduced model: five academic predictors plus all career dummies.
# NOTE(review): this keeps every 'Carrera_*' dummy (including Carrera_1812),
# whereas the reduced OLS cell earlier in the notebook lists only seven of
# them and omits Carrera_1812 — confirm which specification is intended.
columnas_carrera = [n for n in nombres_originales if n.startswith('Carrera_')]
predictores_reducidos = [
    'PSP 1 semestre anterior',
    'PGA',
    'BAIN067-14 online',
    'BAIN075-14 intentos',
    'BAIN075-14 promedio',
] + columnas_carrera


def _formula_ols(predictores):
    """Build 'x0 ~ xi + xj + ...' for the given real column names.

    Raises KeyError if a requested name is not a column of the frame.
    """
    return f"{alias[respuesta]} ~ " + " + ".join(alias[p] for p in predictores)


m1 = smf.ols(_formula_ols(predictores_completos), data=df_new).fit()
m2 = smf.ols(_formula_ols(predictores_reducidos), data=df_new).fit()

# anova_lm expects the models ordered from smallest to largest.
anovaResults = anova_lm(m2, m1)
display(anovaResults)

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,852.0,662.338629,0.0,,,
1,833.0,633.170723,19.0,29.167907,2.019649,0.006118
