# Regresión 

## Situación problemática

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv('StudentsPerformance.csv') 
df.head(3) 

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


## Determinación de la naturaleza de las características 

In [3]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
gender                         1000 non-null object
race/ethnicity                 1000 non-null object
parental level of education    1000 non-null object
lunch                          1000 non-null object
test preparation course        1000 non-null object
math score                     1000 non-null int64
reading score                  1000 non-null int64
writing score                  1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [4]:
df.columns = ['genero', 'raza',
             'educacion_padres','comida',
             'curso_preparatorio','score_math',
             'score_lectura','score_escritura']
df.head() 

Unnamed: 0,genero,raza,educacion_padres,comida,curso_preparatorio,score_math,score_lectura,score_escritura
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Creamos las variables binarias
genero_cod = pd.get_dummies(df['genero'])
# Añadimos las variables binarias al DataFrame
df = pd.concat([df, genero_cod], axis = 1)

# Eliminamos la vairable original genero
df = df.drop(columns=['genero']) 

In [6]:
df.head() 

Unnamed: 0,raza,educacion_padres,comida,curso_preparatorio,score_math,score_lectura,score_escritura,female,male
0,group B,bachelor's degree,standard,none,72,72,74,1,0
1,group C,some college,standard,completed,69,90,88,1,0
2,group B,master's degree,standard,none,90,95,93,1,0
3,group A,associate's degree,free/reduced,none,47,57,44,0,1
4,group C,some college,standard,none,76,78,75,0,1


In [7]:
# Creamos las variables binarias
raza_cod = pd.get_dummies(df['raza'])
# Añadimos las variables binarias al DataFrame
df = pd.concat([df, raza_cod['group A'], raza_cod['group B'], 
                raza_cod['group C'], raza_cod['group D']], axis = 1)

# Eliminamos la vairable original raza
df = df.drop(columns=['raza'])  

# Codicicación de variable categórica ordinal `educacion_padres`

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Creamos el codificador indicandole el orden de la variables

encoder = OrdinalEncoder(categories = [['some high school',"high school", 
                                      "some college","bachelor's degree",  
                                      "master's degree","associate's degree"]])

# Ajustamos el codificador con la variable educacion padres y la transformamos

encoder.fit(df[["educacion_padres"]])
df["educacion_padres-encoded"] = encoder.transform(df[["educacion_padres"]]) 
# Eliminamos la vairable original race
df = df.drop(columns=['educacion_padres'])  

In [9]:
df.head() 

Unnamed: 0,comida,curso_preparatorio,score_math,score_lectura,score_escritura,female,male,group A,group B,group C,group D,educacion_padres-encoded
0,standard,none,72,72,74,1,0,0,1,0,0,3.0
1,standard,completed,69,90,88,1,0,0,0,1,0,2.0
2,standard,none,90,95,93,1,0,0,1,0,0,4.0
3,free/reduced,none,47,57,44,0,1,1,0,0,0,5.0
4,standard,none,76,78,75,0,1,0,0,1,0,2.0


# Codicicación de variable categórica nominal `curso_preparatorio`

In [10]:
# Creamos las variables binarias
preparatorio_cod = pd.get_dummies(df['curso_preparatorio'])
# Añadimos las variables binarias al DataFrame
df = pd.concat([df, preparatorio_cod], axis = 1)

# Eliminamos la vairable original curso_preparatorio
df = df.drop(columns=['curso_preparatorio'])  

In [11]:
df.head(3) 

Unnamed: 0,comida,score_math,score_lectura,score_escritura,female,male,group A,group B,group C,group D,educacion_padres-encoded,completed,none
0,standard,72,72,74,1,0,0,1,0,0,3.0,0,1
1,standard,69,90,88,1,0,0,0,1,0,2.0,1,0
2,standard,90,95,93,1,0,0,1,0,0,4.0,0,1


In [12]:
# Creamos las variables binarias
comida_cod = pd.get_dummies(df['comida'])
# Añadimos las variables binarias al DataFrame
df = pd.concat([df, comida_cod], axis = 1)

# Eliminamos la vairable original curso_preparatorio
df = df.drop(columns=['comida'])  

In [13]:
df.head() 

Unnamed: 0,score_math,score_lectura,score_escritura,female,male,group A,group B,group C,group D,educacion_padres-encoded,completed,none,free/reduced,standard
0,72,72,74,1,0,0,1,0,0,3.0,0,1,0,1
1,69,90,88,1,0,0,0,1,0,2.0,1,0,0,1
2,90,95,93,1,0,0,1,0,0,4.0,0,1,0,1
3,47,57,44,0,1,1,0,0,0,5.0,0,1,1,0
4,76,78,75,0,1,0,0,1,0,2.0,0,1,0,1


In [14]:
df.keys() 

Index(['score_math', 'score_lectura', 'score_escritura', 'female', 'male',
       'group A', 'group B', 'group C', 'group D', 'educacion_padres-encoded',
       'completed', 'none', 'free/reduced', 'standard'],
      dtype='object')

In [15]:
df.columns = ["score_math","score_lectura","score_escritura","female","male","raza_A",
             "raza_B","raza_C","raza_D","educacion_padres","curso_preparatario_completado",
             "no_hizo_curso_preparatorio","comida_reducida","comida_estandar"]

In [16]:
df.head() 

Unnamed: 0,score_math,score_lectura,score_escritura,female,male,raza_A,raza_B,raza_C,raza_D,educacion_padres,curso_preparatario_completado,no_hizo_curso_preparatorio,comida_reducida,comida_estandar
0,72,72,74,1,0,0,1,0,0,3.0,0,1,0,1
1,69,90,88,1,0,0,0,1,0,2.0,1,0,0,1
2,90,95,93,1,0,0,1,0,0,4.0,0,1,0,1
3,47,57,44,0,1,1,0,0,0,5.0,0,1,1,0
4,76,78,75,0,1,0,0,1,0,2.0,0,1,0,1


# Un modelo de regresión lineal para predecir el desempeño en matemáticas