# Categorical Data

In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

In [2]:
# cargamos los datasets
X = pd.read_csv('train.csv', index_col = 'Id')
X_test = pd.read_csv('test.csv', index_col= 'Id')

X.shape, X_test.shape

((1460, 80), (1459, 79))

Tenemos que eliminar las filas que tienen datos faltantes en la columna `SalePrice`, porque no nos van a servir ni para entrenar el modelo ni para validarlo

In [3]:
# eliminamos las filas
X.dropna(axis = 0, subset=['SalePrice'], inplace = True)
X.shape

(1460, 80)

In [4]:
# definimos las etiquetas que vamos utilizar para entrenar y validar el modelo
y = X.SalePrice
type(y)

pandas.core.series.Series

In [5]:
# eliminamos la columna SalePrice de los datos de entrenamiento
X.drop(['SalePrice'], axis = 1, inplace = True)
X.shape

(1460, 79)

El tratamiento de los datos faltantes esta fuera de los alcances de esta notebook (ver notebook 02-Missing values). Por lo tanto vamos a elegir eliminar todas las columnas que tengan datos faltantes.

In [6]:
cols_with_missing_values = [col for col in X.columns if X[col].isnull().any()]
cols_with_missing_values

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [7]:
X.drop(cols_with_missing_values, axis = 1, inplace = True)
X.shape

(1460, 60)

In [8]:
# Eliminamos las mismas columnas del dataset de test
X_test.drop(cols_with_missing_values, axis = 1, inplace = True)
X_test.shape

(1459, 60)

In [9]:
# dividimos el dataset X_train en train y validation
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [10]:
X_train.shape, X_valid.shape

((1168, 60), (292, 60))

In [11]:
y_train.shape, y_valid.shape

((1168,), (292,))

vamos a definir una funcion que nos va a permitir evaluar las acciones que tomemos sobre los datasets

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0) # utilizamos una version standar del algoritmo
                                                                # en otra instancia se puede optimizar los parametros
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### Step 1: Eliminamos columnas con datos categoricos

In [13]:
# seleccionamos las columnas que no tienen datos categoricos
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_train.shape

(1168, 33)

In [14]:
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
drop_X_valid.shape

(292, 33)

In [15]:
print('-'*50, 'RESULTS','-'*50)
print("MAE from Approach 1 (Drop Missing Val Cols and Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
print('-'*50, 'RESULTS','-'*50)

-------------------------------------------------- RESULTS --------------------------------------------------
MAE from Approach 1 (Drop Missing Val Cols and Drop categorical variables):
17837.82570776256
-------------------------------------------------- RESULTS --------------------------------------------------


### Step 2: Ordinal encoding


In [16]:
print("Unique values in 'Condition2' column in training data:", X_train['Condition2'].unique())
print("\nUnique values in 'Condition2' column in validation data:", X_valid['Condition2'].unique())

Unique values in 'Condition2' column in training data: ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']

Unique values in 'Condition2' column in validation data: ['Norm' 'RRAn' 'RRNn' 'Artery' 'Feedr' 'PosN']


Notar que en la columna `Condition2` de los datos de validacion tienen categorias que no estan en la misma columna de los datos de entrenamiento. Esto va a generar que no podemas usar estos datos, asi como estan, para generar un modelo. Por lo tanto vamos a tener que tomar ua decision con respecto a estos datos.<br>
En primer logar lo que vamos a hacer es eliminar todas las columnas que presenten este problema.

In [17]:
# que columnas son categoricas en los datos de entrenamiento?
object_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
object_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [18]:
'''
The set() function creates a set object. The items in a set list are unordered, so it will appear in random order. 
'''
good_label_cols = [col for col in object_cols if 
                   set(X_valid[col]).issubset(set(X_train[col]))]

# Estas son todas las columnas con las que no vamos a tener problemas con la codificacion ordinal, porque las columnas
# de los datos de validacion tienen las mismas o menos categorias que sus columnas correspondientes en los detos
# de entrenamiento
good_label_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [19]:
# Las columnas con las que tendriamos problemas seria estas
bad_label_cols = list(set(object_cols) - set(good_label_cols))
bad_label_cols

['Functional', 'RoofMatl', 'Condition2']

In [20]:
print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Functional', 'RoofMatl', 'Condition2']


In [21]:
# Eliminamos las columnas categoricas que no vamos a usar

label_X_train = X_train.drop(bad_label_cols, axis = 1)
label_X_valid = X_valid.drop(bad_label_cols, axis = 1)

In [22]:
# Aplicamos la codificacion ordinal

from sklearn.preprocessing import OrdinalEncoder

# 1. Definimos el tipo de codificacion
encoder = OrdinalEncoder()

# 2. Entrenamos el codificador
# Para entrenar al codificador usamos los datos de entrenamiento porque ya sabemos que tienen las mismas o mas
# categorias que los datos de validcions
encoder.fit(label_X_train[good_label_cols])

# 3. Usamos el codificador entrenado para hacer la trnsformacion de los datos
# notar que solo lo aplicamos a las features que tienen datos categoricos, no tocamos las features con datos numericos
label_X_train[good_label_cols] = encoder.transform(label_X_train[good_label_cols])
label_X_valid[good_label_cols] = encoder.transform(label_X_valid[good_label_cols])

'''
En este caso elegimos dividir el ENTRENAMIENTO y la TRANSFORMACION en dos pasos.
Tambien podriamos haber hecho las dos cosas en un mismo paso utiilzando el siguiente codigo:

# Apply ordinal encoder
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

'''


'\nEn este caso elegimos dividir el ENTRENAMIENTO y la TRANSFORMACION en dos pasos.\nTambien podriamos haber hecho las dos cosas en un mismo paso utiilzando el siguiente codigo:\n\n# Apply ordinal encoder\nordinal_encoder = OrdinalEncoder()\nlabel_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])\nlabel_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])\n\n'

In [23]:
print('-'*50, 'RESULTS','-'*50)
print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))
print('-'*50, 'RESULTS','-'*50)

-------------------------------------------------- RESULTS --------------------------------------------------
MAE from Approach 2 (Ordinal Encoding):
17098.01649543379
-------------------------------------------------- RESULTS --------------------------------------------------


Los resultados obtenidos hasta el momento muestra que es mejor codificar los datos categoricos que eliminar las columnas.<br>
La codificacion que usamos hasta ahora fue una codificacion cardinal, que no es muy recomendada excepto para casos particualars.<br>
Ahora vamos a probar con la codificaion `OneHotEncoding()` que es mucho mas utilizada para estos casos

### Step 3: OneHotEncding

In [24]:
# Cuantas categorias diferentes tiene cada feature?
for col in X_train[object_cols].columns:
    print(X_train[col].nunique(), '<<<', col)

5 <<< MSZoning
2 <<< Street
4 <<< LotShape
4 <<< LandContour
2 <<< Utilities
5 <<< LotConfig
3 <<< LandSlope
25 <<< Neighborhood
9 <<< Condition1
6 <<< Condition2
5 <<< BldgType
8 <<< HouseStyle
6 <<< RoofStyle
7 <<< RoofMatl
15 <<< Exterior1st
16 <<< Exterior2nd
4 <<< ExterQual
5 <<< ExterCond
6 <<< Foundation
6 <<< Heating
5 <<< HeatingQC
2 <<< CentralAir
4 <<< KitchenQual
6 <<< Functional
3 <<< PavedDrive
9 <<< SaleType
6 <<< SaleCondition


Si bien es posible codificar features que tienen mas de 10 categorias diferentes, por el momento vamos a eliminar esas columnas y trabajar solo con las features qu tienen menos de 10 categorias diferentes.

In [25]:
# que columnas tienen menos de 10 categorias diferentes
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
low_cardinality_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [26]:
# que columnas vamos a eliminar de los datos de entrenamiento y validacion?

high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))
high_cardinality_cols

['Exterior2nd', 'Neighborhood', 'Exterior1st']

In [27]:
# eliminamos las columnas que tienen una cardinalidad superior a 10

reduced_X_train = X_train.drop(high_cardinality_cols, axis = 1)
reduced_X_valid = X_valid.drop(high_cardinality_cols, axis = 1)

print(X_train.shape, reduced_X_train.shape)
print(X_valid.shape, reduced_X_valid.shape)

(1168, 60) (1168, 57)
(292, 60) (292, 57)


In [28]:
# Encoding

from sklearn.preprocessing import OneHotEncoder

# 1. Definimos el encoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# 2. Entrenamos el encoder
OH_encoder.fit(X_train[low_cardinality_cols])

# 3. Aplicamos la transformacion
# Recordar que la codificacin OneHot devuelve un array que tenemos que transformar en DF para poder seguir operando
OH_cols_X_train = pd.DataFrame(OH_encoder.transform(reduced_X_train[low_cardinality_cols]))
OH_cols_X_valid = pd.DataFrame(OH_encoder.transform(reduced_X_valid[low_cardinality_cols]))

# 4. OneHot elimina el indice, lo volvemos a poner
OH_cols_X_train.index = X_train.index
OH_cols_X_valid.index = X_valid.index

# 5. Eliminamos las columnas categoricas de los datos de entrenamiento y validacion originales
num_X_train = X_train.drop(object_cols, axis = 1)
num_X_valid = X_valid.drop(object_cols, axis = 1)

# 6. Unimos la DF codificada con los features numericos de los datos de entrenamiento y validacion
OH_X_train = pd.concat([num_X_train, OH_cols_X_train], axis = 1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_X_valid], axis = 1)


In [29]:
OH_X_train.head()

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,112,113,114,115,116,117,118,119,120,121
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,11694,9,5,2007,2007,48,0,1774,1822,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
871,20,6600,5,5,1962,1962,0,0,894,894,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
93,30,13360,5,7,1921,2006,713,0,163,876,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
818,20,13265,8,5,2002,2002,1218,0,350,1568,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
303,20,13704,7,5,2001,2002,0,0,1541,1541,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
OH_X_valid.head()

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,112,113,114,115,116,117,118,119,120,121
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
530,20,32668,6,3,1957,1975,1219,0,816,2035,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
492,50,9490,6,7,1941,1950,403,165,238,806,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
460,50,7015,5,4,1950,1950,185,0,524,709,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
280,60,10005,7,5,1977,1977,392,0,768,1160,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
656,160,1680,6,5,1971,1971,0,0,525,525,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
print('-'*50, 'RESULTS','-'*50)
print("MAE (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))
print('-'*50, 'RESULTS','-'*50)

-------------------------------------------------- RESULTS --------------------------------------------------
MAE (One-Hot Encoding):




17525.345719178084
-------------------------------------------------- RESULTS --------------------------------------------------




### Step 4: Final Model

In [32]:
# cargamos nuevamente los datasets
X = pd.read_csv('train.csv', index_col = 'Id')
X.shape

(1460, 80)

Para esta etapa del desarrollo del modelo NO vamos a dividir los datos en entrenamiento y validcion, vamos a usar todos los datos para generar el modelo. 

In [33]:
# eliminamos las filas que tienen datos faltantes en la variable SalePrice
X.dropna(axis = 0, subset=['SalePrice'], inplace = True)
X.shape

(1460, 80)

In [34]:
# definimos las etiquetas que vamos utilizar para entrenar y validar el modelo
y = X.SalePrice
type(y)

pandas.core.series.Series

In [35]:
# eliminamos la columna SalePrice de los datos de entrenamiento
X.drop(['SalePrice'], axis = 1, inplace = True)
X.shape

(1460, 79)

In [36]:
cols_with_missing_values = [col for col in X.columns if X[col].isnull().any()]
cols_with_missing_values

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [37]:
X.drop(cols_with_missing_values, axis = 1, inplace = True)
X.shape

(1460, 60)

In [38]:
# Cuantas categorias diferentes tiene cada feature?
for col in X[object_cols].columns:
    print(X[col].nunique(), '<<<', col)

5 <<< MSZoning
2 <<< Street
4 <<< LotShape
4 <<< LandContour
2 <<< Utilities
5 <<< LotConfig
3 <<< LandSlope
25 <<< Neighborhood
9 <<< Condition1
8 <<< Condition2
5 <<< BldgType
8 <<< HouseStyle
6 <<< RoofStyle
8 <<< RoofMatl
15 <<< Exterior1st
16 <<< Exterior2nd
4 <<< ExterQual
5 <<< ExterCond
6 <<< Foundation
6 <<< Heating
5 <<< HeatingQC
2 <<< CentralAir
4 <<< KitchenQual
7 <<< Functional
3 <<< PavedDrive
9 <<< SaleType
6 <<< SaleCondition


In [39]:
# que columnas tienen menos de 10 categorias diferentes
low_cardinality_cols = [col for col in object_cols if X[col].nunique() < 10]
low_cardinality_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [40]:
# que columnas vamos a eliminar de los datos de entrenamiento
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))
high_cardinality_cols

['Exterior2nd', 'Neighborhood', 'Exterior1st']

In [41]:
# eliminamos las columnas que tienen una cardinalidad superior a 10

reduced_X = X.drop(high_cardinality_cols, axis = 1)

X.shape, reduced_X.shape


((1460, 60), (1460, 57))

In [42]:
# Encoding

from sklearn.preprocessing import OneHotEncoder

# 1. Definimos el encoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# 2. Entrenamos el encoder
OH_encoder.fit(X[low_cardinality_cols])

# 3. Aplicamos la transformacion
# Recordar que la codificacin OneHot devuelve un array que tenemos que transformar en DF para poder seguir operando
OH_cols_X = pd.DataFrame(OH_encoder.transform(reduced_X[low_cardinality_cols]))

# 4. OneHot elimina el indice, lo volvemos a poner
OH_cols_X.index = X.index

# 5. Eliminamos las columnas categoricas de los datos de entrenamiento
num_X = X.drop(object_cols, axis = 1)

# 6. Unimos la DF codificada con los features numericos de los datos de entrenamiento y validacion
OH_X = pd.concat([num_X, OH_cols_X], axis = 1)


In [43]:
OH_X.head()

Unnamed: 0_level_0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,116,117,118,119,120,121,122,123,124,125
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,8450,7,5,2003,2003,706,0,150,856,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,20,9600,6,8,1976,1976,978,0,284,1262,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60,11250,7,5,2001,2002,486,0,434,920,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,70,9550,7,5,1915,1970,216,0,540,756,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
5,60,14260,8,5,2000,2000,655,0,490,1145,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [44]:
# 1. Definimos el modelo
final_model = RandomForestRegressor(n_estimators=100, random_state=0) # utilizamos una version standar del algoritmo
                                                                # en otra instancia se puede optimizar los parametros
# 2. Entrenamos el modelo
final_model.fit(OH_X,y)



El analisis de los datos de test estaba fuera de los alcances de esta notebook. Pero si quisieramos seguir adelante con este proyecto, ya tenemos listo el modelo para aplicar a los datos de testo.<br>
<br>
**Que faltaria hacer?<br>**
* Tratamiento de los datos faltantes de X_test. Eliminar las filas no es una opcion, habria que imputarlas
* Codificar las features categoricas
* aplicar el modelo para relizar la prediccion