In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import time
from datetime import datetime
import scipy.spatial
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer

In [2]:
# Load the test set that the later cells enrich with ad and applicant features.
testingSet = pd.read_csv(
    '/home/lucio/Documentos/Datos/NaventDatosTP/Data/test_final_100k.csv'
)

In [3]:
# Inspect row count, column dtypes and non-null counts of the freshly loaded test set.
testingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
id              100000 non-null int64
idaviso         100000 non-null int64
idpostulante    100000 non-null object
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


# Agrego la información del detalle de avisos al testingSet

In [4]:
# Read the encoded ad details and discard the 'Unnamed: 0' column
# (presumably an index saved with the CSV -- verify).
avisos_detalle = pd.read_csv(
    '/home/lucio/Documentos/Datos/NaventDatosTP/Data/optimized_avisos_detalle.csv'
).drop('Unnamed: 0', axis=1)

In [5]:
# Inspect the ad-detail columns and confirm there are no missing values before joining.
avisos_detalle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25287 entries, 0 to 25286
Data columns (total 6 columns):
idaviso                      25287 non-null int64
nombre_area_code             25287 non-null int64
denominacion_empresa_code    25287 non-null int64
nivel_laboral_code           25287 non-null int64
tipo_de_trabajo_code         25287 non-null int64
nombre_zona_code             25287 non-null int64
dtypes: int64(6)
memory usage: 1.2 MB


In [6]:
# Left join on 'idaviso': every test row is kept and gains the ad-detail columns.
# NOTE(review): this rebinds `testingSet`, so re-running the cell merges twice.
testingSet = testingSet.merge(avisos_detalle, on='idaviso', how='left')

In [7]:
# Check that the join kept the row count and that the ad-detail columns are fully populated.
testingSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 8 columns):
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
nombre_area_code             100000 non-null int64
denominacion_empresa_code    100000 non-null int64
nivel_laboral_code           100000 non-null int64
tipo_de_trabajo_code         100000 non-null int64
nombre_zona_code             100000 non-null int64
dtypes: int64(7), object(1)
memory usage: 6.9+ MB


# Agrego la información de los postulantes

In [8]:
# Declared dtypes for the applicants file. The original literal listed
# 'idpostulante' twice; the duplicate key was silently collapsed by Python, so it
# is declared once here.
dtypes = pd.Series({
    'estado': CategoricalDtype(
        categories=['Abandonado', 'En Curso', 'Graduado'], ordered=False),
    'idpostulante': np.dtype('object'),
    'nombre': CategoricalDtype(
        categories=['Doctorado', 'Master', 'Otro', 'Posgrado', 'Secundario',
                    'Terciario/Técnico', 'Universitario'],
        ordered=False),
    'sexo': CategoricalDtype(categories=['FEM', 'MASC', 'NO_DECLARA'], ordered=False),
})

# read_csv receives only the dtype *names* ('category' / 'object'), so the explicit
# category lists above are not forwarded to the parser.
# NOTE(review): the .info() output of the loaded frame lists idpostulante, edad,
# estado_code, sexo_code and nombre_code, so the 'estado', 'nombre' and 'sexo'
# entries look unused for this file -- confirm and simplify if so.
dtypes_col = dtypes.index
dtypes_type = [declared.name for declared in dtypes.values]
column_types = dict(zip(dtypes_col, dtypes_type))

# Load the applicants and drop the 'Unnamed: 0' column without mutating in place,
# so the cell yields the same frame every time it is run.
postulantes = pd.read_csv(
    '/home/lucio/Documentos/Datos/NaventDatosTP/Data/POSTULANTES_FINAL.csv',
    dtype=column_types,
).drop('Unnamed: 0', axis=1)

In [9]:
# Inspect the applicant columns and their non-null counts before joining.
postulantes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438357 entries, 0 to 438356
Data columns (total 5 columns):
idpostulante    438357 non-null object
edad            438357 non-null int64
estado_code     438357 non-null int64
sexo_code       438357 non-null int64
nombre_code     438357 non-null int64
dtypes: int64(4), object(1)
memory usage: 16.7+ MB


In [10]:
# Left join on 'idpostulante': test rows whose applicant is absent from
# `postulantes` are kept and receive NaN in the applicant columns.
# NOTE(review): this rebinds `testingSet`, so re-running the cell merges twice.
testingSet = testingSet.merge(postulantes, on='idpostulante', how='left')

In [11]:
# Check how many rows are missing applicant data after the left join
# (non-null counts below the row count mark unmatched applicants).
testingSet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 12 columns):
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
nombre_area_code             100000 non-null int64
denominacion_empresa_code    100000 non-null int64
nivel_laboral_code           100000 non-null int64
tipo_de_trabajo_code         100000 non-null int64
nombre_zona_code             100000 non-null int64
edad                         93346 non-null float64
estado_code                  93346 non-null float64
sexo_code                    93346 non-null float64
nombre_code                  93346 non-null float64
dtypes: float64(4), int64(7), object(1)
memory usage: 9.9+ MB


In [35]:
# Keep only the model features: the ad codes followed by the applicant attributes.
# The identifier columns (id, idaviso, idpostulante) are left out.
testingSet_2 = testingSet[[
    'nombre_area_code',
    'denominacion_empresa_code',
    'nivel_laboral_code',
    'tipo_de_trabajo_code',
    'nombre_zona_code',
    'edad',
    'estado_code',
    'sexo_code',
    'nombre_code',
]]


In [None]:
# NOTE(review): `postMasVistas` is not defined anywhere in the visible notebook and
# this cell has no execution count; it looks like a leftover from a different
# (real-estate) analysis. On a fresh kernel it would stop "Run All" with a
# NameError -- confirm and remove.
postMasVistas2 = postMasVistas[['price_aprox_usd','lat','lon','surface_total_in_m2',
       'rooms', 'year', 'month', 'property_type_code','place_name_code', 
        'place_with_parent_names_code', 'state_name_code','cochera', 'pileta', 'vigilancia',
        'gimnasio', 'patio', 'terraza','parrilla', 'lavadero']]

# NOTE(review): `se_postulo` is not among the columns selected above, so the
# correlation matrix would have no such column -- verify the intended target.
postMasVistas2.corr().se_postulo.sort_values(ascending = False)

# Utilizamos imputación para completar los valores faltantes

In [51]:
# Fill the missing applicant features with the mean of each column.
# axis=0 imputes along columns, so every feature is filled with its own mean.
# The previous axis=1 imputed along rows, which would fill e.g. a missing `edad`
# with the mean of that row's unrelated ad-code columns.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its column-wise replacement -- migrate when
# the environment is upgraded.
fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=0)

# Reuse the labels of the frame that was actually imputed instead of retyping the
# column list, so the result cannot drift out of sync with `testingSet_2`.
imputed_DF_mean = pd.DataFrame(
    fill_NaN.fit_transform(testingSet_2),
    columns=testingSet_2.columns,
    index=testingSet_2.index,
)

In [65]:
# Show the last rows of the mean-imputed frame; filled cells stand out as
# non-integer values in the applicant columns.
imputed_DF_mean.tail(27)

Unnamed: 0,nombre_area_code,denominacion_empresa_code,nivel_laboral_code,tipo_de_trabajo_code,nombre_zona_code,edad,estado_code,sexo_code,nombre_code
99973,170.0,2127.0,4.0,1.0,7.0,27.0,2.0,0.0,3.0
99974,170.0,2127.0,4.0,1.0,7.0,41.0,2.0,0.0,3.0
99975,170.0,2127.0,4.0,1.0,7.0,33.0,2.0,0.0,3.0
99976,32.0,410.0,4.0,1.0,7.0,32.0,2.0,0.0,5.0
99977,32.0,410.0,4.0,1.0,7.0,31.0,1.0,0.0,6.0
99978,32.0,410.0,4.0,1.0,7.0,33.05606,1.453089,0.505303,5.030392
99979,156.0,1452.0,4.0,5.0,1.0,37.0,2.0,0.0,6.0
99980,156.0,1452.0,4.0,5.0,1.0,37.0,2.0,0.0,5.0
99981,156.0,1452.0,4.0,5.0,1.0,28.0,0.0,0.0,6.0
99982,4.0,2982.0,4.0,1.0,7.0,40.0,2.0,1.0,6.0


In [53]:
# Fill the missing applicant features with the median of each column.
# axis=0 imputes along columns, so every feature is filled with its own median.
# The previous axis=1 imputed along rows, mixing unrelated features of the same
# row into the fill value.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its column-wise replacement -- migrate when
# the environment is upgraded.
fill_NaN = Imputer(missing_values=np.nan, strategy='median', axis=0)

# Reuse the labels of the frame that was actually imputed instead of retyping the
# column list, so the result cannot drift out of sync with `testingSet_2`.
imputed_DF_median = pd.DataFrame(
    fill_NaN.fit_transform(testingSet_2),
    columns=testingSet_2.columns,
    index=testingSet_2.index,
)

In [64]:
# Preview the first rows of the median-imputed frame.
imputed_DF_median.head(27)

Unnamed: 0,nombre_area_code,denominacion_empresa_code,nivel_laboral_code,tipo_de_trabajo_code,nombre_zona_code,edad,estado_code,sexo_code,nombre_code
0,30.0,465.0,1.0,1.0,1.0,41.0,2.0,0.0,0.0
1,30.0,465.0,1.0,1.0,1.0,30.0,2.0,1.0,5.0
2,30.0,465.0,1.0,1.0,1.0,35.0,2.0,0.0,6.0
3,170.0,3891.0,3.0,1.0,1.0,68.0,2.0,1.0,3.0
4,170.0,3891.0,3.0,1.0,1.0,31.0,1.0,0.0,6.0
5,170.0,3891.0,3.0,1.0,1.0,27.0,0.0,1.0,6.0
6,40.0,3060.0,3.0,1.0,1.0,28.0,2.0,1.0,6.0
7,40.0,3060.0,3.0,1.0,1.0,53.0,1.0,1.0,5.0
8,40.0,3060.0,3.0,1.0,1.0,27.0,2.0,0.0,6.0
9,154.0,2486.0,3.0,4.0,0.0,37.0,2.0,1.0,3.0


In [61]:
# Fill the missing applicant features with the most frequent value of each column.
# axis=0 imputes along columns, so every feature is filled with its own mode.
# The previous axis=1 imputed along rows, taking the mode across unrelated
# features of the same row.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# `sklearn.impute.SimpleImputer` is its column-wise replacement -- migrate when
# the environment is upgraded.
fill_NaN = Imputer(missing_values=np.nan, strategy='most_frequent', axis=0)

# Reuse the labels of the frame that was actually imputed instead of retyping the
# column list, so the result cannot drift out of sync with `testingSet_2`.
imputed_DF_most_frequent = pd.DataFrame(
    fill_NaN.fit_transform(testingSet_2),
    columns=testingSet_2.columns,
    index=testingSet_2.index,
)

In [63]:
# Preview the first rows of the most-frequent-imputed frame.
imputed_DF_most_frequent.head(27)

Unnamed: 0,nombre_area_code,denominacion_empresa_code,nivel_laboral_code,tipo_de_trabajo_code,nombre_zona_code,edad,estado_code,sexo_code,nombre_code
0,30.0,465.0,1.0,1.0,1.0,41.0,2.0,0.0,0.0
1,30.0,465.0,1.0,1.0,1.0,30.0,2.0,1.0,5.0
2,30.0,465.0,1.0,1.0,1.0,35.0,2.0,0.0,6.0
3,170.0,3891.0,3.0,1.0,1.0,68.0,2.0,1.0,3.0
4,170.0,3891.0,3.0,1.0,1.0,31.0,1.0,0.0,6.0
5,170.0,3891.0,3.0,1.0,1.0,27.0,0.0,1.0,6.0
6,40.0,3060.0,3.0,1.0,1.0,28.0,2.0,1.0,6.0
7,40.0,3060.0,3.0,1.0,1.0,53.0,1.0,1.0,5.0
8,40.0,3060.0,3.0,1.0,1.0,27.0,2.0,0.0,6.0
9,154.0,2486.0,3.0,4.0,0.0,37.0,2.0,1.0,3.0


In [27]:
# Build a separate test set whose feature columns carry the mean-imputed values.
# `assign` returns a new DataFrame, so `testingSet` itself is left untouched; the
# original plain assignment only aliased it, and any later column write would
# have modified `testingSet` as well.
# NOTE(review): the original cell ended with an empty subscript
# (`testingSet_imp_mean[]`), which is a SyntaxError. Overwriting the feature
# columns with `imputed_DF_mean` is the presumed intent -- confirm.
testingSet_imp_mean = testingSet.assign(
    **{column: imputed_DF_mean[column] for column in imputed_DF_mean.columns}
)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)