In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
import time
from datetime import datetime
import scipy.spatial
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Training set

In [2]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'sexo_numerico': np.dtype('uint8'),
                   'edad': np.dtype('uint16'), 'estado_code': np.dtype('uint8'),
                   'sexo_code': np.dtype('uint8'), 'nombre_code': np.dtype('uint8'),
                   'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

trainingSet = pd.read_csv('Data/trainingSet.csv',dtype=column_types)
trainingSet.drop(columns=['Unnamed: 0'],inplace=True)
trainingSet.drop(columns=['sexo_numerico'],inplace=True)

In [3]:
trainingSet.drop(columns=['idpostulante'],inplace=True) #No sirve para el algoritmo de ML
trainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9439964 entries, 0 to 9439963
Data columns (total 11 columns):
idaviso                      uint64
se_postulo                   uint8
edad                         uint16
estado_code                  uint8
sexo_code                    uint8
nombre_code                  uint8
nombre_area_code             uint8
denominacion_empresa_code    uint16
nivel_laboral_code           uint8
tipo_de_trabajo_code         uint8
nombre_zona_code             uint8
dtypes: uint16(2), uint64(1), uint8(8)
memory usage: 180.1 MB


# Testing sets

In [4]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_mean = pd.read_csv('TestingSets/testingSet_imp_mean.csv')
testingSet_imp_mean.drop(columns=['Unnamed: 0'],inplace=True)

In [5]:
testingSet_imp_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
dtypes: float64(9), int64(2), object(1)
memory usage: 9.2+ MB


In [6]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_median = pd.read_csv('TestingSets/testingSet_imp_median.csv')
testingSet_imp_median.drop(columns=['Unnamed: 0'],inplace=True)

In [7]:
testingSet_imp_median.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
dtypes: float64(9), int64(2), object(1)
memory usage: 9.2+ MB


In [8]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_most_frequent = pd.read_csv('TestingSets/testingSet_imp_most_frequent.csv')
testingSet_imp_most_frequent.drop(columns=['Unnamed: 0'],inplace=True)

In [9]:
testingSet_imp_most_frequent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
dtypes: float64(9), int64(2), object(1)
memory usage: 9.2+ MB


# Inicio de Machine Learning

In [None]:
columnas = ['idaviso','edad', 'estado_code', 'sexo_code',
       'nombre_code', 'nombre_area_code', 'denominacion_empresa_code',
       'nivel_laboral_code', 'tipo_de_trabajo_code', 'nombre_zona_code']

set_pruebas = trainingSet

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

# prepare models
models = []
models.append(('LinearRegression', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('RidgeCV', RidgeCV()))
models.append(('Lasso', Lasso()))
models.append(('Elastic Net', ElasticNet()))
models.append(('LassoLars', LassoLars()))
models.append(('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()))
models.append(('BayesianRidge', BayesianRidge()))
models.append(('HuberRegressor  ', HuberRegressor()))

# evaluate each model in turn
for name, model in models:
    model.fit(X,Y)
    score = model.score(X,y) * 100
    print("Precision = {:.2f} % , name = {}".format(score,name))

Precision = -15795.07 % , name = LinearRegression
Precision = -15795.07 % , name = Ridge
Precision = -7607133338379186.00 % , name = RidgeCV
Precision = -15085.22 % , name = Lasso
Precision = -15169.90 % , name = Elastic Net
Precision = -14525.67 % , name = LassoLars
Precision = -15795.07 % , name = OrthogonalMatchingPursuit
Precision = -15795.07 % , name = BayesianRidge
