In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
import time
from datetime import datetime
import scipy.spatial
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Training set

In [2]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'sexo_numerico': np.dtype('uint8'),
                   'edad': np.dtype('uint16'), 'estado_code': np.dtype('uint8'),
                   'sexo_code': np.dtype('uint8'), 'nombre_code': np.dtype('uint8'),
                   'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

trainingSet = pd.read_csv('Data/trainingSet.csv',dtype=column_types)
trainingSet.drop(columns=['Unnamed: 0'],inplace=True)
trainingSet.drop(columns=['sexo_numerico'],inplace=True)

In [3]:
trainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9439964 entries, 0 to 9439963
Data columns (total 12 columns):
idaviso                      uint64
idpostulante                 object
se_postulo                   uint8
edad                         uint16
estado_code                  uint8
sexo_code                    uint8
nombre_code                  uint8
nombre_area_code             uint8
denominacion_empresa_code    uint16
nivel_laboral_code           uint8
tipo_de_trabajo_code         uint8
nombre_zona_code             uint8
dtypes: object(1), uint16(2), uint64(1), uint8(8)
memory usage: 252.1+ MB


In [4]:
#Vamos a codificar la columna de idpostulantes
lb_make1 = LabelEncoder()
trainingSet["idpostulante_code"] = lb_make1.fit_transform(trainingSet["idpostulante"])

In [5]:
trainingSet.drop(columns=['idpostulante'],inplace=True) #No sirve para el algoritmo de ML

# Testing sets

In [6]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_mean = pd.read_csv('TestingSets/testingSet_imp_mean.csv')
testingSet_imp_mean.drop(columns=['Unnamed: 0'],inplace=True)

In [7]:
testingSet_imp_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
dtypes: float64(9), int64(2), object(1)
memory usage: 9.2+ MB


In [8]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_median = pd.read_csv('TestingSets/testingSet_imp_median.csv')
testingSet_imp_median.drop(columns=['Unnamed: 0'],inplace=True)

In [9]:
testingSet_imp_median.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
dtypes: float64(9), int64(2), object(1)
memory usage: 9.2+ MB


In [10]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_most_frequent = pd.read_csv('TestingSets/testingSet_imp_most_frequent.csv')
testingSet_imp_most_frequent.drop(columns=['Unnamed: 0'],inplace=True)

In [11]:
testingSet_imp_most_frequent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null int64
idpostulante                 100000 non-null object
dtypes: float64(9), int64(2), object(1)
memory usage: 9.2+ MB


# Inicio de Machine Learning

In [12]:
columnas = ['idaviso','idpostulante_code','edad', 'estado_code', 'sexo_code',
       'nombre_code', 'nombre_area_code', 'denominacion_empresa_code',
       'nivel_laboral_code', 'tipo_de_trabajo_code', 'nombre_zona_code']

set_pruebas = trainingSet

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

# prepare models
models = []
models.append(('LinearRegression', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('RidgeCV', RidgeCV()))
models.append(('Lasso', Lasso()))
models.append(('Elastic Net', ElasticNet()))
models.append(('LassoLars', LassoLars()))
models.append(('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()))
models.append(('BayesianRidge', BayesianRidge()))
models.append(('HuberRegressor  ', HuberRegressor()))

# evaluate each model in turn
for name, model in models:
    model.fit(X,y)
    score = model.score(X,y) * 100
    print("Precision = {:.2f} % , name = {}".format(score,name))

Precision = 0.35 % , name = LinearRegression
Precision = 0.35 % , name = Ridge
Precision = -78121918887.95 % , name = RidgeCV
Precision = 0.02 % , name = Lasso
Precision = 0.02 % , name = Elastic Net
Precision = 0.00 % , name = LassoLars
Precision = 0.19 % , name = OrthogonalMatchingPursuit
Precision = 0.35 % , name = BayesianRidge
Precision = -0.04 % , name = HuberRegressor  


In [13]:
trainingSet.corr().se_postulo.sort_values(ascending = False)

se_postulo                   1.000000
sexo_code                    0.043731
nivel_laboral_code           0.024677
estado_code                  0.016165
nombre_zona_code             0.010822
idpostulante_code            0.003469
idaviso                      0.002977
nombre_area_code             0.002570
tipo_de_trabajo_code        -0.007484
edad                        -0.012981
denominacion_empresa_code   -0.014913
nombre_code                 -0.020866
Name: se_postulo, dtype: float64

In [14]:
columnas_corr = ['sexo_code','nivel_laboral_code','estado_code','nombre_zona_code','idpostulante_code',
                 'idaviso','nombre_area_code']

set_pruebas = trainingSet

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas_corr]

# prepare models
models = []
models.append(('LinearRegression', LinearRegression()))
models.append(('Ridge', Ridge()))
models.append(('RidgeCV', RidgeCV()))
models.append(('Lasso', Lasso()))
models.append(('Elastic Net', ElasticNet()))
models.append(('LassoLars', LassoLars()))
models.append(('OrthogonalMatchingPursuit', OrthogonalMatchingPursuit()))
models.append(('BayesianRidge', BayesianRidge()))
models.append(('HuberRegressor  ', HuberRegressor()))

# evaluate each model in turn
for name, model in models:
    model.fit(X,y)
    score = model.score(X,y) * 100
    print("Precision = {:.2f} % , name = {}".format(score,name))

Precision = 0.28 % , name = LinearRegression
Precision = 0.28 % , name = Ridge
Precision = -210668018185.80 % , name = RidgeCV
Precision = 0.00 % , name = Lasso
Precision = 0.00 % , name = Elastic Net
Precision = 0.00 % , name = LassoLars
Precision = 0.19 % , name = OrthogonalMatchingPursuit
Precision = 0.28 % , name = BayesianRidge
Precision = -0.04 % , name = HuberRegressor  


## Random Forest

In [15]:
columnas = ['idaviso','idpostulante_code','edad', 'estado_code', 'sexo_code',
       'nombre_code', 'nombre_area_code', 'denominacion_empresa_code',
       'nivel_laboral_code', 'tipo_de_trabajo_code', 'nombre_zona_code']

set_pruebas = trainingSet

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

random_forest = RandomForestRegressor()
random_forest.fit(X,y)
precission = random_forest.score(X,y) * 100
mse = np.mean(cross_val_score(random_forest, X, y,scoring='neg_mean_squared_error', cv=15,  n_jobs=2))
print("Method name = {}".format("RandomForestRegressor"))
print ('Mean squared error: {}'.format(abs(mse)))
print ('Precission: {:.2f} %'. format(abs(precission)))

Method name = RandomForestRegressor
Mean squared error: 0.19485293319871383
Precission: 88.25 %


In [16]:
from sklearn.cross_validation import train_test_split

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)



## ExtraTrees

In [17]:
X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


extra_tree = ExtraTreesRegressor(n_estimators=32, criterion = 'mse', max_depth=None, min_samples_split=5,\
                                 min_samples_leaf=1, min_weight_fraction_leaf = 0.0, max_features ="auto",\
                                max_leaf_nodes = None,min_impurity_decrease = 6,bootstrap = False)
extra_tree.fit(X_train,y_train)


precission = extra_tree.score(X_test,y_test) * 100
mse = np.mean(cross_val_score(extra_tree, X_test, y_test,scoring='neg_mean_squared_error', cv=15,  n_jobs=2))
print("Method name = {}".format("ExtraTreesRegressor"))
print ('Mean squared error: {}'.format(abs(mse)))
print ('Precission: {} %'. format(abs(precission)))

Method name = ExtraTreesRegressor
Mean squared error: 0.2334869833774863
Precission: 1.474309678783925e-05 %


## GradientBoostingRegressor

In [18]:
from sklearn.ensemble import GradientBoostingRegressor

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

gradient_boosting = GradientBoostingRegressor(loss='ls', learning_rate=0.75, n_estimators=100, subsample=1.0, criterion='mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto')
gradient_boosting.fit(X,y)

precission = gradient_boosting.score(X,y) * 100
mse = np.mean(cross_val_score(gradient_boosting, X, y,scoring='neg_mean_squared_error', cv=5,  n_jobs=2))
print("Method name = {}".format("GradientBoostingRegressor"))
print ('Mean squared error: {}'.format(abs(mse)))
print ('Precission: {} %'. format(abs(precission)))

Method name = GradientBoostingRegressor
Mean squared error: 0.2556160523924702
Precission: 37.677365878911914 %


## Adaboost

In [12]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

columnas = ['idaviso','idpostulante_code','edad', 'estado_code', 'sexo_code',
       'nombre_code', 'nombre_area_code', 'denominacion_empresa_code',
       'nivel_laboral_code', 'tipo_de_trabajo_code', 'nombre_zona_code']

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
#Tambien probar con columnas corr


for estimator in [DecisionTreeRegressor(),RandomForestRegressor(),ExtraTreesRegressor(),GradientBoostingRegressor()]:
    for val_loss in ['linear','square','exponential']:
        ada_boost= AdaBoostRegressor(loss=val_loss,base_estimator=estimator)
        ada_boost.fit(X,y)

        precission = ada_boost.score(X,y) * 100
        mse = np.mean(cross_val_score(ada_boost, X, y,scoring='neg_mean_squared_error', cv=15,  n_jobs=2))
        print("Method name = {}, Estimator = {}, Loss = {}".format("AdaBoostRegressor",estimator,val_loss))
        print ('Mean squared error: {}'.format(abs(mse)))
        print ('Precission: {} %'. format(abs(precission)))
        print ("")
        print ("")

MemoryError: 

## Random Search

In [20]:
from sklearn.ensemble import ExtraTreesRegressor

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
#Tambien probar con columnas corr


extra = ExtraTreesRegressor()


# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  (results['mean_test_score'][candidate])*100,
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None,5,10,1],
              "max_features": [0.2,0.4,0.6,0.8,1.0],
              "n_estimators" : [10,25,50,80,100],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [2, 3, 10],
              "criterion": ["mse","friedman_mse"],
              "bootstrap": [True, False]}

# run randomized search
random_search = RandomizedSearchCV(estimator = extra, param_distributions=param_dist, n_jobs = 2, cv = 5)

start = time.time()
print ("Fitting... Inicio: {} ".format(time.strftime("%X")))
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % ((time.time() - start), len(random_search.cv_results_['params'])))
report(random_search.cv_results_)

Fitting... Inicio: 14:07:46 
RandomizedSearchCV took 2266.89 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: -59.384 (std: 1.188)
Parameters: {'n_estimators': 25, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 0.4, 'max_depth': 10, 'criterion': 'friedman_mse', 'bootstrap': False}

Model with rank: 2
Mean validation score: -59.776 (std: 1.196)
Parameters: {'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 0.8, 'max_depth': 5, 'criterion': 'mse', 'bootstrap': False}

Model with rank: 3
Mean validation score: -59.789 (std: 1.196)
Parameters: {'n_estimators': 80, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 0.8, 'max_depth': 3, 'criterion': 'mse', 'bootstrap': False}



## Grid search

In [13]:
from sklearn.tree import DecisionTreeRegressor

columnas = ['idaviso','idpostulante_code','edad', 'estado_code', 'sexo_code',
       'nombre_code', 'nombre_area_code', 'denominacion_empresa_code',
       'nivel_laboral_code', 'tipo_de_trabajo_code', 'nombre_zona_code']

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  (results['mean_test_score'][candidate])*100,
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# use a full grid over all parameters
param_grid = {"max_depth": [3, None,5,10,1],
              "max_features": [0.2,0.4,0.6,0.8,1.0],
              "splitter" : ["random","best"],
              "min_samples_split": [2, 3],
              "min_samples_leaf": [1, 3, 10],
              "criterion": ["mse","friedman_mse"]}

X,y = trainingSet,trainingSet.se_postulo
X = X[columnas]

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
#Tambien probar con columnas corr

tree = DecisionTreeRegressor()

# run grid search
grid_search = GridSearchCV(estimator = tree, param_grid=param_grid,n_jobs= 2,cv = 15)
start = time.time()
print ("Fitting... Inicio: {} ".format(time.strftime("%X")))
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time.time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

Fitting... Inicio: 20:51:21 


KeyboardInterrupt: 