In [1]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'sexo_numerico': np.dtype('uint8'),
                   'edad': np.dtype('uint16'), 'estado_code': np.dtype('uint8'),
                   'sexo_code': np.dtype('uint8'), 'nombre_code': np.dtype('uint8'),
                   'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

trainingSet = pd.read_csv('Data/trainingSet.csv',dtype=column_types)
trainingSet.drop(['Unnamed: 0','sexo_numerico'],axis=1,inplace=True)

In [3]:
#Vamos a codificar la columna de idpostulantes
lb_make1 = LabelEncoder()
trainingSet["idpostulante_code"] = lb_make1.fit_transform(trainingSet["idpostulante"])
trainingSet.drop('idpostulante',axis=1,inplace=True) #No sirve para el algoritmo de ML

In [29]:
columnas_corr = ['sexo_code','nivel_laboral_code','estado_code','nombre_zona_code','idpostulante_code',
                 'idaviso','nombre_area_code']

In [5]:
columnas_reduced = ['sexo_code','nivel_laboral_code','estado_code','nombre_zona_code','idpostulante_code']
trainingSet_reduced =trainingSet.drop_duplicates(subset=columnas_reduced)

In [17]:
trainingSet_reduced['se_postulo'].value_counts()

1    1003475
0     402386
Name: se_postulo, dtype: int64

# Ups se produce el problema de clases no balanceadas
    · Uso el metodo de resampling

In [19]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = trainingSet_reduced[trainingSet_reduced.se_postulo==1]
df_minority = trainingSet_reduced[trainingSet_reduced.se_postulo==0]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=1003475,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.se_postulo.value_counts()

1    1003475
0    1003475
Name: se_postulo, dtype: int64

# Inicio de Machine Learning

In [4]:
columnas = ['idaviso','idpostulante_code','edad', 'estado_code', 'sexo_code',
       'nombre_code', 'nombre_area_code', 'denominacion_empresa_code',
       'nivel_laboral_code', 'tipo_de_trabajo_code', 'nombre_zona_code']

In [34]:
from sklearn.model_selection import train_test_split

X,y = df_upsampled,df_upsampled.se_postulo
X = X[columnas]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [32]:
MLP_Classifier = MLPClassifier(hidden_layer_sizes=(11,11,11), max_iter=500)
MLP_Classifier.fit(X_train,y_train)
precission = MLP_Classifier.score(X_test,y_test) * 100

In [33]:
print ('Precission: {:.2f} %'. format(abs(precission)))

Precission: 49.99 %


In [23]:
print ('Precission: {:.2f} %'. format(abs(precission)))

Precission: 50.01 %


    ·Aplico Down-sample Majority Class

In [25]:
df_majority = trainingSet_reduced[trainingSet_reduced.se_postulo==1]
df_minority = trainingSet_reduced[trainingSet_reduced.se_postulo==0]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=402386,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.se_postulo.value_counts()

1    402386
0    402386
Name: se_postulo, dtype: int64

In [27]:
from sklearn.model_selection import train_test_split

X,y = df_downsampled,df_downsampled.se_postulo
X = X[columnas]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [28]:
MLP_Classifier = MLPClassifier(hidden_layer_sizes=(11,11,11), max_iter=500)
MLP_Classifier.fit(X_train,y_train)
precission = MLP_Classifier.score(X_test,y_test) * 100
print ('Precission: {:.2f} %'. format(abs(precission)))

Precission: 49.97 %


# Aplico PCA al training reducido

In [7]:
from sklearn.model_selection import train_test_split

X,y = trainingSet_reduced,trainingSet_reduced.se_postulo
X = X[columnas]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(X_train)

# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
from sklearn.decomposition import PCA

# Make an instance of the Model
pca = PCA(.95)

#Nos quedamos con el 95% de la varianza

In [9]:
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [10]:
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

# Uso MLPClassifier sin PCA y con todas las columnas

In [10]:
MLP_Classifier = MLPClassifier(hidden_layer_sizes=(11,11,11), max_iter=500)
MLP_Classifier.fit(X_train,y_train)
precission = MLP_Classifier.score(X_test,y_test) * 100

In [12]:
from sklearn.model_selection import cross_val_score
mse = np.mean(cross_val_score(MLP_Classifier, X_test, y_test,scoring='neg_mean_squared_error', cv=3,  n_jobs=-1))
print("Method name = {}".format("MLPClassifier"))
print ('Mean squared error: {}'.format(abs(mse)))
print ('Precission: {:.2f} %'. format(abs(precission)))

Method name = MLPClassifier
Mean squared error: 0.19009649511969026
Precission: 81.81 %


# TestingSet

In [13]:
dtypes = pd.Series({'idaviso': np.dtype('uint64'), 'idpostulante': np.dtype('object'),
                   'se_postulo': np.dtype('uint8'), 'edad': np.dtype('uint16'), 
                    'estado_code': np.dtype('uint8'),'sexo_code': np.dtype('uint8'),
                    'nombre_code': np.dtype('uint8'),'nombre_area_code': np.dtype('uint8'),
                    'denominacion_empresa_code': np.dtype('uint16'),
                   'nivel_laboral_code': np.dtype('uint8'),
                   'tipo_de_trabajo_code': np.dtype('uint8'),
                   'nombre_zona_code': np.dtype('uint8')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))


testingSet_imp_mean = pd.read_csv('TestingSets/testingSet_imp_mean.csv')
testingSet_imp_mean.drop(columns=['Unnamed: 0'],inplace=True)
testingSet_imp_mean["idpostulante_code"] = lb_make1.fit_transform(testingSet_imp_mean["idpostulante"])
testingSet_imp_mean.drop(columns=['idpostulante'],inplace=True) #No sirve para el algoritmo de ML

In [15]:
# Hago la prediccion
testingSet_imp_mean.loc[:,'sepostulo'] = testingSet_imp_mean[columnas].apply(lambda x: MLP_Classifier.predict([x])[0],axis = 1)
testingSet_imp_mean.drop(['nombre_area_code','denominacion_empresa_code','nivel_laboral_code','tipo_de_trabajo_code','tipo_de_trabajo_code','nombre_zona_code','edad','estado_code','sexo_code','nombre_code','idaviso','idpostulante_code'],axis=1,inplace=True)

In [16]:
testingSet_imp_mean['sepostulo'].value_counts()

0    100000
Name: sepostulo, dtype: int64

# MLP sin PCA