In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Training set

In [2]:
# Column name -> dtype name requested when parsing the training CSV.
# Unsigned integer types are used for every numeric column.
column_types = {
    'idaviso': 'uint64',
    'idpostulante': 'object',
    'se_postulo': 'uint8',
    'sexo_numerico': 'uint8',
    'edad': 'uint16',
    'estado_code': 'uint8',
    'sexo_code': 'uint8',
    'nombre_code': 'uint8',
    'nombre_area_code': 'uint8',
    'denominacion_empresa_code': 'uint16',
    'nivel_laboral_code': 'uint8',
    'tipo_de_trabajo_code': 'uint8',
    'nombre_zona_code': 'uint8',
}

# Read the training data, then discard the 'Unnamed: 0' column and the
# 'sexo_numerico' column.
trainingSet = pd.read_csv('Data/trainingSet.csv', dtype=column_types)
trainingSet = trainingSet.drop(columns=['Unnamed: 0', 'sexo_numerico'])

In [3]:
# Show the column dtypes and memory usage of the loaded training set.
trainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9439964 entries, 0 to 9439963
Data columns (total 12 columns):
idaviso                      uint64
idpostulante                 object
se_postulo                   uint8
edad                         uint16
estado_code                  uint8
sexo_code                    uint8
nombre_code                  uint8
nombre_area_code             uint8
denominacion_empresa_code    uint16
nivel_laboral_code           uint8
tipo_de_trabajo_code         uint8
nombre_zona_code             uint8
dtypes: object(1), uint16(2), uint64(1), uint8(8)
memory usage: 252.1+ MB


In [4]:
from sklearn.preprocessing import LabelEncoder
# Encode the idpostulante column as integer codes. The fitted encoder is kept
# in lb_make1 because later cells use it again.
lb_make1 = LabelEncoder()
idpostulante_codes = lb_make1.fit_transform(trainingSet["idpostulante"])
trainingSet["idpostulante_code"] = idpostulante_codes

In [5]:
# The raw idpostulante column is not useful for the ML algorithm.
trainingSet = trainingSet.drop(columns=['idpostulante'])

# Inicio de Machine Learning

In [6]:
# Feature columns used by the model, in the order they are passed to it.
columnas = [
    'idaviso',
    'idpostulante_code',
    'edad',
    'estado_code',
    'sexo_code',
    'nombre_code',
    'nombre_area_code',
    'denominacion_empresa_code',
    'nivel_laboral_code',
    'tipo_de_trabajo_code',
    'nombre_zona_code',
]

## LightGBM

In [7]:
from sklearn.model_selection import train_test_split
set_pruebas = trainingSet

# Features are the selected columns; the target is the se_postulo column.
X = trainingSet[columnas]
y = trainingSet['se_postulo']

# Hold out 30% of the rows for evaluation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [8]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [9]:
import sys
# Adds the site-packages directory of one specific local conda environment to
# the import path before importing lightgbm.
# NOTE(review): this absolute path is machine-specific; on any other machine
# lightgbm must already be importable from the active kernel environment.
sys.path.append('/home/fabrizio/anaconda3/envs/envir/lib/python3.6/site-packages/')
import lightgbm as lgb

In [12]:
# Wrap the scaled training features and their labels in a LightGBM Dataset.
d_train = lgb.Dataset(X_train, label=y_train)

# Training parameters for a binary classifier evaluated with log loss.
params = {
    'learning_rate': 0.001,
    'boosting_type': 'dart',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'bagging_freq': 1,
    'bagging_fraction': 1.0,
    'num_leaves': 1000,
    'max_bin': 128,
    'save_binary': True,
    # 'sub_feature' is documented as an alias of 'feature_fraction'. The
    # previous version set both (0.5 and 0.8), which is contradictory; only
    # the primary name is kept. LightGBM is expected to prefer the primary
    # name when both are given, so 0.8 should already have been the
    # effective value.
    'feature_fraction': 0.8,
}

In [None]:
# Train the booster on d_train for 10000 boosting rounds.
clf = lgb.train(params, d_train, num_boost_round=10000)

In [None]:
#Prediction
# Score the held-out, scaled test features with the trained booster.
y_pred=clf.predict(X_test)

In [None]:
# Round each predicted score to the nearest integer to get hard labels.
y_pred = np.round(y_pred)

In [None]:
# Confusion matrix and accuracy of the rounded predictions on the test split
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Display the accuracy score of the rounded test-split predictions.
accuracy

In [None]:
from sklearn import metrics
# Text report of the rounded predictions on the test split.
print('confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('classification report')
print(metrics.classification_report(y_test, y_pred))
print("-----------------------------------------------------------------------------------------")
print("Accuracy is :")
# Print the fraction of correct predictions. With normalize=False this line
# printed the raw number of correctly classified rows instead.
print(metrics.accuracy_score(y_test, y_pred))
# ROC AUC measures ranking quality, so it needs the model's continuous scores.
# y_pred holds rounded labels at this point, so X_test is scored again here.
print('Area under the curve : %f' % (metrics.roc_auc_score(y_test, clf.predict(X_test))))

# Testing set

In [34]:
# Load the testing set and discard the 'Unnamed: 0' column.
# No dtype mapping is passed to read_csv. The unsigned-integer mapping that
# used to be built in this cell was never used, and it overwrote the
# column_types defined for the training set. The feature columns of this file
# load as float64 (see the info() output further down), so they are left as
# parsed.
testingSet_imp_mean = pd.read_csv('TestingSets/testingSet_imp_mean.csv')
testingSet_imp_mean = testingSet_imp_mean.drop(columns=['Unnamed: 0'])

In [35]:
# Reuse the encoder fitted on the training set so that every idpostulante gets
# the same integer code the model saw during training. Calling fit_transform
# here would re-fit the encoder on the testing ids and assign unrelated codes.
postulante_to_code = pd.Series(np.arange(len(lb_make1.classes_)), index=lb_make1.classes_)
# Ids that are absent from the training set have no learned code; they are
# marked with -1.
testingSet_imp_mean["idpostulante_code"] = (
    testingSet_imp_mean["idpostulante"].map(postulante_to_code).fillna(-1).astype('int64')
)

In [36]:
# The raw idpostulante column is not useful for the ML algorithm.
testingSet_imp_mean = testingSet_imp_mean.drop(columns=['idpostulante'])

In [37]:
# Downcast the two id columns to the smallest unsigned integer type that fits.
for id_column in ['idpostulante_code', 'idaviso']:
    testingSet_imp_mean[id_column] = pd.to_numeric(testingSet_imp_mean[id_column], downcast='unsigned')

In [38]:
# Show column dtypes, non-null counts and memory usage of the prepared testing set.
testingSet_imp_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
nombre_area_code             100000 non-null float64
denominacion_empresa_code    100000 non-null float64
nivel_laboral_code           100000 non-null float64
tipo_de_trabajo_code         100000 non-null float64
nombre_zona_code             100000 non-null float64
edad                         100000 non-null float64
estado_code                  100000 non-null float64
sexo_code                    100000 non-null float64
nombre_code                  100000 non-null float64
id                           100000 non-null int64
idaviso                      100000 non-null uint32
idpostulante_code            100000 non-null uint16
dtypes: float64(9), int64(1), uint16(1), uint32(1)
memory usage: 8.2 MB


# Prediction

In [39]:
# Make the prediction.
# The model was trained on features standardized by `sc`, so the same fitted
# scaler has to be applied to the testing features before scoring them.
# The whole feature matrix is scored in a single predict call instead of one
# call per row.
X_submit = sc.transform(testingSet_imp_mean[columnas])
testingSet_imp_mean['sepostulo'] = clf.predict(X_submit)

In [40]:
# Remove every feature column now that the prediction has been stored.
columns_to_drop = [
    'nombre_area_code',
    'denominacion_empresa_code',
    'nivel_laboral_code',
    'tipo_de_trabajo_code',
    'nombre_zona_code',
    'edad',
    'estado_code',
    'sexo_code',
    'nombre_code',
    'idaviso',
    'idpostulante_code',
]
testingSet_imp_mean = testingSet_imp_mean.drop(columns=columns_to_drop)

In [41]:
# Store the prediction column as float64.
testingSet_imp_mean = testingSet_imp_mean.astype({'sepostulo': 'float64'})

In [44]:
# Check the columns and dtypes of the frame that will be written out.
testingSet_imp_mean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
id           100000 non-null int64
sepostulo    100000 non-null float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [45]:
# Write the frame to the submission CSV without the index column.
testingSet_imp_mean.to_csv('Submit/prediccion31.csv',index=False)