In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import CategoricalDtype

In [2]:
dtypes = pd.Series({'sepostulo': np.dtype('uint8'),'edad': np.dtype('uint16')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

trainingSet = pd.read_csv('/home/fabrizio/env/NaventDatosTP/Data/TRAINING_SET_SIN_ENCODING.csv',dtype=column_types)
trainingSet.drop(['Unnamed: 0','denominacion_empresa'],axis=1,inplace=True)

for col in ['sexo', 'nombre', 'estado', 'nombre_zona',
            'tipo_de_trabajo','nivel_laboral','nombre_area']:
    trainingSet[col] = trainingSet[col].astype('category')

In [3]:
trainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9439964 entries, 0 to 9439963
Data columns (total 9 columns):
sepostulo          uint8
sexo               category
nombre             category
estado             category
edad               uint16
nombre_zona        category
tipo_de_trabajo    category
nivel_laboral      category
nombre_area        category
dtypes: category(7), uint16(1), uint8(1)
memory usage: 99.0 MB


# Me quedo con 500K de registros random

In [6]:
trainingSet_samples = trainingSet.sample(n=2000000,random_state=50)

In [7]:
trainingSet_samples['sepostulo'].value_counts()

1    1256729
0     743271
Name: sepostulo, dtype: int64

# Hago dummificacion

In [8]:
trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.sexo))

trainingSet_samples.drop('sexo',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre))

trainingSet_samples.drop('nombre',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.estado))

trainingSet_samples.drop('estado',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_zona))

trainingSet_samples.drop('nombre_zona',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.tipo_de_trabajo))

trainingSet_samples.drop('tipo_de_trabajo',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nivel_laboral,
                                                             prefix="Nivel_laboral"))

trainingSet_samples.drop('nivel_laboral',axis=1, inplace=True)

#trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_area))
# LO SAQUE PORQUE MI MAQUINA NO SE BANCA POR AHI LA DE UDS SI AUNQUE AGREGA MUCHAS COLUMAS(188)

trainingSet_samples.drop('nombre_area',axis=1, inplace=True)

In [4]:
trainingSet = trainingSet.join(pd.get_dummies(trainingSet.sexo))

trainingSet.drop('sexo',axis=1, inplace=True)

trainingSet = trainingSet.join(pd.get_dummies(trainingSet.nombre))

trainingSet.drop('nombre',axis=1, inplace=True)

trainingSet = trainingSet.join(pd.get_dummies(trainingSet.estado))

trainingSet.drop('estado',axis=1, inplace=True)

trainingSet = trainingSet.join(pd.get_dummies(trainingSet.nombre_zona))

trainingSet.drop('nombre_zona',axis=1, inplace=True)

trainingSet = trainingSet.join(pd.get_dummies(trainingSet.tipo_de_trabajo))

trainingSet.drop('tipo_de_trabajo',axis=1, inplace=True)

trainingSet = trainingSet.join(pd.get_dummies(trainingSet.nivel_laboral,
                                                             prefix="Nivel_laboral"))

trainingSet.drop('nivel_laboral',axis=1, inplace=True)

#trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_area))
# LO SAQUE PORQUE MI MAQUINA NO SE BANCA POR AHI LA DE UDS SI AUNQUE AGREGA MUCHAS COLUMAS(188)

trainingSet.drop('nombre_area',axis=1, inplace=True)

In [9]:
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search



In [None]:
#Choose all predictors except target
target = 'sepostulo'
predictors = [x for x in trainingSet.columns if x not in [target]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=500,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=1,
 scale_pos_weight=1,
 seed=27)
#model.fit(xgb1, trainingSet_samples, predictors)

    
cv_folds=5
early_stopping_rounds=50
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(trainingSet[predictors].values, label=trainingSet[target].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=cv_folds, metrics={'auc'}, early_stopping_rounds=early_stopping_rounds)
xgb1.set_params(n_estimators=cvresult.shape[0])

#Fit the algorithm on the data
xgb1.fit(trainingSet[predictors], trainingSet['sepostulo'],eval_metric='auc')

#Predict training set:
dtrain_predictions = xgb1.predict(trainingSet[predictors])
dtrain_predprob = xgb1.predict_proba(trainingSet[predictors])[:,1]

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(trainingSet['sepostulo'].values, dtrain_predictions))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(trainingSet['sepostulo'], dtrain_predprob))

In [10]:
#Choose all predictors except target
target = 'sepostulo'
predictors = [x for x in trainingSet_samples.columns if x not in [target]]
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=500,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=1,
 scale_pos_weight=1,
 seed=27)
#model.fit(xgb1, trainingSet_samples, predictors)

    
cv_folds=5
early_stopping_rounds=50
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(trainingSet_samples[predictors].values, label=trainingSet_samples[target].values)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=cv_folds, metrics={'auc'}, early_stopping_rounds=early_stopping_rounds)
xgb1.set_params(n_estimators=cvresult.shape[0])

#Fit the algorithm on the data
xgb1.fit(trainingSet_samples[predictors], trainingSet_samples['sepostulo'],eval_metric='auc')

#Predict training set:
dtrain_predictions = xgb1.predict(trainingSet_samples[predictors])
dtrain_predprob = xgb1.predict_proba(trainingSet_samples[predictors])[:,1]

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(trainingSet_samples['sepostulo'].values, dtrain_predictions))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(trainingSet_samples['sepostulo'], dtrain_predprob))

  if diff:



Model Report
Accuracy : 0.6327
AUC Score (Train): 0.563374


# TestingSet

In [25]:
dtypes = pd.Series({'estado': CategoricalDtype(categories=['Abandonado', 'En Curso', 'Graduado'], 
                                               ordered=False), 'idpostulante': np.dtype('object'),
                    'nombre': CategoricalDtype(categories=['Doctorado', 'Master', 'Otro', 'Posgrado',
                            'Secundario','Terciario/Técnico', 'Universitario'],ordered=False),
                    'sexo': CategoricalDtype(categories=['FEM', 'MASC', 'NO_DECLARA'], ordered=False)})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

testingSet_CON_NAN = pd.read_csv('/home/fabrizio/env/NaventDatosTP/TestingSets/testingSet_CON_NAN.csv',dtype=column_types)
testingSet_CON_NAN.drop('Unnamed: 0',inplace=True,axis=1)
testingSet_CON_NAN.drop(columns=['idpostulante','denominacion_empresa','nombre_area'],inplace=True) #No sirve para el algoritmo de ML

In [13]:
testingSet_CON_NAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
id                 100000 non-null int64
idaviso            100000 non-null int64
nombre_zona        100000 non-null object
tipo_de_trabajo    100000 non-null object
nivel_laboral      100000 non-null object
sexo               93375 non-null category
nombre             93375 non-null category
estado             93375 non-null category
edad               93375 non-null float64
dtypes: category(3), float64(1), int64(2), object(3)
memory usage: 4.9+ MB


# Aplico dummies

In [14]:
testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.sexo))

testingSet_CON_NAN.drop('sexo',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.nombre))

testingSet_CON_NAN.drop('nombre',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.estado))

testingSet_CON_NAN.drop('estado',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.nombre_zona))

testingSet_CON_NAN.drop('nombre_zona',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.tipo_de_trabajo))

testingSet_CON_NAN.drop('tipo_de_trabajo',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.nivel_laboral,
                                                             prefix="Nivel_laboral"))

testingSet_CON_NAN.drop('nivel_laboral',axis=1, inplace=True)

#testingSet_CON_NAN = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_area))

In [15]:
# Get missing columns in the training test
missing_cols = set( trainingSet_samples.columns ) - set( testingSet_CON_NAN.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    testingSet_CON_NAN[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
testingSet_CON_NAN = testingSet_CON_NAN[trainingSet_samples.columns]

In [16]:
#Predict testing set:
dtest_predictions = xgb1.predict(testingSet_CON_NAN[predictors])
dtest_predprob = xgb1.predict_proba(testingSet_CON_NAN[predictors])[:,1]

  if diff:


In [17]:
testingSet_CON_NAN['sepostulo'] = dtest_predictions

In [21]:
testingSet_CON_NAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
sepostulo                                             100000 non-null uint8
edad                                                  93375 non-null float64
FEM                                                   100000 non-null uint8
MASC                                                  100000 non-null uint8
NO_DECLARA                                            100000 non-null uint8
Doctorado                                             100000 non-null uint8
Master                                                100000 non-null uint8
Otro                                                  100000 non-null uint8
Posgrado                                              100000 non-null uint8
Secundario                                            100000 non-null uint8
Terciario/Técnico                                     100000 non-null uint8
Universitario                                        

In [19]:
submit = testingSet_CON_NAN[['sepostulo']]
submit.head()

Unnamed: 0,sepostulo
0,1
1,1
2,1
3,1
4,1


In [26]:
submit.insert(0, 'id', testingSet_CON_NAN['id'])

In [27]:
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
id           100000 non-null int64
sepostulo    100000 non-null uint8
dtypes: int64(1), uint8(1)
memory usage: 879.0 KB


In [28]:
submit.to_csv('Submit/prediccion29_XGBOOST_500K_dummies.csv',index=False)

# Tuning

In [None]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=1,iid=False, cv=5)
gsearch1.fit(trainingSet_samples[predictors],trainingSet_samples[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_