In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import CategoricalDtype

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4



In [2]:
dtypes = pd.Series({'sepostulo': np.dtype('uint8'),'edad': np.dtype('uint16')})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

trainingSet = pd.read_csv('/home/lucio/Documentos/Datos/NaventDatosTP/Data/TRAINING_SET_SIN_ENCODING.csv',dtype=column_types)
trainingSet.drop(['Unnamed: 0','denominacion_empresa'],axis=1,inplace=True)
trainingSet[["sexo",'nombre','estado','nombre_zona',
             'tipo_de_trabajo','nivel_laboral',
             'nombre_area']] = trainingSet[["sexo",'nombre','estado','nombre_zona',
                                            'tipo_de_trabajo','nivel_laboral',
                                            'nombre_area']].astype('category')

In [5]:
trainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9439964 entries, 0 to 9439963
Data columns (total 9 columns):
sepostulo          uint8
sexo               category
nombre             category
estado             category
edad               uint16
nombre_zona        category
tipo_de_trabajo    category
nivel_laboral      category
nombre_area        category
dtypes: category(7), uint16(1), uint8(1)
memory usage: 99.0 MB


# Me quedo con 1.5M de registros random

In [3]:
trainingSet_samples = trainingSet.sample(n=1500000,random_state=50)

In [4]:
trainingSet_samples['sepostulo'].value_counts()

1    942921
0    557079
Name: sepostulo, dtype: int64

# Hago dummificacion

In [4]:
trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.sexo))

trainingSet_samples.drop('sexo',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre))

trainingSet_samples.drop('nombre',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.estado))

trainingSet_samples.drop('estado',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_zona))

trainingSet_samples.drop('nombre_zona',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.tipo_de_trabajo))

trainingSet_samples.drop('tipo_de_trabajo',axis=1, inplace=True)

trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nivel_laboral,
                                                             prefix="Nivel_laboral"))

trainingSet_samples.drop('nivel_laboral',axis=1, inplace=True)

#trainingSet_samples = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_area))
# LO SAQUE PORQUE MI MAQUINA NO SE BANCA POR AHI LA DE UDS SI AUNQUE AGREGA MUCHAS COLUMAS(188)

trainingSet_samples.drop('nombre_area',axis=1, inplace=True)

In [6]:
from sklearn import ensemble
from sklearn import metrics

from sklearn.model_selection import train_test_split
columnas = trainingSet_samples.columns.drop("sepostulo")
xtrain, xtest, ytrain, ytest = train_test_split(trainingSet_samples[columnas], 
                                        trainingSet_samples['sepostulo'],test_size=0.30, random_state=12)

params = {'n_estimators': 500, 'max_depth': 3, 'subsample': 0.5,
          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(xtrain, ytrain) #trains
y_pred = clf.predict(xtest)  #predicts
print('confusion matrix')
print(metrics.confusion_matrix(ytest, y_pred))
print('classification report')
print(metrics.classification_report(ytest, y_pred))
print("-----------------------------------------------------------------------------------------")
print("Accuracy is :")
print(metrics.accuracy_score(ytest, y_pred))
print('Area under the curve : %f' % (metrics.roc_auc_score(ytest, y_pred)))

confusion matrix
[[  2056 164830]
 [  1370 281744]]
classification report
             precision    recall  f1-score   support

          0       0.60      0.01      0.02    166886
          1       0.63      1.00      0.77    283114

avg / total       0.62      0.63      0.49    450000

-----------------------------------------------------------------------------------------
Accuracy is :
0.6306666666666667
Area under the curve : 0.503740


In [7]:
'''We are taking 10% of non frauds and merge it with frauds data'''

si_sepostulo = trainingSet_samples[trainingSet_samples['sepostulo'] ==1]
no_sepostulo = trainingSet_samples[trainingSet_samples['sepostulo'] == 0]

# random sampling
ignore_me, si_sepostulo = train_test_split(si_sepostulo, test_size = 0.1)

In [8]:
import warnings
warnings.filterwarnings("ignore")

si_sepostulo = pd.concat([si_sepostulo, no_sepostulo])

# Split into train and test units.
xtrain, xtest = train_test_split(si_sepostulo, test_size = 0.3)
ytrain = xtrain['sepostulo']
ytest = xtest['sepostulo']
xtrain.drop('sepostulo', 1, inplace = True)
xtest.drop('sepostulo', 1, inplace = True)

In [9]:
params = {'n_estimators': 500, 'max_depth': 3, 'subsample': 0.5,
          'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3}
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(xtrain, ytrain) #trains
y_pred = clf.predict(xtest)  #predicts
print('confusion matrix')
print(metrics.confusion_matrix(ytest, y_pred))
print('classification report')
print(metrics.classification_report(ytest, y_pred))
print("-----------------------------------------------------------------------------------------")
print("Accuracy is :")
print(metrics.accuracy_score(ytest, y_pred))
print('Area under the curve : %f' % (metrics.roc_auc_score(ytest, y_pred)))

confusion matrix
[[167015      0]
 [ 28394      3]]
classification report
             precision    recall  f1-score   support

          0       0.85      1.00      0.92    167015
          1       1.00      0.00      0.00     28397

avg / total       0.88      0.85      0.79    195412

-----------------------------------------------------------------------------------------
Accuracy is :
0.8546967432910978
Area under the curve : 0.500053


# Hago over sampling

In [13]:
from sklearn.model_selection import train_test_split
columnas = trainingSet_samples.columns.drop("sepostulo")
x_train, x_val, y_train, y_val = train_test_split(trainingSet_samples[columnas], 
                                        trainingSet_samples['sepostulo'],test_size = .1,random_state=12)

In [14]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(x_train, y_train)



In [17]:
from sklearn.metrics import recall_score
gbm0 = GradientBoostingClassifier(random_state=10)

performCV=True
printFeatureImportance=True
cv_folds=5
#Fit the algorithm on the data
gbm0.fit(x_train_res, y_train_res)

print ('Validation Results')
print (gbm0.score(x_val, y_val))
print (recall_score(y_val, gbm0.predict(x_val)))

Validation Results
0.5353


NameError: name 'recall_score' is not defined

In [6]:
#Choose all predictors except target & IDcols
target = "sepostulo"
predictors = [x for x in trainingSet_samples.columns if x not in [target]]
gbm0 = GradientBoostingClassifier(random_state=10)

performCV=True
printFeatureImportance=True
cv_folds=5
#Fit the algorithm on the data
gbm0.fit(trainingSet_samples[predictors], trainingSet_samples['sepostulo'])

#Predict training set:
dtrain_predictions = gbm0.predict(trainingSet_samples[predictors])
dtrain_predprob = gbm0.predict_proba(trainingSet_samples[predictors])[:,1]

#Perform cross-validation:
if performCV:
    cv_score = cross_validation.cross_val_score(gbm0, trainingSet_samples[predictors],
                                                trainingSet_samples['sepostulo'],
                                                cv=cv_folds, scoring='roc_auc')

#Print model report:
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(trainingSet_samples['sepostulo'].values,
                                                  dtrain_predictions))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(trainingSet_samples['sepostulo'], dtrain_predprob))

if performCV:
    print ("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % 
           (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))

#Print Feature Importance:
if printFeatureImportance:
    feat_imp = pd.Series(gbm0.feature_importances_, predictors).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

MemoryError: 

# TestingSet

In [10]:
dtypes = pd.Series({'estado': CategoricalDtype(categories=['Abandonado', 'En Curso', 'Graduado'], 
                                               ordered=False), 'idpostulante': np.dtype('object'),
                    'nombre': CategoricalDtype(categories=['Doctorado', 'Master', 'Otro', 'Posgrado',
                            'Secundario','Terciario/Técnico', 'Universitario'],ordered=False),
                    'sexo': CategoricalDtype(categories=['FEM', 'MASC', 'NO_DECLARA'], ordered=False)})

dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]

column_types = dict(zip(dtypes_col, dtypes_type))

testingSet_CON_NAN = pd.read_csv('/home/lucio/Documentos/Datos/NaventDatosTP/TestingSets/testingSet_CON_NAN.csv',dtype=column_types)
testingSet_CON_NAN.drop('Unnamed: 0',inplace=True,axis=1)
testingSet_CON_NAN.drop(columns=['idpostulante','denominacion_empresa','nombre_area'],inplace=True) #No sirve para el algoritmo de ML

# Aplico dummificacion

In [11]:
testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.sexo))

testingSet_CON_NAN.drop('sexo',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.nombre))

testingSet_CON_NAN.drop('nombre',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.estado))

testingSet_CON_NAN.drop('estado',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.nombre_zona))

testingSet_CON_NAN.drop('nombre_zona',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.tipo_de_trabajo))

testingSet_CON_NAN.drop('tipo_de_trabajo',axis=1, inplace=True)

testingSet_CON_NAN = testingSet_CON_NAN.join(pd.get_dummies(testingSet_CON_NAN.nivel_laboral,
                                                             prefix="Nivel_laboral"))

testingSet_CON_NAN.drop('nivel_laboral',axis=1, inplace=True)

#testingSet_CON_NAN = trainingSet_samples.join(pd.get_dummies(trainingSet_samples.nombre_area))

In [12]:
# Get missing columns in the training test
missing_cols = set( trainingSet_samples.columns ) - set( testingSet_CON_NAN.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    testingSet_CON_NAN[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
testingSet_CON_NAN = testingSet_CON_NAN[trainingSet_samples.columns]

In [15]:
testingSet_CON_NAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
sepostulo                                             100000 non-null int64
edad                                                  93375 non-null float64
FEM                                                   100000 non-null uint8
MASC                                                  100000 non-null uint8
NO_DECLARA                                            100000 non-null uint8
Doctorado                                             100000 non-null uint8
Master                                                100000 non-null uint8
Otro                                                  100000 non-null uint8
Posgrado                                              100000 non-null uint8
Secundario                                            100000 non-null uint8
Terciario/Técnico                                     100000 non-null uint8
Universitario                                        

# Aplico imputing a la edad

In [13]:
testingSet_CON_NAN['edad'].fillna(testingSet_CON_NAN['edad'].mode()[0],inplace=True)
testingSet_CON_NAN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 42 columns):
sepostulo                                             100000 non-null int64
edad                                                  100000 non-null float64
FEM                                                   100000 non-null uint8
MASC                                                  100000 non-null uint8
NO_DECLARA                                            100000 non-null uint8
Doctorado                                             100000 non-null uint8
Master                                                100000 non-null uint8
Otro                                                  100000 non-null uint8
Posgrado                                              100000 non-null uint8
Secundario                                            100000 non-null uint8
Terciario/Técnico                                     100000 non-null uint8
Universitario                                       

In [14]:
#Predict testing set:
dtest_predictions = clf.predict(testingSet_CON_NAN[columnas])
dtest_predprob = clf.predict_proba(testingSet_CON_NAN[columnas])[:,1]

In [15]:
testingSet_CON_NAN['sepostulo'] = dtest_predictions

In [30]:
testingSet_CON_NAN['sepostulo'].value_counts()

1    98809
0     1191
Name: sepostulo, dtype: int64

In [26]:
testingSet_CON_NAN['sepostulo'].value_counts()

0    51432
1    48568
Name: sepostulo, dtype: int64

In [16]:
testingSet_CON_NAN['sepostulo'].value_counts()

0    100000
Name: sepostulo, dtype: int64

In [27]:
submit = testingSet_CON_NAN[['sepostulo']]
submit.head()

Unnamed: 0,sepostulo
0,0
1,0
2,0
3,0
4,0


In [29]:
#Primero volver a leer el testingSet del csv
submit.insert(0, 'id', testingSet_CON_NAN['id'])

In [30]:
submit.to_csv('Submit/prediccion29_LIGHTGBM_1.5M_dummies_over_sampling.csv',index=False)