In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
import lightgbm as lgb

In [2]:
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the mean of the column.
    """

    def __init__(self):
        """Create an unfitted imputer; the `fill` attribute is set by `fit`."""

    def fit(self, X, y=None):
        """Compute one fill value per column of `X` and store them in `self.fill`.

        Parameters:
            X: DataFrame whose columns are inspected.
            y: ignored; accepted so the signature matches the transformer API.

        Returns:
            self, so calls can be chained.
        """
        fill_values = []
        for col in X:
            series = X[col]
            if series.dtype == np.dtype('O'):
                counts = series.value_counts()
                # value_counts() drops NaN by default, so an object column that
                # is entirely missing yields an empty result. Fall back to NaN
                # (the column stays unfilled) instead of raising IndexError.
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(series.mean())

        self.fill = pd.Series(fill_values, index=X.columns)

        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with missing values replaced by the fitted fill values."""
        return X.fillna(self.fill)

In [3]:
# IPython automagic: move the working directory up one level, presumably so the
# relative data paths used in later cells resolve.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
cd ..

/home/olix/Documentos/datos/tp2master/OrgaDatos/TP2


In [4]:
# Load the training table from a CSV path relative to the current working directory.
total_train = pd.read_csv('DataAnalysis/postulaciones_nopostulaciones3Mv2.csv')

In [5]:
# Fill missing values: most frequent value for object columns, mean for the rest.
total_train = DataFrameImputer().fit(total_train).transform(total_train)

In [6]:
def cambiar_cero(x):
    """Return 'NO_DECLARA' when `x` equals the string '0.0'; otherwise return `x` unchanged."""
    return 'NO_DECLARA' if x == '0.0' else x

In [7]:
# Replace the '0.0' placeholder in `sexo` with an explicit 'NO_DECLARA' label.
total_train['sexo'] = total_train['sexo'].map(cambiar_cero)

In [8]:
# Display how many rows fall under each `sexo` label after the replacement.
total_train['sexo'].value_counts()

FEM           3010717
MASC          2765521
NO_DECLARA     184892
Name: sexo, dtype: int64

In [9]:
# Convert the listed columns to the pandas `category` dtype in one pass.
total_train = total_train.astype(dict.fromkeys(
    ['nombre_sort', 'estado', 'sexo', 'nombre_zona',
     'nivel_laboral', 'tipo_de_trabajo'],
    'category',
))




In [10]:
# Remove the two identifier columns from the training frame.
total_train = total_train.drop(['idaviso', 'idpostulante'], axis=1)

In [12]:
# Preview the first rows of the training frame.
total_train.head()

Unnamed: 0,cantPostulaciones,cantVistas,descripcion,estado,fechanacimiento,nivel_laboral,nombre_sort,nombre_zona,postulantes,se_postulo,sexo,tipo_de_trabajo
0,36.0,1.0,0,En Curso,1996-01-12,Senior / Semi-Senior,3.0,Gran Buenos Aires,1080.0,1,FEM,Full-time
1,9.0,2.0,0,Graduado,1992-12-18,Senior / Semi-Senior,3.0,Gran Buenos Aires,1080.0,1,MASC,Full-time
2,5.0,4.0,0,Graduado,1991-10-12,Senior / Semi-Senior,3.0,Gran Buenos Aires,1080.0,1,FEM,Full-time
3,1067.0,2.0,0,Abandonado,1973-05-17,Senior / Semi-Senior,1.0,Gran Buenos Aires,1080.0,1,MASC,Full-time
4,16.0,1.0,0,Graduado,1986-05-09,Senior / Semi-Senior,2.0,Gran Buenos Aires,1080.0,1,MASC,Full-time


In [11]:
def datetime(date,format_string):
    """Parse `date` with `pd.to_datetime` using the given strftime-style format.

    Values that cannot be parsed become NaT because errors='coerce' is used.

    NOTE(review): this name would shadow the standard-library `datetime`
    module if that module were imported in the same namespace.
    """
    parsed = pd.to_datetime(date, errors='coerce', format=format_string)
    return parsed

In [12]:
# Parse the birth date into a datetime column; it is stored under the name
# `edadPostulante` and converted to a year difference in the next cell.
total_train['edadPostulante'] = datetime(total_train['fechanacimiento'],'%Y-%m-%d')

In [13]:
# Approximate age as 2018 minus the birth year (2018 is hard-coded).
# The vectorized `.dt.year` accessor replaces a per-row Python lambda;
# dates that failed to parse (NaT) still produce NaN.
total_train['edadPostulante'] = 2018 - total_train['edadPostulante'].dt.year

In [14]:
# The raw birth date is no longer needed once the age column exists.
total_train = total_train.drop(columns=['fechanacimiento'])

In [122]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; features are four numeric columns and the target
# is `se_postulo`. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    total_train[['edadPostulante', 'cantVistas', 'cantPostulaciones', 'postulantes']],
    total_train['se_postulo'],
    test_size=0.20,
    random_state=75,
)

In [143]:
# Binary LightGBM classifier; hyperparameters are set explicitly, one per line.
clf = lgb.LGBMClassifier(
    learning_rate=0.01,
    objective='binary',
    num_leaves=3000,
    max_depth=19,
    n_estimators=100,
    n_jobs=-1,
)

In [19]:
# Show five random training rows.
# NOTE(review): the recorded output lists more columns than the four selected
# in the split cell, so it appears to come from an earlier run - re-run to refresh.
X_train.sample(5)

Unnamed: 0,edadPostulante,cantVistas,nivel_laboral,tipo_de_trabajo,nombre_zona,cantPostulaciones,postulantes,sexo,estado
2514671,23.0,1.0,Senior / Semi-Senior,Full-time,Gran Buenos Aires,70.0,731.0,MASC,Graduado
443372,28.0,4.0,Senior / Semi-Senior,Full-time,Gran Buenos Aires,160.0,3647.0,FEM,Abandonado
1074501,30.0,1.0,Otro,Part-time,Gran Buenos Aires,44.0,617.0,FEM,En Curso
963103,31.0,1.0,Senior / Semi-Senior,Full-time,Gran Buenos Aires,16.0,206.0,FEM,Graduado
979921,23.0,1.0,Senior / Semi-Senior,Full-time,Gran Buenos Aires,200.0,563.0,MASC,Graduado


In [144]:
# Train the classifier on the training split.
clf.fit(X_train,y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.01, max_depth=19, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=3000, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [145]:
# Display the predicted class labels for the held-out split.
clf.predict(X_test)

  if diff:


array([1, 1, 1, ..., 0, 0, 0])

In [146]:
# Check the score on the held-out split to see whether the model overfits.
# NOTE(review): the recorded value is ~0.9998; a held-out score alone cannot
# show overfitting (compare against the training score), and a value this
# high may indicate that a feature leaks the target - TODO confirm.
clf.score(X_test,y_test)

  if diff:


0.9998456668450445

In [147]:
# Load the table to predict on from a CSV path relative to the current working directory.
test = pd.read_csv('test_vistas_MFWv2.csv')

In [148]:
# Drop identifier and descriptive columns from the prediction frame.
test = test.drop(columns=[
    'idaviso',
    'idpostulante',
    'titulo',
    'denominacion_empresa',
    'idpais',
    'nombre_area',
])

In [149]:
# Drop the `mapacalle` column as well.
test = test.drop(columns=['mapacalle'])

In [150]:
# Parse the birth date into a datetime column; it is stored under the name
# `edadPostulante` and converted to a year difference in the next cell.
test['edadPostulante'] = datetime(test['fechanacimiento'],'%Y-%m-%d')

In [151]:
# Approximate age as 2018 minus the birth year (2018 is hard-coded).
# The vectorized `.dt.year` accessor replaces a per-row Python lambda;
# dates that failed to parse (NaT) still produce NaN.
test['edadPostulante'] = 2018 - test['edadPostulante'].dt.year

In [152]:
# The raw birth date is no longer needed once the age column exists.
test = test.drop(columns=['fechanacimiento'])

In [153]:
# Drop the `ciudad` column.
test = test.drop(columns=['ciudad'])

In [154]:
# Fill missing values in the prediction frame with a freshly fitted imputer.
# NOTE(review): the fill values are learned from `test` itself, not from the
# training data, and here imputation runs after the age column is computed
# whereas the training frame was imputed before it - the two frames are
# therefore preprocessed differently. TODO confirm this is intended.
test = DataFrameImputer().fit_transform(test)

In [155]:
# Show five random rows of the prepared prediction frame.
test.sample(5)

Unnamed: 0,id,nombre,estado,nombre_sort,sexo,descripcion,nivel_laboral,nombre_zona,tipo_de_trabajo,cantVistas,postulantes,cantPostulaciones,edadPostulante
67695,43924,Secundario,Graduado,1.0,FEM,1,Junior,Gran Buenos Aires,Part-time,0.0,0.0,0.0,26.0
18269,65449,Universitario,En Curso,3.0,MASC,0,Senior / Semi-Senior,Gran Buenos Aires,Full-time,2.0,157.0,308.0,25.0
49655,19194,Posgrado,En Curso,4.0,MASC,0,Senior / Semi-Senior,Gran Buenos Aires,Full-time,0.0,0.0,0.0,32.0
78597,63292,Universitario,En Curso,3.0,FEM,0,Senior / Semi-Senior,Gran Buenos Aires,Full-time,0.0,0.0,0.0,25.0
3477,19539,Universitario,Graduado,3.0,FEM,0,Senior / Semi-Senior,Gran Buenos Aires,Full-time,4.0,704.0,16.0,42.0


In [156]:
#Paso los valores a categoricos
test = test.astype(dict.fromkeys(
    ['nombre', 'estado', 'sexo', 'nombre_zona',
     'nivel_laboral', 'tipo_de_trabajo'],
    'category',
))


In [157]:
# Class probabilities for the same four features the model was trained on.
result = clf.predict_proba(
    test[['edadPostulante', 'cantVistas', 'cantPostulaciones', 'postulantes']]
)

In [158]:
    # Keep the second entry of every probability row
    # (presumably the positive class, se_postulo == 1 - TODO confirm).
    result2 = [row[1] for row in result]

In [159]:
# Attach the extracted probabilities to the prediction frame.
test['sepostulo'] = result2

In [160]:
# Keep only the row id and the predicted probability.
test = test.loc[:, ['id', 'sepostulo']]

In [161]:
# Show five random rows of the output frame.
# NOTE(review): every row in the recorded sample has the same probability
# (0.182416) - worth checking that the predictions vary as expected.
test.sample(5)

Unnamed: 0,id,sepostulo
48446,17357,0.182416
67653,43831,0.182416
52129,22469,0.182416
32624,44109,0.182416
97745,97745,0.182416


In [162]:
# Rough sanity check of the result: mean predicted probability.
test['sepostulo'].mean()

0.35088252579096896

In [163]:
# Write the id/probability table to CSV without the DataFrame index.
test.to_csv('Predictions/Lightgbm6Mv6.csv', index=False)