In [2]:
import pandas as pd
import time
from datetime import datetime
import scipy.spatial
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import HuberRegressor 
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
postulaciones = pd.read_csv('Data/Postulaciones/Postulaciones-merge.csv')

In [4]:
postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8311264 entries, 0 to 8311263
Data columns (total 3 columns):
idaviso             int64
idpostulante        object
fechapostulacion    object
dtypes: int64(1), object(2)
memory usage: 190.2+ MB


In [30]:
# Intento reducir el uso de memoria
postulaciones.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8311264 entries, 0 to 8311263
Data columns (total 3 columns):
idaviso             int64
idpostulante        object
fechapostulacion    object
dtypes: int64(1), object(2)
memory usage: 1.1 GB


In [31]:
for dtype in ['int','object']:
    selected_dtype = postulaciones.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb))

Average memory usage for int columns: 31.70 MB
Average memory usage for object columns: 369.36 MB


In [37]:
import numpy as np
int_types = ["uint8","uint32", "int8", "int16","int32","int64"]
for it in int_types:
    print(np.iinfo(it))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for uint32
---------------------------------------------------------------
min = 0
max = 4294967295
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------

In [34]:
# We're going to be calculating memory usage a lot,
# so we'll create a function to save us some time!

def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)

postulaciones_int = postulaciones.select_dtypes(include=['int'])
converted_int = postulaciones_int.apply(pd.to_numeric,downcast='unsigned')

print(mem_usage(postulaciones_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([postulaciones_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)

63.41 MB
31.71 MB


Unnamed: 0,before,after
uint32,,1.0
int64,1.0,


In [38]:
optimized_postulaciones = postulaciones.copy()

optimized_postulaciones[converted_int.columns] = converted_int

print(mem_usage(postulaciones))
print(mem_usage(optimized_postulaciones))

1171.48 MB
1139.78 MB


In [None]:
postulaciones_obj = postulaciones.select_dtypes(include=['object']).copy()
postulaciones_obj.describe()

# Obtengo postulantes no duplicados haber si asi aumento la cantidad

In [5]:
postulantes_en_postulaciones = postulaciones.drop_duplicates(subset='idpostulante', keep='first')

In [6]:
postulantes_en_postulaciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348883 entries, 0 to 8311263
Data columns (total 3 columns):
idaviso             348883 non-null int64
idpostulante        348883 non-null object
fechapostulacion    348883 non-null object
dtypes: int64(1), object(2)
memory usage: 10.6+ MB


In [7]:
postulantes_en_postulaciones.drop(['idaviso','fechapostulacion'],inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [8]:
postulantes_educacion = pd.read_csv('Data/Postulantes-Educacion/postulantes_educacion-merge.csv')

In [9]:
postulantes_educacion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447909 entries, 0 to 447908
Data columns (total 4 columns):
idpostulante       447909 non-null object
nombre             447909 non-null object
estado             447909 non-null object
nombre_numerico    447909 non-null int64
dtypes: int64(1), object(3)
memory usage: 13.7+ MB


In [10]:
postulantes_genero_edad = pd.read_csv('Data/Postulantes-Genero-Edad/postulantes_genero_y_edad-merge.csv')

In [11]:
postulantes_genero_edad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504407 entries, 0 to 504406
Data columns (total 3 columns):
idpostulante       504407 non-null object
fechanacimiento    477884 non-null object
sexo               504407 non-null object
dtypes: object(3)
memory usage: 11.5+ MB


In [12]:
#Hago un merge de educacion y genero-edad
postulantes_merge = pd.merge(postulantes_genero_edad,postulantes_educacion,how='left',on='idpostulante')

In [13]:
postulantes_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504407 entries, 0 to 504406
Data columns (total 6 columns):
idpostulante       504407 non-null object
fechanacimiento    477884 non-null object
sexo               504407 non-null object
nombre             447909 non-null object
estado             447909 non-null object
nombre_numerico    447909 non-null float64
dtypes: float64(1), object(5)
memory usage: 26.9+ MB


In [14]:
#Agrego postulantes que no esten en el merge anterior
postulantes_merge2 = pd.merge(postulantes_merge,postulantes_en_postulaciones,how='outer')

In [15]:
postulantes_merge2.drop_duplicates()

Unnamed: 0,idpostulante,fechanacimiento,sexo,nombre,estado,nombre_numerico
0,6MM,1985-01-01,MASC,,,
1,Nzz,,NO_DECLARA,,,
2,ZX1,,NO_DECLARA,,,
3,Nq5,,NO_DECLARA,,,
4,ebE,1952-07-07,MASC,Posgrado,Graduado,4.0
5,N1x,,NO_DECLARA,,,
6,52aw,,NO_DECLARA,,,
7,NAjM,1962-06-09,FEM,,,
8,eRk9,,NO_DECLARA,,,
9,a6OE,,NO_DECLARA,,,


# Avisos detalle

In [16]:
avisos_detalle = pd.read_csv('Data/Avisos-Detalle/Avisos-Detalle-merge.csv')

In [17]:
avisos_detalle.rename(index=str, columns={"idaviso": "idAviso"},inplace=True)
avisos_detalle.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45969 entries, 0 to 45968
Data columns (total 11 columns):
idAviso                 45969 non-null int64
idpais                  45969 non-null int64
titulo                  45969 non-null object
descripcion             45969 non-null object
nombre_zona             45969 non-null object
ciudad                  225 non-null object
mapacalle               3444 non-null object
tipo_de_trabajo         45969 non-null object
nivel_laboral           45634 non-null object
nombre_area             45969 non-null object
denominacion_empresa    45955 non-null object
dtypes: int64(2), object(9)
memory usage: 4.2+ MB


In [None]:
for dtype in ['float','int','object']:
    selected_dtype = avisos_detalle.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype,mean_usage_mb

# Avisos online

In [18]:
avisos_online = pd.read_csv('Data/Avisos-Online/Avisos-Online-merge.csv')

In [19]:
avisos_online.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9886 entries, 0 to 9885
Data columns (total 1 columns):
idaviso    9886 non-null int64
dtypes: int64(1)
memory usage: 77.3 KB


# Vistas

In [20]:
vistas = pd.read_csv('Data/Vistas/Vistas-merge.csv')

In [None]:
vistas.info()

# Quito duplicados para luego ver de que tipo es cada aviso
unique_advises = vistas.drop_duplicates(subset='idAviso', keep='first')

unique_advises.info()

# Agrego la informacion de avisos detalles
unique_advises = pd.merge(unique_advises,avisos_detalle,how='left',on='idAviso')

unique_advises = unique_advises[unique_advises['tipo_de_trabajo'].notnull()]
unique_advises.info()

unique_postulantes_vistas = vistas.drop_duplicates(subset='idpostulante', keep='first')

unique_postulantes_vistas.info()

groupby_aviso_postulante = vistas.groupby(['idpostulante','idAviso']).size()

asd = groupby_aviso_postulante['0002q']
aa = asd.index
aa.get_values()

In [21]:
# Creo funcion para recuperar lista de TIPOS DE TRABAJO de avisos que le interesan a un postulante de acuerdo
# a lo que visita y se postula

def obtener_lista_de_intereses(idpostulante):
    lista = []
    lista_de_ids_avisos_postulados = obtener_lista_de_ids_avisos_postulados(idpostulante)
    for id in lista_de_ids_avisos_postulados:
        tipo_de_trabajo = avisos_detalle[avisos_detalle['idAviso'] == id]['tipo_de_trabajo']
        lista.append(tipo_de_trabajo)
    lista_de_ids_avisos_visitados = obtener_lista_de_ids_avisos_visitados(idpostulante)
    if (lista_de_ids_avisos_visitados is not None):
        for id in lista_de_ids_avisos_visitados:
            tipo_de_trabajo = avisos_detalle[avisos_detalle['idAviso'] == id]['tipo_de_trabajo']
            lista.append(tipo_de_trabajo)    
    return lista
    
# Creo funcion para recuperar los ids de avisos a los que se postula un postulante
def obtener_lista_de_ids_avisos_postulados(idpostulante):
    data = postulaciones[postulaciones['idpostulante'] == idpostulante]
    return data['idaviso'].tolist()

# Creo funcion para recuperar los ids de avisos que visita un postulante
def obtener_lista_de_ids_avisos_visitados(idpostulante):
    try:
        return groupby_aviso_postulante.pop(idpostulante).index.get_values()
    except KeyError:
        return None

In [20]:
# Agrego columna con cantidad de postulaciones por postulante
serie_postulaciones = postulaciones['idpostulante'].value_counts()
def cant_postulaciones_postulante(idpostulante):
    try:
        return serie_postulaciones[idpostulante]
    except KeyError:
        return 0

In [22]:
groupby_aviso_postulante = vistas.groupby(['idpostulante','idAviso']).size()

In [None]:
groupby_aviso_postulante.

In [None]:
postulantes_merge2['intereses'] = None
postulantes_merge2.loc[:,'intereses'] = postulantes_merge2['idpostulante'].apply(obtener_lista_de_intereses)

In [None]:
postulantes_merge2['cant_postulaciones_a_avisos'] = 0
postulantes_merge2.loc[:,'cant_postulaciones_a_avisos'] = postulantes_merge2['idpostulante'].apply(cant_postulaciones_postulante)

# Avisos online

# Training test

In [None]:
trainingSet = 

# Testing set

In [22]:
testingSet = pd.read_csv('Data/test_final_100k.csv')

In [49]:
testingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
id              100000 non-null int64
idaviso         100000 non-null int64
idpostulante    100000 non-null object
dtypes: int64(2), object(1)
memory usage: 2.3+ MB


In [50]:
# Agrego informacion al set de testing con los datos que contamos

In [24]:
testingSet.drop_duplicates(subset='idaviso', keep='first')

Unnamed: 0,id,idaviso,idpostulante
0,0,739260,6M9ZQR
3,3,758580,1Q35ej
6,6,776420,aZJ2XN
9,9,820850,6ZBD33
13,13,914880,6q6eNl
16,16,1112360,6kRZ44
20,20,1141070,edaNB4
23,23,1144960,NEJjvL
26,26,1145170,Y9d5Bw
29,29,1176260,ZvQJ2z
