In [67]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm, tqdm_notebook
import time, os
import lightfm as lfm
from lightfm import data
from lightfm import cross_validation
from lightfm import evaluation
from lightfm.evaluation import auc_score, precision_at_k

## Carga de datos

In [51]:
df_test = pd.read_csv('./data/ejemplo_de_solucion.csv')
df_train = pd.read_csv('./data/postulaciones/postulaciones_train.csv')
print(df_test.shape)
print(df_train.shape)

(312464, 2)
(6468689, 3)


In [52]:
# Revisar postulantes_train_clean.csv
df_postulante = pd.read_csv("./data/postulantes/postulantes_train_clean.csv")
df_avisos = pd.read_csv("./data/avisos/avisos_filtrados.csv")
print(df_postulante.shape)
print(df_avisos.shape)

(283163, 7)
(18362, 7)


In [3]:
df_avisos.head()

Unnamed: 0,idaviso,titulo,nivel_laboral,nombre_area,denominacion_empresa,online_desde,online_hasta
0,8725750,VENDEDOR/A PROVINCIA DE SANTA FE,Senior / Semi-Senior,Comercial,VENTOR,2018-01-15,2018-02-10
1,17903700,Enfermeras,Senior / Semi-Senior,Salud,Farmacias Central Oeste,2018-03-20,2018-04-17
2,1000610287,CHOFER DE CAMIONETA BAHIA BLANCA - PUNTA ALTA,Senior / Semi-Senior,Transporte,Wurth Argentina S.A,2018-01-15,2018-03-17
3,1001135716,Vendedor Viajante TUCUMAN/SANTIAGO DEL ESTERO,Senior / Semi-Senior,Ventas,Wurth Argentina S.A,2018-01-15,2018-04-16
4,1001326344,Vendedor Viajante RECONQUISTA/AVELLANEDA,Senior / Semi-Senior,Ventas,Wurth Argentina S.A,2018-01-15,2018-03-27


In [4]:
df_avisos.denominacion_empresa.unique()

array(['VENTOR', 'Farmacias Central Oeste', 'Wurth Argentina S.A', ...,
       'ISAC', 'País Marcela', 'GO-BETWEEN RRHH'], dtype=object)

In [6]:
df_postulante.head()

Unnamed: 0,idpostulante,fechanacimiento,sexo,nombre,estado,idaviso,fechapostulacion
0,NM5M,1970-12-03,FEM,Secundario,Graduado,1112257047,2018-01-15 16:22:34
1,5awk,1962-12-04,FEM,Universitario,Graduado,1112237522,2018-01-25 18:55:03
2,ZaO5,1978-08-10,FEM,Terciario/Técnico,Graduado,1112286523,2018-01-24 15:07:39
3,NdJl,1969-05-09,MASC,Posgrado,En Curso,1112261212,2018-01-16 08:50:30
4,eo2p,1981-02-16,MASC,Secundario,Graduado,1112301117,2018-02-01 13:58:50


In [5]:
df_avisos.titulo.unique()
i_f=[]
for titulo in df_avisos.titulo.unique():
    i_f.append('titulo:'+titulo)
for nivel_laboral in df_avisos.nivel_laboral.unique():
    i_f.append('nivel_laboral:'+nivel_laboral)
for nombre_area in df_avisos.nombre_area.unique():
    i_f.append('nombre_area:'+nombre_area)
for de in df_avisos.denominacion_empresa.unique():
    i_f.append('denominacion_empresa:'+str(de))

In [7]:
# feature_name,feature_value posibilities
# res = str(x)+ ":" +str(y)
u_f=[]
u_f.append('sexo:FEM')
u_f.append('sexo:MASC')
u_f.append('sexo:NO_DECLARA')
u_f.append('nombre:Secundario')
u_f.append('nombre:Universitario')
u_f.append('nombre:Terciario/Técnico')
u_f.append('nombre:Posgrado')
u_f.append('nombre:Otro')
u_f.append('nombre:Master')
u_f.append('nombre:Doctorado')
u_f.append('estado:Graduado')
u_f.append('estado:En Curso')
u_f.append('estado:Abandonado')
u_f

['sexo:FEM',
 'sexo:MASC',
 'sexo:NO_DECLARA',
 'nombre:Secundario',
 'nombre:Universitario',
 'nombre:Terciario/Técnico',
 'nombre:Posgrado',
 'nombre:Otro',
 'nombre:Master',
 'nombre:Doctorado',
 'estado:Graduado',
 'estado:En Curso',
 'estado:Abandonado']

In [157]:
# we call fit to supply userid, item id and user/item features 
ds = lfm.data.Dataset()
ds.fit(users=df_train['idpostulante'].unique(), # list of all the users
       items=df_avisos["idaviso"].unique(), #list of all the items
       user_features = u_f) #additional user features
       #item_features = i_f) #additional item features
ds.interactions_shape()

(302787, 18362)

In [158]:
(interactions, weights) = ds.build_interactions(df_train[['idpostulante','idaviso']].itertuples(index=False))
interactions

<302787x18362 sparse matrix of type '<class 'numpy.int32'>'
	with 6468689 stored elements in COOrdinate format>

In [10]:
#interactions.todense()
#weights.todense()

In [159]:
#for titulo in df_avisos.titulo.unique():
#    i_f.append('titulo:'+titulo)
uf_list=[]
for row in tqdm_notebook(df_postulante.itertuples()):
    uf=[]
    uf.append('sexo:'+row.sexo)
    uf.append('nombre:'+row.nombre)
    uf.append('estado:'+row.estado)
    uf_list.append(uf)
#user_tuples    

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [160]:
user_tuple = list(zip(df_postulante.idpostulante, uf_list))

In [161]:
user_tuple[:10]

[('NM5M', ['sexo:FEM', 'nombre:Secundario', 'estado:Graduado']),
 ('5awk', ['sexo:FEM', 'nombre:Universitario', 'estado:Graduado']),
 ('ZaO5', ['sexo:FEM', 'nombre:Terciario/Técnico', 'estado:Graduado']),
 ('NdJl', ['sexo:MASC', 'nombre:Posgrado', 'estado:En Curso']),
 ('eo2p', ['sexo:MASC', 'nombre:Secundario', 'estado:Graduado']),
 ('Ez8J', ['sexo:MASC', 'nombre:Universitario', 'estado:Abandonado']),
 ('aOQq', ['sexo:MASC', 'nombre:Universitario', 'estado:Abandonado']),
 ('8BkL', ['sexo:FEM', 'nombre:Universitario', 'estado:En Curso']),
 ('1d2B', ['sexo:MASC', 'nombre:Universitario', 'estado:En Curso']),
 ('NPBx', ['sexo:MASC', 'nombre:Universitario', 'estado:En Curso'])]

In [162]:
user_features = ds.build_user_features(user_tuple, normalize= False)

In [None]:
#item_features = ds.build_user_features(user_tuple, normalize= False)

In [163]:
user_id_map, user_feature_map, item_id_map, item_feature_map = ds.mapping()
#user_feature_map

In [164]:
#Modelo
# TODO: Entrenar con item_features
# TODO: Probar no pasar sample_weight
t_0 = time.time()
model = lfm.LightFM(loss='warp')
#%time model.fit(interactions,user_features= user_features,sample_weight= weights,epochs=10,num_threads=8)
%time model.fit(interactions,user_features= user_features,epochs=10,num_threads=8)
print ("tiempo: ", time.time() - t_0)

CPU times: user 5min 49s, sys: 3.04 s, total: 5min 52s
Wall time: 1min 11s
tiempo:  71.6612982749939


In [56]:
#print("Train precision: %.2f" % precision_at_k(model, df_train, k=5).mean())

In [57]:
t_0 = time.time()
train_auc = auc_score(model,
                      interactions,
                      user_features=user_features
                     ).mean()
print('Hybrid training set AUC: %s' % train_auc)
print ("tiempo: ", time.time() - t_0)

Hybrid training set AUC: 0.9701069
tiempo:  304.5644133090973


In [181]:
# Prediccion para un usuario existente
user_x = user_id_map['1d2B']
n_users, n_items = interactions.shape # number of users * number of items
#predict_scores = model.predict(user_x, np.arange(n_items))
predict_scores = model.predict(user_x, np.arange(n_items), user_features= user_features) # means predict for all 
predict_scores

array([-1089.08349609, -1090.18408203, -1088.74243164, ...,
       -1089.08239746, -1089.98498535, -1089.12866211])

In [182]:
np.sort(predict_scores)[::-1][:10]

array([-1084.6973877 , -1084.98693848, -1085.08789062, -1085.10974121,
       -1085.15197754, -1085.1730957 , -1085.29797363, -1085.31628418,
       -1085.32531738, -1085.38427734])

In [183]:
# Me da las posiciones donde estan los avisos de mayor score   
posiciones_scores = np.argsort(predict_scores)[::-1][:10]
posiciones_scores

array([ 8559, 10374,  4614,  2717,  6209,  3915,  9007,  2041, 10693,
        2694])

In [184]:
items=df_avisos["idaviso"].unique()

In [185]:
items[posiciones_scores]

array([1112260111, 1112196813, 1112359836, 1112094756, 1112305277,
       1112296169, 1112319451, 1112347174, 1112206678, 1112033906])

In [186]:
df_train.loc[df_train['idpostulante']=='1d2B']

Unnamed: 0,idaviso,idpostulante,fechapostulacion
91,1112020573,1d2B,2018-02-19 07:01:34
92,1112325331,1d2B,2018-02-19 07:04:50
93,1112343340,1d2B,2018-02-26 21:54:31


In [187]:
df_train.loc[df_train['idpostulante']=='NM5M'].idaviso

0    1112257047
1    1111920714
2    1112346945
3    1112345547
Name: idaviso, dtype: int64

In [188]:
idpost_train = df_train.idpostulante
idpost_test = df_test.idpostulante
idpost_train_set = set(idpost_train.unique().tolist())
idpost_test_set = set(idpost_test.unique().tolist())

test_unique = (idpost_test_set - idpost_train_set)
test_intersect = (idpost_test_set - test_unique)
print("unicos de test: ",len(test_unique ))
print("compartidos con train: ", len(test_intersect))

unicos de test:  41204
compartidos con train:  115028


In [190]:
users = test_intersect
users_unknown = test_unique

In [200]:
# TODO: Si el PRONOSTICADO ya esta en TRAIN, hay que SACARLO!!!
scored_list=[]
for user in tqdm_notebook(users):
    user_x = user_id_map[user]
    predict_scores = model.predict(user_x, np.arange(n_items), user_features= user_features)
    # TODO: Validos solo los que estan en ABRIL.
    avisos = items[np.argsort(predict_scores)[::-1][:10]]
    for aviso in avisos:
        user_aviso=dict()
        #user_aviso[user] = aviso
        user_aviso["idaviso"] = aviso
        user_aviso["idpostulante"] = user
        scored_list.append(user_aviso)

#scored_list

HBox(children=(FloatProgress(value=0.0, max=115028.0), HTML(value='')))




In [201]:
#value=scored_list[:10][0].keys()
df_modelo = pd.DataFrame(scored_list)
df_modelo.head()


Unnamed: 0,idaviso,idpostulante
0,1112334791,Dr6ArVA
1,1112033906,Dr6ArVA
2,1112204682,Dr6ArVA
3,1112334788,Dr6ArVA
4,1112260584,Dr6ArVA


In [202]:
scored_list_unknow = []

for user_unknown in tqdm_notebook(users_unknown):
    for index in range(10):
        user_aviso=dict()
        user_aviso["idaviso"] = random.choices(avisos)[0]
        user_aviso["idpostulante"] = user_unknown
        scored_list_unknow.append(user_aviso)

#scored_list_unknow

HBox(children=(FloatProgress(value=0.0, max=41204.0), HTML(value='')))




In [203]:
df_modelo_unknow = pd.DataFrame(scored_list_unknow)
df_modelo_unknow.head()

Unnamed: 0,idaviso,idpostulante
0,1112350601,PmGvRbb
1,1112099930,PmGvRbb
2,1112260111,PmGvRbb
3,1112281808,PmGvRbb
4,1112099930,PmGvRbb


In [209]:
df_modeloligthFM = df_modelo.append(df_modelo_unknow)
df_modeloligthFM.head()

Unnamed: 0,idaviso,idpostulante
0,1112334791,Dr6ArVA
1,1112033906,Dr6ArVA
2,1112204682,Dr6ArVA
3,1112334788,Dr6ArVA
4,1112260584,Dr6ArVA


In [210]:
len(df_modeloligthFM.idpostulante)

1562320

In [211]:
len(df_modeloligthFM.idpostulante.unique())

156232

In [212]:
len(df_modeloligthFM.idaviso.unique())

2527

In [213]:
df_modeloligthFM.to_csv("./submissions/modeloligthFM_2.csv", index=False)
# Resultado Kaggle: 