In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
import time, os
import lightfm as lfm
import pickle
from lightfm import data
from lightfm import cross_validation
from lightfm import evaluation
from lightfm.evaluation import auc_score, precision_at_k    

## Carga de datos y ETL

In [2]:
# Load the sample-submission file (test applicants) and the training applications,
# then report the shape of each frame in that order.
df_test, df_train = (
    pd.read_csv(ruta)
    for ruta in (
        './data/ejemplo_de_solucion.csv',
        './data/postulaciones/postulaciones_train.csv',
    )
)
for frame in (df_test, df_train):
    print(frame.shape)

(312464, 2)
(6468689, 3)


In [3]:
# Load the applicant attributes and the (already filtered) job ads, printing both shapes.
RUTA_POSTULANTES = "./data/postulantes/postulantes_justInTrain.csv"
RUTA_AVISOS = "./data/avisos/avisos_filtrados.csv"

df_postulante = pd.read_csv(RUTA_POSTULANTES)
df_avisos = pd.read_csv(RUTA_AVISOS)
print(df_postulante.shape, df_avisos.shape, sep="\n")

(283163, 5)
(18362, 9)


In [4]:
# Single-column frame with every applicant id present in train or test.
# Stacking first and de-duplicating once keeps the first occurrence of each id,
# train rows before test rows.
users_total = pd.concat(
    [df_train[['idpostulante']], df_test[['idpostulante']]]
).drop_duplicates()

In [5]:
# Replace missing values with the empty string so the string concatenations that build
# the item features further down do not fail on NaN.
# The original note mentions the 6 NaN in denominacion_empresa, but this call is not
# restricted to that column: it replaces NaN in every column of df_avisos.
df_avisos = df_avisos.replace(np.nan, '', regex=True)

In [6]:
# Split the test applicants into those never seen in train and those shared with train.
idpost_train = df_train.idpostulante
idpost_test = df_test.idpostulante
idpost_train_set = set(idpost_train.tolist())
idpost_test_set = set(idpost_test.tolist())

# Test-only ids, and ids that also appear in train.
test_unique = idpost_test_set.difference(idpost_train_set)
test_intersect = idpost_test_set.intersection(idpost_train_set)
print("unicos de test: ", len(test_unique))
print("compartidos con train: ", len(test_intersect))

unicos de test:  41204
compartidos con train:  115028


In [7]:
# Distinct test applicants, in order of first appearance, plus shorter aliases for the
# two groups computed above (seen in train / not seen in train).
user_test_totales = df_test['idpostulante'].drop_duplicates()
users, users_unknown = test_intersect, test_unique
user_test_totales.shape[0]

156232

In [8]:
# Ads whose online_hasta value is '2018-04-01' or later, kept both as a list and as a
# set (the set is used for membership tests). The comparison is done on the raw
# column values against the string literal.
ids_vigentes = df_avisos.loc[df_avisos['online_hasta'] >= '2018-04-01', 'idaviso']
avisos_abril_list = ids_vigentes.tolist()
avisos_abril = set(avisos_abril_list)
print(len(avisos_abril), len(df_avisos), sep="\n")

6722
18362


In [9]:
# Spot-check the April filter with two known ad ids.
for idaviso_prueba in (1112393398, 1112403164):
    print(idaviso_prueba in avisos_abril)

True
False


In [10]:
# Keep only the (ad, applicant) columns of train and list the ads one sample applicant
# applied to.
avisos_train_bl = df_train[['idaviso', 'idpostulante']]
es_postulante_muestra = avisos_train_bl['idpostulante'] == 'NM5M'
avisos_train_bl.loc[es_postulante_muestra, 'idaviso'].tolist()
# Other applicant ids that can be used for the same check:
#['NM5M', '5awk', 'ZaO5', 'NdJl', 'eo2p', 'Ez8J', 'aOQq', '8BkL','1d2B', '1QLLO', 'YjGMZ']

[1112257047, 1111920714, 1112346945, 1112345547]

In [11]:
# Dictionary mapping each applicant id to the array of ad ids they applied to in train
# (duplicates in train are kept as they appear).
postulations_list = {
    postulante: grupo.idaviso.values
    for postulante, grupo in tqdm_notebook(avisos_train_bl.groupby('idpostulante'))
}

HBox(children=(FloatProgress(value=0.0, max=302787.0), HTML(value='')))




In [12]:
# Show the stored applications of two sample applicants.
for postulante_muestra in ('5awk', 'YjGMZ'):
    print(postulations_list.get(postulante_muestra))

[1112237522 1112277633 1112302347 1112291495 1112304011 1112303543
 1112315393 1112315188 1112346738 1112330625 1112306543 1112315170
 1111799928 1112323355 1112305358]
[1112228888 1112359096 1112349723 1112308241 1112228888 1112359096
 1112349723 1112308241 1112373863 1112402536]


### Creación de Item Features y User Features

In [13]:
# Vocabulary of item features handed to Dataset.fit: one "<column>:<value>" label for
# every distinct value of each descriptive column of the ads.
i_f = []
for columna in ('titulo', 'nivel_laboral', 'nombre_area'):
    i_f.extend([columna + ':' + valor for valor in df_avisos[columna].unique()])
# Company names go through str() before being concatenated.
i_f.extend(
    ['denominacion_empresa:' + str(empresa) for empresa in df_avisos.denominacion_empresa.unique()]
)

In [14]:
# Vocabulary of user features handed to Dataset.fit: "<prefix><value>" for every
# distinct value of the sexo, nombre and estado columns, in that order.
u_f = [
    prefijo + valor
    for prefijo, valores in (
        ('sexo:', df_postulante.sexo.unique()),
        ('nombre:', df_postulante.nombre.unique()),
        ('estado:', df_postulante.estado.unique()),
    )
    for valor in valores
]

### Primer fit con todos los datos
- Creación de la matriz de interacciones
- Creación de user_tuples + item_tuples

In [15]:
# Initial fit of the LightFM Dataset: registers every user id, every item id and the
# user/item feature vocabularies so that internal indices can be assigned.
ds = lfm.data.Dataset()
argumentos_fit = dict(
    users=users_total['idpostulante'].unique(),   # all applicants (train + test)
    items=df_avisos["idaviso"].unique(),          # all ads
    user_features=u_f,                            # user feature labels
    item_features=i_f,                            # item feature labels
)
ds.fit(**argumentos_fit)
ds.interactions_shape()

(343991, 18362)

In [16]:
# Build the interaction and weight matrices from the (applicant, ad) pairs of train.
pares_postulante_aviso = df_train[['idpostulante', 'idaviso']].itertuples(index=False)
interactions, weights = ds.build_interactions(pares_postulante_aviso)
interactions

<343991x18362 sparse matrix of type '<class 'numpy.int32'>'
	with 6468689 stored elements in COOrdinate format>

In [17]:
# Build the (idpostulante, [feature labels]) pairs consumed by build_user_features.
# Each applicant row contributes its sexo, nombre and estado labels, in that order.
uf_list = [
    ['sexo:' + fila.sexo, 'nombre:' + fila.nombre, 'estado:' + fila.estado]
    for fila in tqdm_notebook(df_postulante.itertuples())
]
user_tuple = list(zip(df_postulante.idpostulante, uf_list))
user_tuple[:2]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[('NM5M', ['sexo:FEM', 'nombre:Secundario', 'estado:Graduado']),
 ('5awk', ['sexo:FEM', 'nombre:Universitario', 'estado:Graduado'])]

In [18]:
# Build the (idaviso, [feature labels]) pairs consumed by build_item_features.
# The per-row list has its own name on purpose: binding it to `i_f` would overwrite the
# item-feature vocabulary that is passed to ds.fit, so re-running the fit cell after this
# one would register only the four labels of the last ad.
if_list = []
# total= lets the progress bar show real progress; itertuples() has no length of its own.
for row in tqdm_notebook(df_avisos.itertuples(), total=len(df_avisos)):
    feats_aviso = [
        'titulo:' + row.titulo,
        'nivel_laboral:' + row.nivel_laboral,
        'nombre_area:' + row.nombre_area,
        # str() mirrors how the vocabulary entry for this column is built.
        'denominacion_empresa:' + str(row.denominacion_empresa),
    ]
    if_list.append(feats_aviso)
item_tuple = list(zip(df_avisos.idaviso, if_list))
item_tuple[:2]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[(8725750,
  ['titulo:VENDEDOR/A PROVINCIA DE SANTA FE',
   'nivel_laboral:Senior / Semi-Senior',
   'nombre_area:Comercial',
   'denominacion_empresa:VENTOR']),
 (17903700,
  ['titulo:Enfermeras',
   'nivel_laboral:Senior / Semi-Senior',
   'nombre_area:Salud',
   'denominacion_empresa:Farmacias Central Oeste'])]

In [19]:
# Turn the (id, labels) pairs into the user and item feature matrices, without
# normalisation.
user_features, item_features = (
    ds.build_user_features(user_tuple, normalize=False),
    ds.build_item_features(item_tuple, normalize=False),
)

In [20]:
# Retrieve the four mappings kept by the Dataset and name each one.
mapeos = ds.mapping()
user_id_map, user_feature_map, item_id_map, item_feature_map = mapeos

## Modelo LightFM
- Función del modelo con parámetros + fit
- Evaluación local
- Guardado del modelo con pickle, si se justifica

In [66]:
#### Best model ####
# Training/evaluation code kept commented out; the timings and score recorded below
# come from a previous run of these three lines.
#model = lfm.LightFM(no_components=40, loss='warp', random_state=42)
#%time model.fit(interactions,epochs=90,num_threads=8)
#%time lfm.evaluation.precision_at_k(model, interactions, k=10, num_threads=8).mean()
# NOTE: precision@10 is computed on `interactions`, the same matrix passed to fit, so the
# score below is a training-set figure, not a held-out one.

## CPU times: user 38min 51s, sys: 12.1 s, total: 39min 3s
## Wall time: 6min 24s
## CPU times: user 10min 41s, sys: 780 ms, total: 10min 41s
## Wall time: 1min 42s
## 0.35895196

CPU times: user 38min 51s, sys: 12.1 s, total: 39min 3s
Wall time: 6min 24s
CPU times: user 10min 41s, sys: 780 ms, total: 10min 41s
Wall time: 1min 42s


0.35895196

In [82]:
# Save y Load de modelos en pickle.
#pickle.dump(model_13, open('./data/models/warp_90epochs_40components', "wb"))
model = pickle.load(open('./data/models/warp_90epochs_40components', "rb"))
%time lfm.evaluation.precision_at_k(model, interactions, k=10, num_threads=8).mean()

## Predicción

- Predicción unitaria de prueba

In [41]:
# Single prediction for an applicant that exists in train: score every item for them.
user_x = user_id_map['1d2B']
n_users, n_items = interactions.shape  # (number of users, number of items)
indices_items = np.arange(n_items)
# Alternative call that also passes the feature matrices:
#predict_scores = model.predict(user_x, np.arange(n_items), user_features= user_features, item_features= item_features)
predict_scores = model.predict(user_x, indices_items)
predict_scores

array([-0.62829709, -2.38207245,  0.30956799, ..., -2.24508238,
       -1.77208209, -2.11038733])

In [42]:
# Ten highest scores, best first (reverse slice over the ascending sort).
np.sort(predict_scores)[:-11:-1]

array([3.31741476, 3.24790478, 3.12483549, 3.05813098, 2.95095873,
       2.94779158, 2.85524487, 2.73456693, 2.7138629 , 2.6743021 ])

In [43]:
# Positions (item indices) of the ten highest-scoring ads, best first.
orden_ascendente = np.argsort(predict_scores)
posiciones_scores = orden_ascendente[:-11:-1]
posiciones_scores

array([ 9358,  2423,  6837, 10881,   149, 10423,  4123,  2440,  1762,
       11839])

In [44]:
# Translate score positions back to ad ids using the distinct idaviso values of df_avisos.
# NOTE(review): this assumes position i in this array is the item the model scores at
# index i; the same array was given to ds.fit as `items` — confirm the order matches.
items = pd.unique(df_avisos["idaviso"])
items.take(posiciones_scores)

array([1112325331, 1112352365, 1112368896, 1112210546, 1112020573,
       1112202363, 1112299495, 1112352852, 1112343340, 1112274759])

In [45]:
# Training applications of the same applicant, to compare against the ranking above.
es_usuario_1d2b = df_train.idpostulante == '1d2B'
df_train[es_usuario_1d2b]

Unnamed: 0,idaviso,idpostulante,fechapostulacion
91,1112020573,1d2B,2018-02-19 07:01:34
92,1112325331,1d2B,2018-02-19 07:04:50
93,1112343340,1d2B,2018-02-26 21:54:31


In [46]:
def enAbrilYNoPostulado(ranking_list, avisos_abril, avisos, user, top_n=10):
    """Fill ``ranking_list`` with the best-ranked ads that are valid for ``user``.

    Walks ``avisos`` in the given order (best first) and appends every ad that is in
    ``avisos_abril`` and that the user has not already applied to, according to the
    notebook-level ``postulations_list`` dict. Users missing from that dict have no
    previous applications to exclude.

    Parameters
    ----------
    ranking_list : list
        List that receives the selected ad ids; it is modified in place.
    avisos_abril : container
        Ad ids that may be recommended; only membership tests are performed.
    avisos : iterable
        Candidate ad ids, ordered from best to worst score.
    user : hashable
        Applicant id used as key in ``postulations_list``.
    top_n : int, optional
        Length at which ``ranking_list`` is considered full (default 10).

    Returns
    -------
    list
        The same ``ranking_list`` object, holding at most ``top_n`` entries unless it
        already held more when passed in.
    """
    # One dictionary lookup per call, and a set for constant-time membership tests
    # instead of scanning the stored array for every candidate ad.
    previas = postulations_list.get(user)
    ya_postulados = set(previas) if previas is not None else set()

    for aviso in avisos:
        # Checked before appending and with >=, so a list that arrives full
        # (or over-full) is never extended.
        if len(ranking_list) >= top_n:
            break
        if aviso in avisos_abril and aviso not in ya_postulados:
            ranking_list.append(aviso)
    return ranking_list

In [47]:
# Score every item for one applicant and keep the 100 best ad ids, best first.
user_x = user_id_map['akO24jJ']
# Alternative call that also passes the feature matrices:
#predict_scores = model.predict(user_x, np.arange(n_items), user_features= user_features, item_features= item_features)
predict_scores = model.predict(user_x, np.arange(n_items))
mejores_posiciones = np.argsort(predict_scores)[:-101:-1]
avisos = items[mejores_posiciones]

In [48]:
# Try the filter on the ranked ads computed in the previous cell.
# NOTE(review): the applicant id passed here ('erL9') is not the one the scores were
# predicted for ('akO24jJ'), so the "already applied" exclusion is taken from a
# different user — confirm whether that is intentional.
avisos_dummy = enAbrilYNoPostulado([], avisos_abril, avisos, 'erL9')
avisos_dummy

[1112208702,
 1112334788,
 1112208765,
 1112208550,
 1112208794,
 1112334791,
 1112397527,
 1112380129,
 1112391017,
 1112033906]

- Predicción general

In [79]:
# Generate the predictions for every test applicant, keeping only ads that are valid in
# April and, for applicants present in train, ads they have not already applied to.

# Loop-invariant inputs are built once instead of once per applicant.
indices_todos_items = np.arange(interactions.shape[1])
# Internal item index -> idaviso, taken from the Dataset mapping itself so this cell
# does not depend on variables left behind by the exploratory cells above.
idaviso_por_indice = np.array(sorted(item_id_map, key=item_id_map.get))

scored_list = []
for user in tqdm_notebook(user_test_totales):
    user_x = user_id_map[user]
    # Alternative call that also passes the feature matrices:
    #predict_scores = model.predict(user_x, np.arange(n_items), user_features= user_features, item_features= item_features)
    predict_scores = model.predict(user_x, indices_todos_items)
    # All ad ids ordered from highest to lowest score.
    avisos_full = idaviso_por_indice[np.argsort(predict_scores)[::-1]]
    # Keep the best-ranked ads that pass the April / not-yet-applied filter.
    top_avisos = enAbrilYNoPostulado([], avisos_abril, avisos_full, user)
    scored_list.extend(
        {"idaviso": aviso, "idpostulante": user} for aviso in top_avisos
    )

HBox(children=(FloatProgress(value=0.0, max=156232.0), HTML(value='')))




### Submission

In [80]:
# Turn the scored records into the submission frame and print its size: total rows,
# distinct applicants and distinct ads, followed by an empty line.
df_modelo = pd.DataFrame(scored_list)
resumen = (
    df_modelo.idpostulante.size,
    df_modelo.idpostulante.unique().size,
    df_modelo.idaviso.unique().size,
)
for cantidad in resumen:
    print(cantidad)
print()
df_modelo.head()

1562320
156232
4106



Unnamed: 0,idaviso,idpostulante
0,1112312666,ZaO5
1,1112397792,ZaO5
2,1112377249,ZaO5
3,1112398171,ZaO5
4,1112377749,ZaO5


In [81]:
# Write the submission file. The target folder is created first so that the write
# cannot fail on a missing directory after the long prediction loop.
os.makedirs("./submissions", exist_ok=True)
df_modelo.to_csv("./submissions/submission_model.csv", index=False)