In [4]:
from sys import path
path.append("/home/ec2-user/SageMaker/data-science-development/utils")
path.append("/home/ec2-user/SageMaker/data-science-development/config")

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

import s3fs
import os
import torch
import random
import json
# import sktime

from torch import nn
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from redshift import get_from_redshift
from datetime import datetime
from config import Config  
from collections import defaultdict, Counter
from tqdm import tqdm 
from itertools import zip_longest

tqdm.pandas()

# Loading data

### Work experience

In [5]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_work_experience) 

In [6]:
candidate_work_experience = get_from_redshift(sql_path, Config.redshift_creds)
candidate_work_experience.head()

Unnamed: 0,candidate_id,unique_id,date_start_job,date_end_job,function_id,isco_functie_niveau,isco_code4,function_name_self,company_name,source
0,89030,187756,2005-04-04 00:00:00,2005-04-13,1046.0,2.0,5246.0,,Compass Group NL/Prénatal,plaatsing
1,89030,187818,2005-04-08 00:00:00,2005-08-07,1046.0,2.0,5246.0,,Albron Cat. / 3381 ASM Europe BV,plaatsing
2,89030,1290468,2005-09-12 00:00:00,2005-11-10,1046.0,2.0,5246.0,,Albron cat.3695/ Schuitema,plaatsing
3,89030,1285504,2005-09-12 00:00:00,2005-11-10,1046.0,2.0,5246.0,,Albron cat.3695/ Schuitema,plaatsing
4,89030,1506287,2005-09-12 00:00:00,2005-11-10,1046.0,2.0,5246.0,,Paresto/tripvzoudtland/09008 (9.8),plaatsing


### Education

In [7]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_education) 

In [8]:
candidate_education = get_from_redshift(sql_path, Config.redshift_creds)
candidate_education.head()

Unnamed: 0,candidate_id,education_level,date_start,date_end,date_registrated,passed
0,239187,3,1985-09-01 00:00:00,1987-06-01 00:00:00,2005-08-16 16:53:06,1
1,109774,2,2001-09-01 00:00:00,2004-09-01 00:00:00,2005-07-26 17:50:44,0
2,179574,3,2000-09-01 00:00:00,2001-06-01 00:00:00,2005-08-09 18:28:17,1
3,239357,2,1998-09-01 00:00:00,2002-06-08 00:00:00,2005-08-16 16:53:40,1
4,294921,3,2001-09-01 00:00:00,2005-08-31 00:00:00,2005-08-17 13:12:58,1


### Skills

In [None]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_skills) 

#### What's date start here? 

In [None]:
candidate_skill = get_from_redshift(sql_path, Config.redshift_creds)
candidate_skill.head()

### Languages

In [None]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_languages) 

In [None]:
candidate_languages = get_from_redshift(sql_path, Config.redshift_creds)
candidate_languages.head()

In [None]:
lang_to_idx = {v : i for i, v in enumerate(sorted(candidate_languages["language_id"].astype(float).unique()))}
idx_to_lang = {i : v for i, v in lang_to_idx.items()}
candidate_languages["language_id"] = candidate_languages["language_id"].progress_apply(lambda x: lang_to_idx[x])

In [None]:
languages_oh = pd.crosstab(candidate_languages["candidate_id"],
                           candidate_languages["language_id"])

In [27]:
languages_oh.to_csv("../Data/languages_one-hot.csv")

### Addresses

In [18]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_addresses) 

In [19]:
candidate_addresses = get_from_redshift(sql_path, Config.redshift_creds)
candidate_addresses.head()

Unnamed: 0,candidate_id,from_post_code,date_start,date_end
0,2065656,9714 JV,2016-02-05 00:00:00,2016-12-26 00:00:00
1,2674423,1621 JW,2014-09-01 00:00:00,2015-12-16 00:00:00
2,7301821,3064 AW,2018-01-25 00:00:00,
3,7367386,7141 WE,2018-03-26 00:00:00,
4,1613716,7731 CA,2011-03-15 00:00:00,


In [20]:
candidate_addresses.shape

(4148409, 4)

In [21]:
# Drop nonsense rows
candidate_addresses = candidate_addresses[(candidate_addresses["date_start"] >= dt.datetime(1950, 1, 1)) &
                                          (candidate_addresses["date_start"] <= dt.datetime(2025, 1, 1))]

In [22]:
candidate_addresses.shape

(4148259, 4)

In [23]:
# Only store for candidates we will actually be using
candidate_addresses = candidate_addresses[candidate_addresses["candidate_id"].isin(
    set(candidate_work_experience["candidate_id"].unique()))]

In [24]:
candidate_addresses.shape

(770495, 4)

In [25]:
most_recent_address = candidate_addresses.groupby("candidate_id")["date_start"].progress_apply(max)

100%|██████████| 528183/528183 [02:00<00:00, 4381.98it/s]


In [26]:
# TODO: do this in a way that isn't hideous
makeshift_index = set((most_recent_address.index.astype(str) + most_recent_address.astype(str)).values)

In [27]:
candidate_addresses.loc[:,"temp"] = candidate_addresses["candidate_id"].astype(str) + candidate_addresses["date_start"].astype(str)

In [28]:
candidate_addresses = candidate_addresses[candidate_addresses["temp"].isin(makeshift_index)].drop("temp", axis=1)

In [29]:
candidate_addresses["from_post_code"] = candidate_addresses["from_post_code"].str.slice(0, 4)

In [30]:
candidate_final_addresses = candidate_addresses.groupby("candidate_id")["from_post_code"].progress_apply(lambda x: x.values[-1])

100%|██████████| 528183/528183 [00:13<00:00, 38298.87it/s]


In [31]:
zip_to_idx = {v : i for i, v in enumerate(sorted(candidate_final_addresses.unique()))}
idx_to_zip = {i : v for i, v in zip_to_idx.items()}
candidate_final_addresses = candidate_final_addresses.progress_apply(lambda x: zip_to_idx[x])

100%|██████████| 528183/528183 [00:00<00:00, 604613.16it/s]


In [35]:
addresses_one_hot = pd.get_dummies(candidate_final_addresses)

addresses_one_hot.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4747,4748,4749,4750,4751,4752,4753,4754,4755,4756
candidate_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
84731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85437,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
85627,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# DIY Sparse matrix
compressed_addresses_one_hot = addresses_one_hot.idxmax(axis=1)

In [38]:
compressed_addresses_one_hot.to_csv("../Data/addresses_one-hot.csv")

In [46]:
for k, v in compressed_addresses_one_hot.items():
    print(k, v)
    a = np.zeros(4757)
    a[v] = 1
    break

84556 90
[   0 3174 3173 ... 1587 4756   90]


In [54]:
emb = nn.Embedding(4757, 10)

In [60]:
emb(torch.LongTensor([90])), emb(torch.LongTensor([50]))

(tensor([[-0.7321,  0.4676,  0.0742, -1.5482,  1.1119, -0.9195, -1.2665, -0.5721,
           0.0527,  0.2775]], grad_fn=<EmbeddingBackward>),
 tensor([[ 0.9187,  0.3937, -0.8414,  0.1928, -0.3927,  0.9119, -0.7422,  0.6156,
           0.2376,  1.0105]], grad_fn=<EmbeddingBackward>))

### Driving licenses

In [107]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_driving_license) 

In [108]:
candidate_driving = get_from_redshift(sql_path, Config.redshift_creds)
candidate_driving.head()

Unnamed: 0,candidate_id,driving_licenses,date_driving_license
0,395381,A,2005-09-02 11:31:09
1,430828,A,2005-09-06 17:57:28
2,1074181,A,2013-08-07 10:06:11
3,1976775,A,2014-11-03 15:29:38
4,94916,A,2013-11-25 15:09:48


In [113]:
candidate_driving = candidate_driving[candidate_driving["candidate_id"].isin(set(candidate_work_experience["candidate_id"].unique()))]

In [114]:
licenses_one_hot = pd.crosstab(candidate_driving["candidate_id"],
                               candidate_driving["driving_licenses"])

In [115]:
licenses_one_hot.shape

(216926, 8)

In [116]:
licenses_one_hot.to_csv("../Data/licenses_one-hot.csv")

# Skill reindexing

In [6]:
skill_to_idx = {v : i for i, v in enumerate(sorted(candidate_skill["skill_id"].astype(float).unique()))}
idx_to_skill = {i : v for i, v in skill_to_idx.items()}
candidate_skill["skill_id"] = candidate_skill["skill_id"].progress_apply(lambda x: skill_to_idx[x])
skills_dict = candidate_skill.groupby("candidate_id")["skill_id"].apply(list).to_dict()

NameError: name 'candidate_skill' is not defined

In [9]:
# Convert lists of skills to dataframe of skills per candidate
skills_ct = pd.crosstab(candidate_skill["candidate_id"], 
                        candidate_skill["skill_id"])

# # skills = skills_ct.where(skills_ct != 1, skills_ct.columns.to_series(), axis=1)
skills_ct.columns = [f"skill_{i}" for i in skills_ct.columns]

In [10]:
skills_ct.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 465522 entries, 84267 to 7980693
Columns: 317 entries, skill_0 to skill_316
dtypes: int64(317)
memory usage: 1.1 GB


In [11]:
# skills_ct.to_csv("skills_one-hot.csv")

# Merging and formatting

In [9]:
def clean_df(df, table = "work_experience"):
    
    if table == "work_experience":
        start, end = "date_start_job", "date_end_job"
    elif table == "education":
        start, end = "date_start", "date_end"
    else:
        return NotImplemented
    
    # Drop nonsense data
    df = df[df[start] >= dt.datetime(1800, 1, 1)]
    df = df[df[end] <= dt.datetime(2100, 1, 1)]

    # Convert datetime to date
    df[start] = pd.to_datetime(df[start]).dt.date
    df[end] = pd.to_datetime(df[end]).dt.date
    
    return df
        
candidate_work_experience = clean_df(candidate_work_experience)
candidate_education = clean_df(candidate_education, table = "education")

In [10]:
candidate_work_experience["time_spent"] = (candidate_work_experience["date_end_job"] - 
                                           candidate_work_experience["date_start_job"]).dt.days.astype('int16')

In [11]:
def find_educations(df_work, df_education):

    # Merge career data with education levels
    career_education = pd.merge(df_work, 
                                df_education, 
                                on = "candidate_id", 
                                how = "left")[["candidate_id",
                                               "date_start_job",
                                               "date_end_job",
                                               "time_spent",
                                               "education_level",
                                               "date_start",
                                               "date_end",
                                               "passed"]]
    
    # Filter out education that were not passed (yet) at the time of starting a job
    passed = career_education[(career_education["date_start_job"] >= career_education["date_end"]) & 
                              (career_education["passed"] == 1)]
    
    # Only store the highest education level reached at the start of each job
    education_through_time = passed.groupby(["candidate_id", "date_start_job"])["education_level"].max()
    
    df_work.set_index(["candidate_id", "date_start_job"], inplace=True)

    # Store education data in candidate_work_experience
    df_work["education"] = education_through_time

    # np.nan education = no education
    df_work["education"].fillna(0, inplace=True)

    # Reset index for further data augmentation
    df_work.reset_index(inplace=True)
        
    return df_work

candidate_work_experience = find_educations(candidate_work_experience, candidate_education)

candidate_work_experience.head()

Unnamed: 0,candidate_id,date_start_job,unique_id,date_end_job,function_id,isco_functie_niveau,isco_code4,function_name_self,company_name,source,time_spent,education
0,89030,2005-04-04,187756,2005-04-13,1046.0,2.0,5246.0,,Compass Group NL/Prénatal,plaatsing,9,3.0
1,89030,2005-04-08,187818,2005-08-07,1046.0,2.0,5246.0,,Albron Cat. / 3381 ASM Europe BV,plaatsing,121,3.0
2,89030,2005-09-12,1290468,2005-11-10,1046.0,2.0,5246.0,,Albron cat.3695/ Schuitema,plaatsing,59,3.0
3,89030,2005-09-12,1285504,2005-11-10,1046.0,2.0,5246.0,,Albron cat.3695/ Schuitema,plaatsing,59,3.0
4,89030,2005-09-12,1506287,2005-11-10,1046.0,2.0,5246.0,,Paresto/tripvzoudtland/09008 (9.8),plaatsing,59,3.0


In [12]:
# Add skills
# candidate_work_experience = pd.merge(candidate_work_experience, 
#                                      skills_ct, 
#                                      left_on="candidate_id", 
#                                      right_index=True, 
#                                      how="left")

# Filtering and reindexing

In [13]:
more_than_5 = candidate_work_experience["isco_code4"].value_counts()
more_than_5 = set(more_than_5[more_than_5 > 5].index)
candidate_work_experience = candidate_work_experience[candidate_work_experience["isco_code4"].isin(more_than_5)]

In [14]:
id_to_idx = {v : i for i, v in enumerate(sorted(candidate_work_experience["function_id"].astype(float).unique()))}
idx_to_id = {i : v for i, v in id_to_idx.items()}

code_to_idx = {v : i for i, v in enumerate(sorted(candidate_work_experience["isco_code4"].astype(float).unique()))}
idx_to_code = {i : v for i, v in code_to_idx.items()}

In [15]:
candidate_work_experience["function_id"] = candidate_work_experience["function_id"].apply(lambda x: id_to_idx.get(x))
candidate_work_experience["isco_code4"] = candidate_work_experience["isco_code4"].apply(lambda x: code_to_idx[x])

In [16]:
candidate_work_experience.shape

(1666369, 12)

# Add CV embeddings

In [17]:
embedding_pd = pd.read_parquet("s3://s3-nl-prd-semrb-emr/embeddings/doc_embeddings/word2vec/word2vec_doc_embedding.parquet")
embedding_pd['cv_id'] = embedding_pd['cv_id'].astype('int')
embedding_pd.rename(columns={"doc_embedding": "tensor"}, inplace=True)

In [18]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_cvs) 

candidate_cvs = get_from_redshift(sql_path, Config.redshift_creds)

In [19]:
def find_embeddings(embedding_pd, candidate_cvs):
    embedding_pd = embedding_pd.set_index("cv_id")
    
    candidate_cvs = candidate_cvs[["cv_id", "candidate_id", "date_start"]]

    embedding_per_candidate = pd.merge(embedding_pd, 
                                       candidate_cvs, 
                                       left_index=True, 
                                       right_on="cv_id")[["candidate_id", "embedding", "date_start"]]
    
    matches = (set(embedding_per_candidate["candidate_id"]) & set(candidate_work_experience["candidate_id"]))
    
    matched_cvs = embedding_per_candidate[embedding_per_candidate["candidate_id"].isin(matches)]
    matched_cvs["date_start"] = matched_cvs["date_start"].dt.date
    
    last_cv_per_day = matched_cvs.groupby(["candidate_id", "date_start"])["embedding"].apply(lambda x: list(x)[-1])
    
    cv_embeddings = last_cv_per_day.reset_index()
    
    return cv_embeddings

cv_embeddings = find_embeddings(embedding_pd, candidate_cvs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
cv_embeddings.head()

Unnamed: 0,candidate_id,date_start,embedding
0,84556.0,2020-07-02,"[0.07265971601009369, -0.8582056760787964, -0...."
1,84612.0,2020-05-30,"[0.3474675714969635, -1.0429900884628296, -0.3..."
2,84612.0,2020-06-03,"[0.34068500995635986, -1.0179638862609863, -0...."
3,84731.0,2020-06-03,"[0.12455234676599503, -0.1725415289402008, -0...."
4,85437.0,2020-06-06,"[0.2048000991344452, -0.49068811535835266, -0...."


In [21]:
def add_embeddings(candidate_work_experience, cv_embeddings):
    
    # Add CVs to candidates (includes duplicates)
    full_merge = pd.merge(candidate_work_experience,
                          cv_embeddings,
                          left_on="candidate_id",
                          right_on="candidate_id",
                          how="left")
    
    # display(full_merge)
    
    # Find the maximum date of uploaded cvs per job (i.e. most recent CV during each job)
    most_recent_cvs = full_merge[(full_merge["date_start_job"]
                                  >= full_merge["date_start"])].groupby(
        "unique_id")["date_start"].idxmax()   
    
    # Some candidates only added CVs after their last job started, so account for that too
    late_cvs = full_merge[(full_merge["date_start_job"] <= 
                           full_merge["date_start"])].groupby("unique_id")["date_start"].idxmax()
    
    # Filter out everything we already found earlier
    late_cvs = late_cvs[set(late_cvs.index) - set(most_recent_cvs.index)]
    
    # Now we have all the CVs in one place
    cv_idxs = pd.concat([most_recent_cvs, late_cvs])
    
    # Combine the two frames to include candidates without any CVs
    combined = pd.concat([full_merge[full_merge["embedding"].isna()], 
                          full_merge.loc[cv_idxs.values]])
    
    return combined

candidate_work_experience = add_embeddings(candidate_work_experience, cv_embeddings)

In [22]:
# candidate_work_experience["embedding"] = candidate_work_experience["embedding"].apply(lambda x: x if x is not np.nan
#                                                                                                   else [0] * 300)

In [23]:
candidate_work_experience = candidate_work_experience.reset_index().drop("index", axis=1)

In [24]:
# embedding_lists = candidate_work_experience["embedding"].to_list()
# embeddings = pd.DataFrame(embedding_lists)

In [25]:
# embeddings.columns = [f"embedding_{i}" for i in range(len(embeddings.columns))]

In [26]:
# candidate_work_experience = pd.concat([candidate_work_experience, embeddings], axis=1)

# Ordering & Time

In [27]:
def add_order(df):
    
    # Count the number of jobs each candidate has ahd
    job_counts = df.groupby("candidate_id").size()  
    
    # Sort by candidate_id, date_start_job
    sorted_df = df.sort_values(by = ['candidate_id', "date_start_job"])
    
    # Reset index
    sorted_df.reset_index(inplace=True, drop=True)
    
    # Create a list of lists containing the order of each candidates jobs (which came first, second, third, etc.)
    order = [np.arange(count) for count in job_counts.values]
    
    # Flatten list
    order = [item for sublist in order for item in sublist]
    
    # Add order to df
    sorted_df["job_order"] = order
    
    # Set a candidate_id, job_order as the index
    return sorted_df.set_index(["candidate_id", 
                                "job_order"])

df = add_order(candidate_work_experience)

In [29]:
df = df[["date_start_job", "date_end_job", "time_spent", "isco_code4", "function_id",
         "isco_functie_niveau", "company_name", "source", "education", "embedding"]]

df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,date_start_job,date_end_job,time_spent,isco_code4,function_id,isco_functie_niveau,company_name,source,education,embedding
candidate_id,job_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
84556,0,2000-08-01,2001-01-04,156,208,936,2.0,(oud 3146Amsterdam RAI Catering Services,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,1,2000-09-07,2001-01-01,116,348,809,1.0,(oud 3146Amsterdam RAI Catering Services,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,2,2000-09-30,2001-07-02,275,208,936,2.0,AMSTERDAM RAI CATERING SERVICES B.V.,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,3,2000-10-23,2000-10-27,4,344,1519,1.0,Formido Bouwmarkt,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,4,2003-12-22,2004-01-02,11,344,1519,1.0,Yakult Nederland B.V.,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,5,2004-07-12,2004-07-13,1,344,1519,1.0,Yakult Nederland B.V.,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,6,2005-04-11,2005-07-01,81,344,1519,1.0,Intersnack Nederland B.V.,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,7,2005-05-25,2005-06-01,7,344,1519,1.0,TTG Garantiegevallen BU 1,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,8,2005-07-27,2005-08-26,30,344,1519,1.0,Coback B.V.,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
84556,9,2006-03-16,2006-05-15,60,345,893,1.0,Tempo-Team inzake Opleidingen AH,plaatsing,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."


In [30]:
def change_steps(df):
    """Groups users' careers by same job types. I.e., if someone didn't
       change the type of job they had, but just the company, it will be
       grouped into one time step."""
    
    new_values = defaultdict(list)
    
    total_time_spent = 0
    education = 0
    date_end_job = dt.date(1970, 1, 1)
    
    for i in tqdm(range(len(df))):
        if i < len(df) - 1:
            current = df.iloc[i]
            next_ = df.iloc[i + 1]
            # Group all consecutive time steps with the same job 
            if (current.name[0] != next_.name[0]) or (current["isco_code4"] != next_["isco_code4"]):
                # Add values of last correct row
                total_time_spent += current["time_spent"]
                if current["education"] > education:
                    education = current["education"]

                if current["date_end_job"] > date_end_job:
                    date_end_job = current["date_end_job"]

                # Store results
                new_values["candidate_id"].append(current.name[0])
                new_values["date_start_job"].append(current["date_start_job"])
                new_values["date_end_job"].append(date_end_job)
                new_values["time_spent"].append(total_time_spent)
                new_values["isco_code4"].append(current["isco_code4"])
                new_values["function_id"].append(current["function_id"])
                new_values["isco_functie_niveau"].append(current["isco_functie_niveau"])
                new_values["education"].append(education)
                new_values["embedding"].append(current["embedding"])

                # Reset values
                total_time_spent = 0
                education = 0
                date_end_job = dt.date(1970, 1, 1)           
            else:
                total_time_spent += current["time_spent"]
                if current["education"] > education:
                    education = current["education"]

                if current["date_end_job"] > date_end_job:
                    date_end_job = current["date_end_job"]

    return pd.DataFrame(new_values)

df = change_steps(df)

100%|██████████| 1665709/1665709 [22:29<00:00, 1234.18it/s]


In [34]:
df.head()

Unnamed: 0,candidate_id,date_start_job,date_end_job,time_spent,isco_code4,function_id,isco_functie_niveau,education,embedding
0,84556,2000-08-01,2001-01-04,156,208,936,2.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
1,84556,2000-09-07,2001-01-01,116,348,809,1.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
2,84556,2000-09-30,2001-07-02,275,208,936,2.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
3,84556,2005-07-27,2005-08-26,134,344,1519,1.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."
4,84556,2006-05-17,2006-07-10,174,345,893,1.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0...."


In [35]:
# Calculate the time it took to go from one job to another (in order)
df["time_between"] = df.groupby(
    "candidate_id")["date_start_job"].progress_apply(lambda x: x - x.shift(1))

100%|██████████| 469073/469073 [05:05<00:00, 1537.53it/s]


In [36]:
df["time_between"] = df["time_between"].shift(-1).fillna(pd.Timedelta(seconds=0)).dt.days.astype('int16')

In [37]:
df.head()

Unnamed: 0,candidate_id,date_start_job,date_end_job,time_spent,isco_code4,function_id,isco_functie_niveau,education,embedding,time_between
0,84556,2000-08-01,2001-01-04,156,208,936,2.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0....",37
1,84556,2000-09-07,2001-01-01,116,348,809,1.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0....",23
2,84556,2000-09-30,2001-07-02,275,208,936,2.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0....",1761
3,84556,2005-07-27,2005-08-26,134,344,1519,1.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0....",294
4,84556,2006-05-17,2006-07-10,174,345,893,1.0,0.0,"[0.07265971601009369, -0.8582056760787964, -0....",252


In [38]:
num_classes = len(df["isco_code4"].unique())

In [49]:
# df_pred = df[["isco_functie_niveau", "education", "function_id"]].fillna(0)
pred_cols = ["candidate_id","time_between", "time_spent", "isco_functie_niveau", 
             "education", "function_id", "isco_code4"] # + [col for col in df.columns if "skill_" in col]
num_features = len(pred_cols)

df_pred = df[pred_cols].fillna(0)

In [50]:
df_pred.head()

Unnamed: 0,candidate_id,time_between,time_spent,isco_functie_niveau,education,function_id,isco_code4
0,84556,37,156,2.0,0.0,936,208
1,84556,23,116,1.0,0.0,809,348
2,84556,1761,275,2.0,0.0,936,208
3,84556,294,134,1.0,0.0,1519,344
4,84556,252,174,1.0,0.0,893,345


In [41]:
df_pred[["function_id", "isco_code4"]].reset_index().to_csv("../Data/DTW_df_pred2.csv")

### Create separate embedding dict

Done for optimization purposes

In [51]:
embedding_order = df.reset_index()[["candidate_id", "embedding"]]

In [52]:
grouped_embeddings = embedding_order.groupby("candidate_id")

In [53]:
embedding_a = defaultdict(lambda: defaultdict(list))

# Find each time step at which the candidate got a new CV
for candidate, values in tqdm(grouped_embeddings):
    embeddings = values["embedding"].values
  
    if (type(embeddings[0]) == type(np.array([]))) and np.nan not in embeddings[0]:
        embedding_a[candidate][0] = embeddings[0]
        
        for i, embedding in enumerate(embeddings):
            if i > 0:
                truths = embeddings[i - 1] != embedding

                if (type(truths) == type(np.array([]))) and (truths).all():
                    embedding_a[candidate][i] = embedding

100%|██████████| 469073/469073 [01:21<00:00, 5736.13it/s]


In [54]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [55]:
with open('../Data/embeddings_grouped.json', 'w') as f:
    json.dump(embedding_a, f, cls=NumpyEncoder)

In [56]:
df_pred.reset_index().to_csv("../Data/df_pred_grouped.csv")

In [None]:
pd.Series(skills_dict).to_csv("../Data/skills.csv")

### Candidate Certificates

In [None]:
sql_path = os.path.join("/home/ec2-user/SageMaker/data-science-development/talent_recommender/daily_snapshots",
                        Config.query_candidate_certificates) 

candidate_certificates = get_from_redshift(sql_path, Config.redshift_creds)
candidate_certificates.head()

In [None]:
candidate_certificates = pd.crosstab(candidate_certificates["candidate_id"], 
                                     candidate_certificates["candidate_certificate_id"])
candidate_certificates.head()

In [None]:
candidate_certificates.to_csv("../Data/candidate_certificates_one-hot.csv")

### CV Embeddings

In [None]:
df_pred = pd.read_csv("../Data/df_pred.csv")
skills = pd.read_csv("../Data/skills.csv")

In [None]:
# s3 = s3fs.S3FileSystem()

# embedding_pd = pq.ParquetDataset('s3a://s3-nl-prd-semrb-emr/embeddings/doc_embeddings/xlm-roberta-base-smartmatch', filesystem=s3).read_pandas().to_pandas()

# embedding_pd['cv_id'] = embedding_pd['cv_id'].astype('int')
# embedding_pd.rename(columns={"doc_embedding": "tensor"}, inplace=True)
# embedding_pd.head() 

In [None]:
# cvPath = 's3a://s3-nl-prd-datahub-projects/smartmatch_cv/parsed_cv'

# s3 = s3fs.S3FileSystem()
# parsedCv_pd = pq.ParquetDataset(cvPath, filesystem=s3).read_pandas().to_pandas() 

In [None]:
# parsedCv_pd

In [None]:
len(set(cv_embeddings["candidate_id"]) & set(df_pred["candidate_id"]))

In [None]:
cv_embeddings.to_csv("../Data/cv_w2v_embeddings.csv")