In [11]:
!pip install dtw-python

Collecting dtw-python
  Downloading dtw_python-1.1.12-cp36-cp36m-manylinux2010_x86_64.whl (581 kB)
     |████████████████████████████████| 581 kB 24.6 MB/s            
Installing collected packages: dtw-python
Successfully installed dtw-python-1.1.12


In [51]:
from sys import path
path.append("/home/ec2-user/SageMaker/data-science-development/utils")
path.append("/home/ec2-user/SageMaker/data-science-development/config")

import pandas as pd
import numpy as np

from collections import defaultdict
from tqdm import tqdm 
from heapq import nlargest
import dtw

tqdm.pandas()

In [52]:
df_pred = pd.read_csv("../Data/df_pred_ext.csv").drop("Unnamed: 0", axis=1)

In [53]:
df_pred.shape

(1666675, 10)

In [54]:
career_paths = df_pred.groupby("candidate_id")["isco_code4"]

In [56]:
# Convert to 2d-arrays, getting rid of candidate_ids as values
career_paths = career_paths.progress_apply(lambda x: x.values)

100%|██████████| 469568/469568 [00:11<00:00, 41353.90it/s]


In [57]:
career_lens = career_paths.apply(len)
career_paths = career_paths.loc[(career_lens > 1)]

In [58]:
career_paths = career_paths.loc[career_paths.apply(lambda x: x[-1] != x[-2])]
len(career_paths)

113724

In [59]:
career_paths_train = career_paths.iloc[:int(0.8 * len(career_paths))]
career_paths_valid = career_paths.iloc[int(0.8 * len(career_paths)):int(0.9 * len(career_paths))]
career_paths_test = career_paths.iloc[int(0.9 * len(career_paths)):]

In [60]:
s = career_paths_train.apply(lambda x : x[-2])
second_to_last_jobs = s.to_frame().groupby("isco_code4").apply(lambda x: set(x.index))

In [62]:
results = defaultdict(lambda: defaultdict(int))

for c, jobs in tqdm(career_paths_valid.iteritems(), total=len(career_paths_test)):
    # We can only find similar individuals if others have had the same second-to-last job
    if jobs[-2] in second_to_last_jobs:
        similar = second_to_last_jobs[jobs[-2]]
        similar = career_paths_train.loc[similar]

        jobs = jobs[:-1].astype("double")

        for j in similar.iteritems():
            c2, jobs2 = j

            if c != c2:
                results[c][c2] = dtw.dtw(c, c2, distance_only=True).normalizedDistance

100%|█████████▉| 11372/11373 [1:15:03<00:00,  2.53it/s]


In [63]:
best_matches = {}

for i in tqdm(results):
    if len(results[i]):
        best_matches[i] = sorted(results[i], key=lambda x: results[i][x])

100%|██████████| 11371/11371 [00:21<00:00, 524.49it/s]


In [64]:
def knn_classifier(best_matches, career_paths, k=1, n=[]):
    
    predictions = defaultdict(list)
    
    for candidate in best_matches.keys():
        correct = career_paths[candidate][-1]
                
        neighbors = [(career_paths[neighbor][-1], i + 1) for i, neighbor in enumerate(best_matches[candidate][:k])]
        
        weights = defaultdict(int)
        
        for job, weight in neighbors:
            weights[job] += 1/np.log(1 + weight)
            
        for v in n :
            predictions[v].append(correct in nlargest(v, weights, key=weights.get))

    return {k: np.mean(v) for k, v in predictions.items()}

In [None]:
results = defaultdict(list)

for k in tqdm(range(1, 101)):
    results["k"].append(k)
    r = knn_classifier(best_matches, career_paths, k=k, n=[1, 5, 10])
    results["acc@1"].append(r[1])
    results["acc@5"].append(r[5])
    results["acc@10"].append(r[10])
    
pd.DataFrame(results).set_index("k")

100%|██████████| 100/100 [07:28<00:00,  4.49s/it]


Unnamed: 0_level_0,acc@1,acc@5,acc@10
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.068596,0.068596,0.068596
2,0.068596,0.147480,0.147480
3,0.073608,0.234368,0.234368
4,0.075279,0.273942,0.273942
5,0.076686,0.299974,0.299974
...,...,...,...
96,0.179228,0.464515,0.580600
97,0.179228,0.464515,0.581919
98,0.180283,0.464955,0.582007
99,0.180283,0.464515,0.582095


In [69]:
pd.DataFrame(results).set_index("k").sort_values(by="acc@1").head(10)

Unnamed: 0_level_0,acc@1,acc@5,acc@10
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.068596,0.068596,0.068596
2,0.068596,0.14748,0.14748
3,0.073608,0.234368,0.234368
4,0.075279,0.273942,0.273942
5,0.076686,0.299974,0.299974
6,0.085217,0.311054,0.328027
7,0.114062,0.317035,0.355554
8,0.114238,0.326181,0.375253
9,0.120922,0.330226,0.408935
13,0.121977,0.371735,0.4566


In [12]:
most_common_switch, 1.96 * np.sqrt(((1 - most_common_switch)* (most_common_switch)) / (113724*0.2))

(0.6130138491976258, 0.006329914804694123)

In [13]:
print(f"Most common switch accuracy: {most_common_switch * 100:>.2f}%")

Most common switch accuracy: 61.30%
