# Exploration of the first batch of validation results

# 0. Import dependencies and inputs

In [2]:
%run ../notebook_preamble_Transitions.ipy
from scipy.stats import pearsonr, spearmanr

# Transitions that were sent out for validation (batches 1 and 2, stacked row-wise)
batch_1 = pd.read_csv(
    data_folder + 'processed/validation/Transitions_to_validate_BATCH_1.csv'
)
batch_2 = pd.read_csv(
    data_folder + 'processed/validation/Transitions_to_validate_BATCH_2.csv'
)
batch = pd.concat([batch_1, batch_2], axis=0)

# Raw validation outputs (the feasibility ratings), one file per delivery,
# also stacked row-wise into a single table
path_to_val_data_1 = data_folder + 'raw/validation/nesta_output_16Dec.csv'
path_to_val_data_2 = data_folder + 'raw/validation/nesta_output_4_Jan_2021.csv'
batch_results = pd.concat(
    [pd.read_csv(path) for path in (path_to_val_data_1, path_to_val_data_2)],
    axis=0,
)

# Occupation similarity matrices
sims = load_data.Similarities()

In [3]:
# Mean of the 1-5 feasibility ratings over all responses for each transition
# (one transition == one value of `subject_ids`)
batch_results_agg = (
    batch_results
    .groupby('subject_ids')['feasibility_1-5']
    .mean()
    .reset_index()
)

# Attach the occupation IDs and labels; they are taken from the first
# response row found for each transition
occupation_cols = [
    'origin_id', 'destination_id',
    'origin_label', 'destination_label',
    'subject_ids',
]
occupation_info = batch_results.drop_duplicates('subject_ids')[occupation_cols]
batch_results_agg = batch_results_agg.merge(
    occupation_info, on=['subject_ids'], how='left'
)

# Random preview of five aggregated transitions
batch_results_agg.sample(5)

Unnamed: 0,subject_ids,feasibility_1-5,origin_id,destination_id,origin_label,destination_label
3894,53136856,2.333333,1401,66,bank manager,auction house manager
9072,53142662,3.833333,2142,2869,membranophone musical instruments maker,wind musical instrument maker
7584,53141091,3.25,951,1607,filing machine operator,sawmill operator
4224,53137186,2.625,1881,825,ICT system administrator,ICT application configurator
3052,53136014,3.0,160,1419,ICT security consultant,ICT auditor manager


In [4]:
# The aggregated column now holds a mean, so give it a name that says so
batch_results_agg = batch_results_agg.rename(
    {'feasibility_1-5': 'feasibility_mean'}, axis='columns'
)

# 1. Create features

### 1.1 Transitions

Here we obtain the work similarity and skill similarity scores.

In [5]:
# (origin, destination) occupation ID pairs of the validated transitions.
# NOTE: `zip` yields a one-shot iterator, so `transition_pairs` is exhausted
# once `get_transition_data` has consumed it.
origin_ids = batch_results_agg['origin_id']
destination_ids = batch_results_agg['destination_id']
transition_pairs = zip(origin_ids, destination_ids)

transitions_df = trans_utils.get_transition_data(transition_pairs)

In [6]:
# Keep only the occupation IDs and the two similarity scores
transition_feature_cols = [
    'origin_id',
    'destination_id',
    'W_work',
    'W_skills',
]
transitions_df = transitions_df.loc[:, transition_feature_cols]

In [7]:
# Attach the transition ID and the mean feasibility rating to each
# (origin, destination) pair; default inner join on the two occupation IDs
ratings = batch_results_agg[
    ['subject_ids', 'feasibility_mean', 'origin_id', 'destination_id']
]
transitions_df = transitions_df.merge(
    ratings, on=['origin_id', 'destination_id']
)

In [8]:
# Index the feature table by the transition ID (`subject_ids`)
transitions_df = transitions_df.set_index('subject_ids')

In [9]:
# Preview the first rows of the feature table
transitions_df.head()

Unnamed: 0_level_0,origin_id,destination_id,W_work,W_skills,feasibility_mean
subject_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
52451278,29,2654,0.549595,0.118355,2.0
52451279,29,1803,0.506951,0.132669,2.888889
52451280,29,1036,0.654659,0.073554,2.875
52451281,29,2877,0.579094,0.125653,3.857143
52451282,29,2292,0.564219,0.069026,2.222222


### 1.2 Skills match distributions

Here we calculate summary statistics (10th percentile, mean and 90th percentile) of the best-matched skill similarities for each transition.

In [None]:
# Build one table of skill matches per transition and stack them, tagging
# every row with the ID of the transition it belongs to.
transition_pairs = zip(batch_results_agg['subject_ids'],
                       batch_results_agg['origin_id'],
                       batch_results_agg['destination_id'])

skills_matches = []
# Transitions whose skills overlap could not be computed, kept for inspection
# as (transition_id, origin_id, destination_id, error) tuples
skipped_transitions = []

for transition_id, origin_id, destination_id in transition_pairs:
    try:
        skills_matching = trans_utils.show_skills_overlap(
            origin_id, destination_id,
            skills_match='optional', verbose=False)
        skills_matching['transition_id'] = transition_id
    except Exception as error:
        # Best effort: a transition that fails is skipped, but it is recorded
        # rather than silently dropped. `Exception` (instead of a bare
        # `except`) lets KeyboardInterrupt / SystemExit stop the loop.
        skipped_transitions.append(
            (transition_id, origin_id, destination_id, repr(error)))
    else:
        skills_matches.append(skills_matching)

if skipped_transitions:
    print('Skills overlap failed for %d transition(s); see `skipped_transitions`'
          % len(skipped_transitions))

skills_matches = pd.concat(skills_matches, axis=0)

In [None]:
# Per-transition distribution of the matched skill similarities. The resulting
# Series are indexed by `transition_id` and are aligned on the index of
# `transitions_df` when assigned; transitions without matches become NaN.
similarity_by_transition = skills_matches.groupby('transition_id')['similarity']

transitions_df['skill_similarity_10pc'] = similarity_by_transition.apply(
    lambda values: np.percentile(values, 10)
)
transitions_df['skill_similarity_mean'] = similarity_by_transition.mean()
transitions_df['skill_similarity_90pc'] = similarity_by_transition.apply(
    lambda values: np.percentile(values, 90)
)

In [None]:
# Replace missing percentile values with the mean of the respective column
for percentile_col in ('skill_similarity_90pc', 'skill_similarity_10pc'):
    column_mean = transitions_df[percentile_col].mean()
    transitions_df[percentile_col] = transitions_df[percentile_col].fillna(column_mean)

### 1.3 Job Similarities

Here we calculate the cosine similarity between the embeddings of the two job descriptions for each transition.

In [None]:
from scipy.spatial.distance import cosine

In [None]:
# Embeddings of the occupation descriptions, loaded as a 2-D array
s = np.load(data_folder + 'interim/embeddings/embeddings_occupation_description_SBERT_bert-base-nli-mean-tokens.npy')

# All occupation IDs that appear in the validated transitions
ids = set(transitions_df['origin_id']).union(set(transitions_df['destination_id']))

# A set has no guaranteed iteration order, so using it directly as the index
# would make the row -> occupation ID mapping arbitrary. Sort the IDs to get a
# deterministic mapping.
# NOTE(review): this assumes the rows of the embeddings file correspond to
# exactly these IDs in ascending order -- TODO confirm against the script that
# produced the .npy file.
if len(ids) != s.shape[0]:
    raise ValueError(
        'Embedding matrix has %d rows but the transitions reference %d '
        'occupation IDs; rows cannot be matched to IDs' % (s.shape[0], len(ids)))
occ_embeddings = pd.DataFrame(s, index=sorted(ids))

In [None]:
# Cosine *distance* between the origin and destination description embeddings,
# in the row order of `transitions_df`
description_sims = [
    cosine(occ_embeddings.loc[origin].values, occ_embeddings.loc[destination].values)
    for origin, destination in zip(transitions_df['origin_id'],
                                   transitions_df['destination_id'])
]

In [None]:
# scipy's `cosine` returns a distance; convert it to a similarity (1 - distance)
transitions_df['description_similarity'] = 1 - np.asarray(description_sims)

## 2. Analysis

In [None]:
# Column holding the value to be explained / predicted
target_col = 'feasibility_mean'

# Similarity measures used as explanatory features (order is kept for plotting)
feature_cols = [
    'W_work', 'description_similarity', 'W_skills',
    'skill_similarity_mean', 'skill_similarity_10pc', 'skill_similarity_90pc',
]

In [None]:
# One scatter panel per feature: feature value (x) against the target (y)
n_panels = len(feature_cols)
fig, axs = plt.subplots(ncols=n_panels, figsize=(2.5*n_panels, 3))

for panel_idx, col in enumerate(feature_cols):
    ax = axs[panel_idx]
    ax.scatter(transitions_df[col], transitions_df[target_col], alpha=0.05)
    ax.set_title(col)
    ax.set_ylabel('mean_feasibility')

plt.tight_layout();

In [None]:
# Heatmap of the absolute pairwise correlations between the features
abs_feature_corr = transitions_df[feature_cols].corr().abs()

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(abs_feature_corr, annot=True, ax=ax, square=True);

In [None]:
# Absolute correlation of every feature with the target, as horizontal bars
corr_with_target = (
    transitions_df[feature_cols + [target_col]]
    .corr()[target_col]
    .drop(target_col)
    .abs()
)
corr_with_target.plot.barh();

In [None]:
# Pairwise scatter plots of all features and the target
pd.plotting.scatter_matrix(
    transitions_df[feature_cols + [target_col]],
    alpha=0.15,
    figsize=(15, 15),
);

## 3. Model

Model to predict transition feasibility:

- Create a binary vector of skill pairs to represent each job and train a regression model to predict feasibility
- Create a model of averaged origin and destination skill embeddings and train a regression model to predict feasibility
- Combination of similarity measures that we already have

New features

- Specialisation of skills (overall and adjusted overlap)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from yellowbrick.regressor import ResidualsPlot

In [None]:
# Mean-impute every remaining gap in the feature columns.
# NOTE(review): the column means are computed on the full table, i.e. before
# the train/test split below, so test rows contribute to the imputed values.
feature_means = transitions_df[feature_cols].mean()
transitions_df[feature_cols] = transitions_df[feature_cols].fillna(feature_means)

In [None]:
# Reproducible 80/20 split of the feature matrix and the target
X_train, X_test, y_train, y_test = train_test_split(
    transitions_df[feature_cols],
    transitions_df[target_col],
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

### 3.1 Support Vector Regression

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, loguniform, uniform
from sklearn.metrics import mean_squared_error, SCORERS

In [None]:
# Distributions the randomised search samples the SVR hyperparameters from;
# only the RBF kernel is considered
params = {
    'C': loguniform(1e-1, 1e3),
    'gamma': loguniform(1e-2, 1),
    'epsilon': uniform(loc=0, scale=1),
    'kernel': ['rbf'],
}

# 100 sampled candidates scored by negative mean squared error;
# `n_jobs=-2` uses all CPUs but one
svr = SVR()
regressor = RandomizedSearchCV(
    estimator=svr,
    param_distributions=params,
    n_iter=100,
    scoring='neg_mean_squared_error',
    n_jobs=-2,
    random_state=0,
    verbose=3,
)

In [None]:
# Run the hyperparameter search on the training split only
regressor.fit(X=X_train, y=y_train)

In [None]:
# Estimator with the best cross-validated score found by the search
best = regressor.best_estimator_

### 3.2 Assess Model

In [None]:
# Residuals plot (yellowbrick) for the selected estimator
visualizer = ResidualsPlot(best)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()  # Render the finished figure

In [None]:
# Held-out performance of the selected estimator
y_pred = best.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
# The coefficient of determination: 1 is perfect prediction
test_r2 = r2_score(y_test, y_pred)

print('Mean squared error: %.2f' % test_mse)
print('Coefficient of determination: %.2f' % test_r2)

In [None]:
# Show the hyperparameters of the selected estimator
best.get_params()

In [None]:
# `classification_report` is not imported by any other cell shown in this
# notebook, so import it here to avoid a NameError.
from sklearn.metrics import classification_report

# Binarise observed and predicted feasibility at 2.5 (>= 2.5 is the positive
# class) and report precision / recall / F1 on the test split
print(classification_report(y_test >= 2.5, y_pred >= 2.5))

### Final Fit

In [None]:
# Refit the selected estimator on every row of the feature table
# (training and test rows together)
best.fit(transitions_df[feature_cols], transitions_df[target_col])

## 4. Feasibility Threshold

In [None]:
# Load the checked test transitions (presumably reviewed by hand; the
# 'feasible? y/n' column used below holds the verdict -- TODO confirm)
checked_df = pd.read_csv(data_folder + 'restricted/validation/test_transitions_checked.csv')

In [None]:
# Keep only the transitions that received a y/n verdict, then attach the
# columns of `test_df` for the same (origin, destination) pair.
# NOTE(review): `test_df` (and the `feasibility_pred` column used afterwards)
# is not created in any cell shown in this notebook -- confirm where it comes
# from before running this cell.
checked_df = (
    checked_df
    .dropna(subset=['feasible? y/n'])
    .merge(test_df, on=['origin_id', 'destination_id'], how='left')
)

In [None]:
# Distribution of the predicted feasibility, split by the y/n verdict
fig, ax = plt.subplots()
sns.histplot(data=checked_df, x='feasibility_pred', hue='feasible? y/n', ax=ax)

ax.set_xlabel('Predicted Feasibility Score (model)');

In [None]:
# Bin the predicted score (width 0.125, from 1.25 up to but excluding 3.25)
# and encode the verdict as 1 (y) / 0 (n)
pred_bin_edges = np.arange(1.25, 3.25, 0.125)
checked_df['bin'] = pd.cut(checked_df['feasibility_pred'], bins=pred_bin_edges)
checked_df['is_feasible'] = checked_df['feasible? y/n'].map({'y':1, 'n': 0})

# Share of transitions judged feasible within each score bin
feasible_share = checked_df.groupby('bin')['is_feasible'].mean()
feasible_share.plot.bar();

In [None]:
# Average mean rating of the transitions whose `feasibility_pc_gt_2` is exactly 0.5.
# NOTE(review): `feasibility_pc_gt_2` is not created in any cell shown in this
# notebook -- confirm it exists in `transitions_df` before running.
is_half = transitions_df['feasibility_pc_gt_2'] == 0.5
transitions_df.loc[is_half, 'feasibility_mean'].mean()

In [None]:
# Bin the mean rating (width 0.125, from 1.25 up to but excluding 3.25) and
# plot the average `feasibility_pc_gt_2` within each bin.
# NOTE(review): `feasibility_pc_gt_2` is not created in any cell shown in this
# notebook -- confirm it exists in `transitions_df` before running.
mean_bin_edges = np.arange(1.25, 3.25, 0.125)
bins = pd.cut(transitions_df['feasibility_mean'], bins=mean_bin_edges)

pc_gt_2_by_bin = transitions_df.groupby(bins)['feasibility_pc_gt_2'].mean()
pc_gt_2_by_bin.plot.bar();