# Predicting likelihood of job automation

Mike Griffin

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import (RBF,RationalQuadratic)
import matplotlib.pyplot as plt
%matplotlib inline

## Calibrating against 2010 analysis

In [None]:
# Load the 2010 dataset (ML_data_2010.csv, resolved relative to the current
# working directory) into a DataFrame.
ML_data = pd.read_csv("ML_data_2010.csv") 

In [None]:
# Display the (rows, columns) shape of the loaded table.
ML_data.shape

## Train model on labelled data

In [None]:
ML_train = ML_data[ML_data['label_ML']>=0]

In [None]:
features = ['fine_arts','finger_dexterity','manual_dexterity','social_perceptiveness','Negotiation','Originality','Persuasion','assisting_and_caring','cramped_work_space']

X_train = ML_train.filter(features)
y_train = ML_train.filter(['label_ML'])

In [None]:
X_all = ML_data.filter(features)

In [None]:
X_train = X_train
print(X_train.shape)
print(ML_data.shape)

In [None]:
# Constant-scaled rational-quadratic kernel.
# Alternative kernel kept for reference: 1.0 * RBF(1.0)
kernel = 1.0 * RationalQuadratic(alpha=0.1, length_scale=1.0)

# Build the classifier first, then fit it; fit() returns the estimator itself,
# so `gpc` ends up bound to the fitted model either way.
gpc = GaussianProcessClassifier(kernel=kernel, random_state=0)
gpc.fit(X_train, y_train)

# Score on the same data the model was fitted on.
gpc.score(X_train, y_train)

## Run model on all occupations

In [None]:
GP_est = pd.DataFrame(gpc.predict_proba(X_all))

In [None]:
ML_output = pd.concat([ML_data,GP_est],axis=1,ignore_index= True)

In [None]:
# Assign column names purely by position: 13 names for the columns that came
# from ML_data followed by the two predicted-probability columns.
# NOTE(review): 'Persuasion' precedes 'Originality' here, whereas the
# `features` list has 'Originality' first. Because the names are positional,
# confirm this order matches the CSV's actual column order; otherwise those
# two columns end up with swapped labels.
# NOTE(review): GP_pred_0 / GP_pred_1 presumably correspond to classes 0 and 1
# in that order — verify against gpc.classes_.
ML_output.columns = ['OCC_CODE','OCC_TITLE','fine_arts','finger_dexterity',
                     'manual_dexterity','social_perceptiveness','Negotiation',
                     'Persuasion','Originality','assisting_and_caring','cramped_work_space',
                     'label_ML','probability_ML','GP_pred_0','GP_pred_1']

## Analyse results against original study 

In [None]:
# probability_ML on the horizontal axis against GP_pred_1 on the vertical axis.
plt.scatter(
    x=ML_output['probability_ML'],
    y=ML_output['GP_pred_1'],
)

In [None]:
# Assign classifications

def prob_class_old(row):
    """Bucket a row's "probability_ML" value into a Low/Medium/High label.

    Args:
        row: Mapping-like row (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a "probability_ML" entry.

    Returns:
        "3) Low" for values below 0.3, "2) Medium" for values from 0.3 up to
        but not including 0.6, and "1) High" for anything else. Returns None
        when the value is missing (None or NaN).
    """
    probability = row["probability_ML"]
    # NaN fails every "<" comparison, so without this guard a missing
    # probability would fall through to "1) High"; None would raise TypeError.
    if pd.isna(probability):
        return None
    if probability < 0.3:
        return "3) Low"
    if probability < 0.6:
        return "2) Medium"
    return "1) High"

def prob_class_new(row):
    """Bucket a row's "GP_pred_1" value into a Low/Medium/High label.

    Args:
        row: Mapping-like row (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a "GP_pred_1" entry.

    Returns:
        "3) Low" for values below 0.3, "2) Medium" for values from 0.3 up to
        but not including 0.6, and "1) High" for anything else. Returns None
        when the value is missing (None or NaN).
    """
    probability = row["GP_pred_1"]
    # NaN fails every "<" comparison, so without this guard a missing
    # probability would fall through to "1) High"; None would raise TypeError.
    if pd.isna(probability):
        return None
    if probability < 0.3:
        return "3) Low"
    if probability < 0.6:
        return "2) Medium"
    return "1) High"


In [None]:
ML_output_mod = ML_output.assign(prob_class_old=ML_output.apply(prob_class_old, axis=1))
ML_output_mod = ML_output_mod.assign(prob_class_new=ML_output_mod.apply(prob_class_new, axis=1))

In [None]:
ML_output_mod.groupby(['prob_class_old','prob_class_new']).size()

## Running the same model on the 2018 dataset

In [None]:
ML_data_2018 = pd.read_csv("ML_data_2018.csv")
column_names_2018 = ML_data_2018.columns.values.tolist()

In [None]:
X_all_2018 = ML_data_2018.filter(features)
GP_est_2018 = pd.DataFrame(gpc.predict_proba(X_all_2018))
ML_output_2018 = pd.concat([ML_data_2018,GP_est_2018],axis=1,ignore_index= True)
ML_output_2018.columns = column_names_2018+ ['GP_pred_0','GP_pred_1']

In [None]:
ML_output_2018_mod = ML_output_2018.assign(prob_class_new=ML_output_2018.apply(prob_class_new, axis=1))

In [None]:
ML_output_2018_mod.to_csv('ML_output_2018.csv')