In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from validation.dot_data import get_dictionary
from toolz import curry
from statsmodels.iolib.tableformatting import fmt_latex, fmt_txt

In [2]:
from validation.scoring import *

In [62]:
from validation.data import indeed_test_data, dot_train_data, get_soc_n

# SOC aggregation level handed to the data and lookup helpers below
# (presumably the number of leading SOC digits — TODO confirm).
SOC_LEVEL=3

# Training set; only the labels are used in this cell, to count training examples per class.
X_train, y_train = dot_train_data(SOC_LEVEL, include_tasks=False)
count_lookup = y_train.value_counts().to_dict()
code_lookup = make_code_lookup(SOC_LEVEL)

# Confusion matrix for one model run. NOTE(review): the path hard-codes "soc-3"
# and does not follow SOC_LEVEL — keep the two in sync by hand.
df = pd.read_csv('confusion-matrices/soc-3/withprod/sentencespace_100_indeed-lr-large.csv')
# Boolean mask of rows whose total count exceeds 5. It is built BEFORE the index is
# relabelled below, so it keeps the index produced by read_csv rather than the column labels.
# NOTE(review): confirm that format_dfs applies this mask against sdf / positionally, not against df's labels.
idx = df.sum(1) > 5

# Label the rows with the column names so both axes share the same labels.
df.index = df.columns
# Row totals of the confusion matrix, keyed by the new row labels.
test_count = df.sum(1)

# One entry per matrix row; each entry unpacks as (tp, fp, fn) in the comprehensions below.
scores = [counts(df.values, i) for i,_ in enumerate(df.values)]
# Per-row precision and recall, indexed 0..n-1 in matrix row order.
sdf = pd.DataFrame({'precision': [prec(tp,fp) for tp,fp,fn in scores], 'recall': [recall(tp,fn) for tp,fp,fn in scores]})


In [58]:
def flip_df_to_socs(df, soc_level):
    """Return a copy of ``df`` with both axes relabelled through ``make_code_lookup``.

    Each column label is replaced by ``make_code_lookup(soc_level)[label]`` and the
    row index is then set to the new column labels. The input frame is not modified.

    Raises whatever the lookup raises for a column label it does not contain.
    """
    relabelled = df.copy()
    lookup = make_code_lookup(soc_level)
    new_labels = [lookup[label] for label in relabelled.columns]
    relabelled.columns = new_labels
    # Rows mirror the columns, as in a square confusion matrix.
    relabelled.index = relabelled.columns
    return relabelled

def make_title_lookup(soc_level):
    """Return the inverse of ``make_code_lookup(soc_level)``: its values mapped back to its keys.

    If several keys share a value, the last one encountered wins.
    """
    inverted = {}
    for original_key, original_value in make_code_lookup(soc_level).items():
        inverted[original_value] = original_key
    return inverted

def collect_socs(vals, uplevel):
    """Group values by the integer formed from their first ``uplevel`` characters.

    Args:
        vals: iterable of values; each is converted with ``str`` to take its prefix.
        uplevel: number of leading characters that make up the grouping key.

    Returns:
        dict mapping each integer prefix to the list of original values that share
        it, in first-seen order for both keys and list members.
    """
    prefixed = [(int(str(code)[:uplevel]), code) for code in vals]
    groups = {}
    for prefix, code in prefixed:
        groups.setdefault(prefix, []).append(code)
    return groups

In [63]:
def print_confusion_matrix_by_uplevel(basename, df, soc_level, uplevel, prod):
    """Write one confusion sub-matrix CSV per higher-level SOC group.

    The matrix is relabelled via ``flip_df_to_socs``, its labels are grouped by
    their first ``uplevel`` characters via ``collect_socs``, and each group's square
    sub-matrix is written (columns relabelled through ``make_title_lookup``, no
    index column) to
    ``confusion-matrices/soc-{soc_level}/{prod}/aggs/{basename}-agg-level-{uplevel}-{key}.csv``.

    Args:
        basename: file-name stem for the output CSVs.
        df: square confusion matrix whose column labels are keys of ``make_code_lookup``.
        soc_level: level passed to the lookup helpers and used in the output path.
        uplevel: number of leading characters of each label that define a group.
        prod: sub-directory name under ``soc-{soc_level}``.

    Returns:
        None. The only effect is the files (and output directory) created on disk.
    """
    import os  # local import so this cell does not depend on the notebook's import cell

    soc_df = flip_df_to_socs(df, soc_level)
    title_lookup = make_title_lookup(soc_level)

    # to_csv does not create missing parent directories, so make sure the target exists.
    out_dir = f'confusion-matrices/soc-{soc_level}/{prod}/aggs'
    os.makedirs(out_dir, exist_ok=True)

    # Iterating a DataFrame yields its column labels; spell that out explicitly.
    for key, codes in collect_socs(soc_df.columns, uplevel).items():
        mini_df = soc_df.loc[codes, codes]
        mini_df.columns = [title_lookup[c] for c in mini_df.columns]
        # Row labels are dropped: the file carries only the relabelled header row.
        mini_df = mini_df.reset_index(drop=True)
        mini_df.to_csv(f'{out_dir}/{basename}-agg-level-{uplevel}-{key}.csv', index=False)


# Export the per-group sub-matrices of the loaded confusion matrix, grouped on the first label character.
print_confusion_matrix_by_uplevel(
    basename='sentencespace_100_indeed-lr',
    df=df,
    soc_level=3,
    uplevel=1,
    prod='withprod',
)

In [10]:

# Emit the recall breakdown as LaTeX subtables (see the captured output below).
print_tables(format_dfs('recall', sdf, df, idx, code_lookup, count_lookup, test_count), 'recall')        

\begin{subtable}[t]{\linewidth}
\caption{ Supervisors of Construction and Extraction Workers (SOC 471) (77/46) }
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
    Classified Occupation     & SOC & Proportion & DOT/Test  \\
\hline
\hline
  Construction Trades Workers & 472 &       0.39 &  201/494  \\
 Other Management Occupations & 119 &       0.20 &  213/773  \\
Other Installation, Maintena… & 499 &       0.04 &  344/136  \\
Business Operations Speciali… & 131 &       0.04 &  115/925  \\
 Helpers, Construction Trades & 473 &       0.04 &   31/104  \\
\hline
\end{tabular*}
\end{subtable}
\vspace{5mm}


\begin{subtable}[t]{\linewidth}
\caption{ Supervisors of Office and Administrative Support… (SOC 431) (106/43) }
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
    Classified Occupation     & SOC & Proportion & DOT/Test  \\
\hline
\hline
        Financial Specialists & 132 &       0.16 &  33/1034  \\
             Financial Clerks & 433 &       0.

In [11]:
# Emit the precision breakdown as LaTeX subtables (see the captured output below).
# NOTE(review): the first table in the output shows "nan" proportions — presumably a class with no predictions; confirm.
print_tables(format_dfs('precision', sdf, df, idx, code_lookup, count_lookup, test_count), 'precision')

\begin{subtable}[t]{\linewidth}
\caption{ Supervisors of Installation, Maintenance, and Re… (SOC 491) (94/23) }
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
       True Occupation        & SOC & Proportion & DOT/Test  \\
\hline
\hline
Military Enlisted Tactical O… & 553 &        nan &     40/8  \\
Supervisors of Protective Se… & 331 &        nan &    31/13  \\
Entertainers and Performers,… & 272 &        nan &    92/51  \\
Media and Communication Work… & 273 &        nan &   50/278  \\
Media and Communication Equi… & 274 &        nan &    56/42  \\
\hline
\end{tabular*}
\end{subtable}
\vspace{5mm}


\begin{subtable}[t]{\linewidth}
\caption{ Communications Equipment Operators (SOC 432) (13/6) }
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
       True Occupation        & SOC & Proportion & DOT/Test  \\
\hline
\hline
Other Office and Administrat… & 439 &       0.30 &  154/303  \\
Material Recording, Scheduli… & 435 &       0.20 &  197/705  \\


In [29]:
# F1 computed three ways from the confusion matrix: micro, macro, and weighted-macro.
# NOTE(review): the next cell runs the identical expression with a different result, so `df`
# presumably held a different matrix when each ran — record which file produced which numbers.
f1(*micro(df)), f1(*macro(df, 'macro')), f1(*macro(df, 'weighted'))

(0.4480758680147347, 0.6518370133174584, 0.5242069823358642)

In [31]:
# US SS
# Micro, macro, and weighted-macro F1 for whatever confusion matrix `df` held when this cell ran.
# NOTE(review): "US SS" presumably names the run (US data, sentence-space model) — confirm and load that matrix explicitly.
f1(*micro(df)), f1(*macro(df, 'macro')), f1(*macro(df, 'weighted'))

(0.4880787295906956, 0.36041481773025075, 0.5159941375489523)

In [None]:
# NOTE(review): unexecuted cell. It passes four arguments to format_dfs, whereas the executed
# recall cell earlier in this notebook also passes code_lookup, count_lookup and test_count —
# confirm which signature is current, then update or delete this cell.
print_tables(format_dfs('recall', sdf, df, idx), 'recall')        

In [None]:
# NOTE(review): unexecuted cell. It passes four arguments to format_dfs, whereas the executed
# precision cell earlier in this notebook also passes code_lookup, count_lookup and test_count —
# confirm which signature is current, then update or delete this cell.
print_tables(format_dfs('precision', sdf, df, idx), 'precision')

In [206]:
# Load the test set from the US CSV; the call returns features, labels and ids.
# NOTE(review): 100000 is presumably a row/sample limit and 3 the SOC level (cf. SOC_LEVEL) — confirm against indeed_test_data.
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', 100000, 3)

In [207]:
# Pair each test input with its label in a two-column frame ('X', 'y').
test_data = pd.DataFrame({'X': X_test, 'y': y_test})