In [81]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
from statsmodels.iolib.summary2 import _df_to_simpletable, _formatter
from statsmodels.iolib.table import SimpleTable
from validation.dot_data import get_dictionary
from toolz import curry
from statsmodels.iolib.tableformatting import fmt_latex, fmt_txt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
def counts(arr, i):
    """Return ``(tp, fp, fn)`` for class ``i`` of a confusion matrix.

    The diagonal entry ``arr[i, i]`` is the true-positive count; the rest of
    column ``i`` counts as false positives and the rest of row ``i`` as
    false negatives.
    """
    hits = arr[i, i]
    column_total = np.sum(arr[:, i])
    row_total = np.sum(arr[i, :])
    return hits, column_total - hits, row_total - hits

def prec(tp, fp):
    """Precision ``tp / (tp + fp)``; returns 0.0 when ``tp + fp`` is not positive."""
    predicted = tp + fp
    if predicted > 0:
        return tp / predicted
    return 0.

def recall(tp, fn):
    """Recall ``tp / (tp + fn)``; returns 0.0 when ``tp + fn`` is not positive."""
    actual = tp + fn
    if actual > 0:
        return tp / actual
    return 0.

def scores_and_weights(df):
    """Per-class ``(tp, fp, fn)`` counts and support weights for a confusion matrix.

    ``df`` is a square confusion matrix read the same way ``counts`` reads it:
    the diagonal holds true positives, the rest of a column is false
    positives and the rest of a row is false negatives.

    Classes whose row sums to zero (no support) are left out of the result.

    Returns
    -------
    c : list of (tp, fp, fn) tuples, one per class with non-zero support,
        in original class order.
    weights : ndarray with each kept class's share of the total support.
    """
    matrix = df.values
    support = matrix.sum(1)
    present = np.flatnonzero(support != 0)

    # Work on the full square matrix and select classes afterwards. Dropping
    # rows first (while keeping every column) would shift the diagonal, so
    # the tp/fp of each class after a dropped row would come from the wrong
    # column.
    true_pos = np.diag(matrix)[present]
    false_pos = matrix.sum(0)[present] - true_pos
    false_neg = support[present] - true_pos

    weights = support[present] / support[present].sum()
    c = list(zip(true_pos, false_pos, false_neg))
    return c, weights

def micro(df):
    """Micro-averaged ``(precision, recall)`` for the confusion matrix ``df``.

    The per-class tp/fp/fn counts from ``scores_and_weights`` are summed
    across classes before the two ratios are taken.
    """
    per_class, _ = scores_and_weights(df)
    totals = np.array(per_class).sum(0)
    tp, fp, fn = totals
    return tp / (tp + fp), tp / (tp + fn)

def macro(df, mode='weighted'):
    """Per-class precision/recall for ``df``, optionally averaged.

    mode is {'weighted', 'raw', 'macro'}:
      * 'raw'      -- return the per-class precision and recall arrays.
      * 'weighted' -- average them using the support weights.
      * anything else (e.g. 'macro') -- plain unweighted mean.
    """
    per_class, weights = scores_and_weights(df)
    precisions = np.array([prec(tp, fp) for tp, fp, _ in per_class])
    recalls = np.array([recall(tp, fn) for tp, _, fn in per_class])

    if mode == 'raw':
        return precisions, recalls
    if mode == 'weighted':
        return precisions.dot(weights), recalls.dot(weights)
    return np.mean(precisions), np.mean(recalls)

In [197]:
def get_percentage(score, t, s):
    """Scale ``s`` by the total count of class ``t`` for the given score type.

    Parameters
    ----------
    score : 'recall' or 'precision'. For 'recall' the divisor is
        ``get_trues(i)``; for 'precision' it is ``get_classified(i)``.
    t : class label, resolved to a position with ``get_idx``.
    s : value(s) to divide by the total.

    Raises
    ------
    ValueError
        If ``score`` is neither 'recall' nor 'precision'. Previously this
        fell through both branches and failed with an unbound local.

    Relies on the notebook-level helpers ``get_idx``, ``get_trues`` and
    ``get_classified`` being defined in the kernel.
    """
    # Validate first so a bad score is reported clearly, before any lookup.
    if score not in ('recall', 'precision'):
        raise ValueError(
            f"score must be 'recall' or 'precision', got {score!r}")

    i = get_idx(t)
    tot = get_trues(i) if score == 'recall' else get_classified(i)
    return s / tot

def get_score(t, sdf, score):
    """Look up the ``score`` column of ``sdf`` for class ``t``.

    ``t`` is turned into an integer position with the notebook-level
    ``get_idx`` helper, and that position is used to index the column.
    """
    position = get_idx(t)
    column = sdf[score]
    return column[position]

def map_series(ser, df):
    """Relabel ``ser`` in place, replacing positional index values with the
    matching column names of ``df``, and return the same object."""
    relabelled = df.columns[ser.index]
    ser.index = relabelled
    return ser

def single_tabular(s, title, score):
    """Re-wrap a LaTeX table string as a single captioned ``subtable``.

    Parameters
    ----------
    s : LaTeX source, one row per line.
    title : caption text for the subtable.
    score : unused; kept so existing callers keep working.

    Steps:
      1. Find every line containing ``begin{tabular}`` or ``end{tabular}``.
      2. Drop all such lines except the first and the last, which merges
         stacked tabular environments into one body.
      3. Drop the first two and last two remaining lines (the outer wrapper
         and the remaining begin/end tabular lines).
      4. Wrap what is left in a ``subtable`` / ``tabular*`` with a caption.
    """
    lines = s.split('\n')
    marker = re.compile('begin{tabular}|end{tabular}')
    tabulars = [n for n, line in enumerate(lines) if marker.search(line)]
    # Interior begin/end markers only; the outermost pair is removed by the
    # slice below.
    insides = set(tabulars[1:-1])
    rows = [line for n, line in enumerate(lines) if n not in insides]
    rows = rows[2:-2]

    # Raw strings: LaTeX commands such as \linewidth, \caption and \end are
    # not valid Python escape sequences.
    pre = [r'\begin{subtable}[t]{\linewidth}',
           r'\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}']

    post = [r'\end{tabular*}',
            r'\caption{{ {} }}'.format(title),
            r'\end{subtable}',
            r'\vspace{5mm}']
    return '\n'.join(pre + rows + post)

def print_tables(x, score):
    """Print each ``(title, value, frame)`` entry of ``x`` as a LaTeX subtable.

    Every frame is rendered with two-decimal floats and no index column,
    re-wrapped by ``single_tabular`` and printed, followed by a blank gap.
    The middle element of each entry is not used.
    """
    for title, _, frame in x:
        rendered = _df_to_simpletable(frame, float_format="%.2f", index=False)
        latex = single_tabular(rendered.as_latex_tabular(), title, score)
        print(latex)
        print('\n')

def make_code_lookup(SOC_LEVEL):
    """Build a ``{description: soc code}`` dict for the given SOC level.

    Takes the first row per ``soc`` value from ``get_dictionary('', SOC_LEVEL)``
    and maps its ``desc_soc{SOC_LEVEL}`` description to the ``soc`` code.
    Presumably ``get_dictionary`` returns a DataFrame carrying both columns;
    verify against ``validation.dot_data``.
    """
    # Use the same level-specific column for selecting and for indexing; a
    # hard-coded 'desc_soc3' only works when SOC_LEVEL == 3.
    desc_col = f'desc_soc{SOC_LEVEL}'
    dot_dict = get_dictionary('', SOC_LEVEL)
    di = (dot_dict[[desc_col, 'soc']]
          .groupby('soc')
          .head(1)
          .set_index(desc_col)
          .to_dict(orient='index'))
    return {k: v['soc'] for k, v in di.items()}

@curry
def truncate(lim, s):
    """Shorten ``s`` when it is longer than ``lim`` characters.

    Long strings are cut to their first ``lim - 2`` characters followed by
    a single ellipsis character; shorter strings are returned unchanged.
    Curried, so ``truncate(30)`` yields a one-argument function.
    """
    return s[0:lim-2] + chr(8230) if len(s) > lim else s

def format_scores(s, code_lookup, count_lookup, test_count):
    """Turn a Series of per-class values into a four-column display frame.

    Columns:
      * Occupation -- index labels of ``s`` truncated to 30 characters.
      * Percentage -- the values of ``s``.
      * DOT/Test   -- ``count_lookup[code] / test_count[label]`` as text.
      * SOC        -- the code from ``code_lookup`` as text.
    """
    labels = s.index
    dot_counts = [str(count_lookup[code_lookup[label]]) for label in labels]
    test_counts = [str(test_count[label]) for label in labels]

    columns = {
        'Occupation': labels.map(truncate(30)),
        'Percentage': s.values,
        'DOT/Test': ['/'.join(pair) for pair in zip(dot_counts, test_counts)],
        'SOC': [str(code_lookup[label]) for label in labels],
    }
    return pd.DataFrame(columns).reset_index(drop=True)

def format_dfs(score, sdf, df, idx, code_lookup, count_lookup, test_count):
    """Build the table data for the five lowest-scoring classes.

    For each of the five classes (among ``sdf[idx]``) with the lowest
    ``score``, take the five largest entries of that class's row of ``df``
    (or its column, when ``score == 'precision'``), convert them to
    fractions with ``get_percentage`` and format them with ``format_scores``.

    Returns a list of ``(title, score value, formatted DataFrame)`` tuples.

    Side effect: ``df.index`` is overwritten in place with ``df.columns``.
    """
    worst = sdf[idx].sort_values(score).index[0:5]
    # Column slices take their labels from df.index, so relabel the rows with
    # the class names (in place, on the caller's frame).
    df.index = df.columns

    if score == 'precision':
        top = [(df.columns[i], df.iloc[:, i].sort_values(ascending=False)[0:5])
               for i in worst]
    else:
        top = [(df.columns[i], df.iloc[i, :].sort_values(ascending=False)[0:5])
               for i in worst]

    # Attach each class's own score and turn the raw counts into fractions.
    scored = [(label, get_score(label, sdf, score), get_percentage(score, label, series))
              for label, series in top]

    def format_title(t):
        return f'{code_lookup[t]} - ({count_lookup[code_lookup[t]]}/{test_count[t]}) - {truncate(50, t)}'

    return [(format_title(label), value,
             format_scores(fractions, code_lookup, count_lookup, test_count))
            for label, value, fractions in scored]

def get_scores(country):
    """Load the confusion matrix for ``country`` and return its F1 scores.

    Returns a Series with entries 'micro', 'macro' and 'weighted-macro'.
    """
    df = pd.read_csv(f'confusion-matrices/soc-3/sentencespace_100_{country}.csv')
    return pd.Series({
        'micro': f1(*micro(df)),
        'macro': f1(*macro(df, 'macro')),
        'weighted-macro': f1(*macro(df, 'weighted')),
    })

def f1(p, r):
    """Harmonic mean ``2*p*r / (p + r)`` of precision ``p`` and recall ``r``.

    For scalar inputs with ``p + r == 0`` the score is returned as 0.0
    instead of dividing by zero, matching how ``prec`` and ``recall`` treat
    undefined ratios. Array inputs are computed element-wise with no guard.
    """
    total = p + r
    if np.ndim(total) == 0 and total == 0:
        return 0.
    return 2 * p * r / total

In [202]:
# Put the training inputs and labels side by side so rows can be filtered by
# label. X_train / y_train are assigned by the dot_train_data cell further
# down the notebook, so that cell must run first on a fresh kernel.
train_data = pd.DataFrame({'X': X_train, 'y': y_train})

In [None]:
# Show the training inputs whose label is 473.
# NOTE(review): 473 is a hard-coded label of interest -- confirm it is still current.
train_data.X[train_data.y == 473]

In [None]:
# Show the test inputs whose label is 473. test_data is built in a cell
# further down the notebook, which must run before this one.
test_data.X[test_data.y == 473]

In [200]:
# Print LaTeX subtables for the five lowest-recall classes. sdf, df, idx,
# code_lookup, count_lookup and test_count all come from cells that appear
# later in the notebook.
print_tables(format_dfs('recall', sdf, df, idx, code_lookup, count_lookup, test_count), 'recall')        

\begin{subtable}[t]{\linewidth}
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
          Occupation          & Percentage & DOT/Test & SOC  \\
\hline
\hline
         Retail Sales Workers &       0.36 & 190/1371 & 412  \\
Other Sales and Related Work… &       0.10 & 192/2130 & 419  \\
       Art and Design Workers &       0.08 & 253/1614 & 271  \\
Entertainers and Performers,… &       0.07 &  362/282 & 272  \\
Business Operations Speciali… &       0.07 & 746/4363 & 131  \\
\hline
\end{tabular*}
\caption{ 411 - (65/2049) - Supervisors of Sales Workers }
\end{subtable}
\vspace{5mm}


\begin{subtable}[t]{\linewidth}
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
          Occupation          & Percentage &  DOT/Test & SOC  \\
\hline
\hline
  Construction Trades Workers &       0.59 &  815/2525 & 472  \\
Other Installation, Maintena… &       0.12 &   883/674 & 499  \\
Vehicle and Mobile Equipment… &       0.08 &  385/1206 & 493  \\
 Helpers, Constru

In [199]:
# Print LaTeX subtables for the five lowest-precision classes. sdf, df, idx,
# code_lookup, count_lookup and test_count all come from cells that appear
# later in the notebook.
print_tables(format_dfs('precision', sdf, df, idx, code_lookup, count_lookup, test_count), 'precision')

\begin{subtable}[t]{\linewidth}
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
          Occupation          & Percentage & DOT/Test & SOC  \\
\hline
\hline
  Construction Trades Workers &       0.23 & 815/2525 & 472  \\
Other Installation, Maintena… &       0.18 &  883/674 & 499  \\
  Grounds Maintenance Workers &       0.08 &  91/1016 & 373  \\
Electrical and Electronic Eq… &       0.06 &  339/336 & 492  \\
Material Recording, Scheduli… &       0.05 & 471/2915 & 435  \\
\hline
\end{tabular*}
\caption{ 499 - (883/674) - Other Installation, Maintenance, and Repair Occu… }
\end{subtable}
\vspace{5mm}


\begin{subtable}[t]{\linewidth}
\begin{tabular*}{\textwidth}{l @{\extracolsep{\fill}} c c c}
\hline
          Occupation          & Percentage & DOT/Test & SOC  \\
\hline
\hline
Other Food Preparation and S… &       0.54 &  70/3031 & 359  \\
Food and Beverage Serving Wo… &       0.20 &  138/767 & 353  \\
         Retail Sales Workers &       0.06 & 190/1371 & 412  \\


In [131]:
# SOC level used when building the description -> code lookup.
SOC_LEVEL=3

code_lookup = make_code_lookup(SOC_LEVEL)

# df must already be loaded (it is read from CSV in a separate cell).
# Relabel its rows with the column names, in place, so the row totals below
# are keyed by class name.
df.index = df.columns
# Row totals of df, one per class.
test_count = df.sum(1)

In [191]:
df = pd.read_csv('confusion-matrices/soc-3/sentencespace_100_us-lr.csv')
# The default integer row index is kept here (no relabel to column names);
# presumably so the mask below lines up with sdf's default index -- confirm.
# Mask of rows whose total exceeds 100.
idx = df.sum(1) > 100

# Per-class (tp, fp, fn) triples, then per-class precision and recall.
scores = [counts(df.values, i) for i in range(len(df.values))]
sdf = pd.DataFrame({
    'precision': [prec(tp, fp) for tp, fp, _ in scores],
    'recall': [recall(tp, fn) for tp, _, fn in scores],
})


def get_idx(t):
    """Position of class label ``t`` among the columns of the global ``df``."""
    return np.argwhere(t == df.columns)[0][0]


def get_trues(i):
    """Sum of row ``i`` of the global ``df``."""
    return df.iloc[i, :].sum()


def get_classified(i):
    """Sum of column ``i`` of the global ``df``."""
    return df.iloc[:, i].sum()

In [None]:
# One row of F1 scores per country, written out as a CSV keyed by country.
countries = ['us', 'india', 'uk']
pd.DataFrame(
    list(map(get_scores, countries)),
    index=countries,
).to_csv('confusion-matrices/soc-3/scores.csv', index_label='country')

In [None]:
# Micro, macro and weighted-macro F1 for the UK confusion matrix.
# Note: this rebinds the global df, which get_idx / get_trues /
# get_classified read at call time.
df = pd.read_csv('confusion-matrices/soc-3/sentencespace_100_uk.csv')
f1(*micro(df)), f1(*macro(df, 'macro')), f1(*macro(df, 'weighted'))

In [170]:
# Micro, macro and weighted-macro F1 for the us-lr confusion matrix.
# Note: this rebinds the global df, which get_idx / get_trues /
# get_classified read at call time.
df = pd.read_csv('confusion-matrices/soc-3/sentencespace_100_us-lr.csv')
f1(*micro(df)), f1(*macro(df, 'macro')), f1(*macro(df, 'weighted'))

(0.4880787295906956, 0.36041481773025075, 0.5159941375489523)

In [None]:
# format_dfs requires the three lookup tables as well; without them this call
# raises TypeError.
print_tables(format_dfs('recall', sdf, df, idx, code_lookup, count_lookup, test_count), 'recall')

In [None]:
# format_dfs requires the three lookup tables as well; without them this call
# raises TypeError.
print_tables(format_dfs('precision', sdf, df, idx, code_lookup, count_lookup, test_count), 'precision')

In [104]:
from validation.data import indeed_test_data, dot_train_data, get_soc_n

# Training inputs and labels at the configured SOC level. SOC_LEVEL is set in
# a separate cell and must be defined before this runs.
X_train, y_train = dot_train_data(SOC_LEVEL, include_tasks=True)

In [206]:
# Test inputs, labels and ids read from the US data file.
# NOTE(review): the literal 3 looks like it should track SOC_LEVEL, and
# 100000 looks like a row/sample limit -- confirm against indeed_test_data.
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', 100000, 3)

In [207]:
# Put the test inputs and labels side by side so rows can be filtered by label.
test_data = pd.DataFrame({'X': X_test, 'y': y_test})

In [109]:
# Map each training label to the number of times it occurs in y_train.
# format_scores / format_dfs use this as the first figure of the "DOT/Test" pair.
count_lookup = y_train.value_counts().to_dict()