In [20]:
import pandas as pd
import numpy as np
import re
from statsmodels.iolib.summary2 import _df_to_simpletable

In [4]:
def counts(arr, i):
    """Return ``(tp, fp, fn)`` for class ``i`` of a confusion matrix.

    ``arr`` is indexed as ``arr[row, col]``. The diagonal cell is the
    true-positive count, the rest of column ``i`` is counted as false
    positives and the rest of row ``i`` as false negatives.
    """
    diagonal = arr[i, i]
    column_total = np.sum(arr[:, i])
    row_total = np.sum(arr[i, :])
    return diagonal, column_total - diagonal, row_total - diagonal

def prec(tp, fp):
    """Precision ``tp / (tp + fp)``; ``0.`` when nothing was predicted."""
    predicted = tp + fp
    if predicted > 0:
        return tp / predicted
    return 0.

def recall(tp, fn):
    """Recall ``tp / (tp + fn)``; ``0.`` when the class has no samples."""
    actual = tp + fn
    if actual > 0:
        return tp / actual
    return 0.

def scores_and_weights(df):
    """Per-class ``(tp, fp, fn)`` counts and support weights.

    ``df`` is a square confusion matrix (row = true class, column =
    predicted class). Classes whose row sums to zero are left out of the
    result.

    The counts are computed on the *full* matrix and filtered afterwards.
    Dropping the empty rows first (while keeping every column) would make
    the matrix non-square and shift the diagonal, so every class after a
    dropped row would be scored against the wrong column.

    Returns
    -------
    c : list of (tp, fp, fn) tuples, one per class with non-zero support,
        in original row order.
    weights : numpy array, each kept class's row total divided by the
        grand total of the matrix.
    """
    m = df.values
    support = m.sum(1)
    keep = np.flatnonzero(support != 0)

    tp = np.diag(m)
    fp = m.sum(0) - tp   # predicted as the class but belonging elsewhere
    fn = support - tp    # belonging to the class but predicted elsewhere

    c = [(tp[i], fp[i], fn[i]) for i in keep]
    weights = support[keep] / m.sum()
    return c, weights

def micro(df):
    """Micro-averaged ``(precision, recall)`` for a confusion matrix.

    The per-class (tp, fp, fn) counts from ``scores_and_weights`` are
    summed over all classes before the two ratios are taken.
    """
    per_class, _unused_weights = scores_and_weights(df)
    totals = np.array(per_class).sum(0)
    tp, fp, fn = totals
    return tp / (tp + fp), tp / (tp + fn)

def macro(df, mode='weighted'):
    """Per-class precision/recall, optionally averaged.

    mode:
      'raw'      -> the two per-class arrays, un-averaged
      'weighted' -> averages weighted by the weights from
                    ``scores_and_weights``
      anything else (e.g. 'macro') -> plain unweighted means
    """
    per_class, class_weights = scores_and_weights(df)
    precisions = np.array([prec(tp, fp) for tp, fp, _ in per_class])
    recalls = np.array([recall(tp, fn) for tp, _, fn in per_class])

    if mode == 'raw':
        return precisions, recalls
    if mode == 'weighted':
        return precisions.dot(class_weights), recalls.dot(class_weights)
    return np.mean(precisions), np.mean(recalls)

In [21]:
def get_percentage(score, t, s):
    """Divide ``s`` by the total belonging to class ``t``.

    score : 'recall' divides by the row total of ``t`` (``get_trues``);
            'precision' divides by the column total (``get_classified``).
    t     : class name, resolved to a position with ``get_idx``.
    s     : value(s) to normalise; returned as ``s / total``.

    Raises ValueError for any other ``score`` (previously this surfaced as
    an UnboundLocalError on ``tot``).

    Relies on the notebook-level helpers ``get_idx``, ``get_trues`` and
    ``get_classified`` being defined before it is called.
    """
    # Validate first so a typo fails with a clear message before any lookup.
    if score not in ('recall', 'precision'):
        raise ValueError(
            "score must be 'recall' or 'precision', got %r" % (score,))

    i = get_idx(t)
    tot = get_trues(i) if score == 'recall' else get_classified(i)
    return s / tot

def get_score(t, sdf, score):
    """Look up the ``score`` column of ``sdf`` for the class named ``t``.

    The class name is turned into an index with the notebook-level
    ``get_idx`` helper, then used to index the selected column.
    """
    position = get_idx(t)
    column = sdf[score]
    return column[position]

def map_series(ser, df):
    """Return ``ser`` re-indexed by column name instead of column position.

    Each entry of ``ser.index`` is used as a position into ``df.columns``
    and replaced by the label found there.

    A copy is relabelled so the caller's Series is left untouched; mutating
    it in place made a second call on the same object fail, because its
    index no longer held positions.
    """
    relabelled = ser.copy()
    relabelled.index = df.columns[ser.index]
    return relabelled

def single_tabular(s):
    """Merge consecutive LaTeX tabular environments into one.

    Every line that mentions ``begin{tabular}`` or ``end{tabular}`` is
    located; the first and the last such line are kept and all the ones in
    between are dropped. Other lines pass through unchanged.
    """
    marker = 'begin{tabular}|end{tabular}'
    lines = s.split('\n')
    hits = [n for n, line in enumerate(lines)
            if re.search(marker, line) is not None]
    interior = set(hits[1:-1])
    kept = [line for n, line in enumerate(lines) if n not in interior]
    return '\n'.join(kept)

def print_tables(x):
    """Print each ``(title, score, series)`` entry of ``x`` as a LaTeX table.

    For every entry the title, the score formatted with two decimals and a
    LaTeX tabular of the series are printed, followed by blank lines. The
    single table column is always labelled 'count', whatever the values
    in the series represent.
    """
    for heading, value, series in x:
        frame = pd.DataFrame(series)
        frame.columns = ['count']
        simple = _df_to_simpletable(frame, float_format="%.2f")
        # Collapse the separate tabular environments into a single one.
        latex = single_tabular(simple.as_latex_tabular())
        print(heading)
        print('%.2f' % value)
        print(latex)
        print('\n')

def format_dfs(score, sdf, df, idx, n_classes=5, n_confusions=5):
    """Summarise the largest confusions of the worst-scoring classes.

    score        : column of ``sdf`` to rank by ('recall' or 'precision').
    sdf          : per-class score frame; its index labels are used as
                   row/column positions into ``df``.
    df           : confusion matrix (row = true class, column = predicted).
    idx          : boolean mask over ``sdf`` selecting the classes to rank.
    n_classes    : how many lowest-scoring classes to report (default 5).
    n_confusions : how many off-diagonal entries to keep per class
                   (default 5).

    For 'precision' the class's *column* is inspected (which true classes
    were predicted as it); for any other score its *row* is inspected.

    Returns a list of ``(class name, score, fractions)`` where ``fractions``
    is the Series of the largest off-diagonal cells normalised by
    ``get_percentage``.
    """
    worst = sdf[idx].sort_values(score).index[:n_classes]

    if score == 'precision':
        # Column entries are indexed by row position; relabel with names.
        x = [(df.columns[i],
              map_series(df.iloc[:, i].sort_values(ascending=False), df))
             for i in worst]
    else:
        x = [(df.columns[i], df.iloc[i, :].sort_values(ascending=False))
             for i in worst]

    # Remove the diagonal
    x = [(t, s[s.index != t][:n_confusions]) for t, s in x]

    # Get score and percentage
    return [(t, get_score(t, sdf, score), get_percentage(score, t, s))
            for t, s in x]

In [22]:
# Confusion matrix: row i / column i are paired with df.columns[i] below.
df = pd.read_csv('confusion-matrices/soc-3/embed-lr.csv')

# Only classes whose row total exceeds this threshold are ranked.
MIN_SUPPORT = 150
idx = df.sum(axis=1) > MIN_SUPPORT

# Per-class (tp, fp, fn) and the precision/recall derived from them.
scores = [counts(df.values, i) for i in range(len(df))]
sdf = pd.DataFrame({'precision': [prec(tp,fp) for tp,fp,fn in scores], 'recall': [recall(tp,fn) for tp,fp,fn in scores]})


def get_idx(t):
    """Position of the first column of ``df`` whose label equals ``t``."""
    return np.argwhere(t == df.columns)[0][0]


def get_trues(i):
    """Sum of row ``i`` of ``df``."""
    return df.iloc[i,:].sum()


def get_classified(i):
    """Sum of column ``i`` of ``df``."""
    return df.iloc[:,i].sum()

In [23]:
# Among the classes selected by idx, take the lowest-recall ones and print,
# for each, its largest off-diagonal row entries as a share of the row total.
print_tables(format_dfs('recall', sdf, df, idx))        

Supervisors of Sales Workers
0.08
\begin{center}
\begin{tabular}{lc}
\hline
                                & count  \\
\hline
\hline
Other Management Occupations    &  0.12  \\
Business Operations Specialists &  0.10  \\
Retail Sales Workers            &  0.10  \\
Sales Representatives, Services &  0.10  \\
Other Sales and Related Workers &  0.08  \\
\hline
\end{tabular}
\end{center}


Information and Record Clerks
0.11
\begin{center}
\begin{tabular}{lc}
\hline
                                             & count  \\
\hline
\hline
Financial Specialists                        &  0.25  \\
Baggage Porters, Bellhops, and Concierges    &  0.17  \\
Entertainment Attendants and Related Workers &  0.09  \\
Financial Clerks                             &  0.06  \\
Sales Representatives, Services              &  0.04  \\
\hline
\end{tabular}
\end{center}


Operations Specialties Managers
0.16
\begin{center}
\begin{tabular}{lc}
\hline
                                             & count  \\
\hlin

In [24]:
# Same report ranked by precision: for each low-precision class, its largest
# off-diagonal column entries as a share of the column total.
print_tables(format_dfs('precision', sdf, df, idx))

Information and Record Clerks
0.21
\begin{center}
\begin{tabular}{lc}
\hline
                                                & count  \\
\hline
\hline
Financial Specialists                           &  0.11  \\
Other Sales and Related Workers                 &  0.11  \\
Business Operations Specialists                 &  0.11  \\
Financial Clerks                                &  0.11  \\
Other Office and Administrative Support Workers &  0.07  \\
\hline
\end{tabular}
\end{center}


Other Production Occupations
0.23
\begin{center}
\begin{tabular}{lc}
\hline
                                                                      & count  \\
\hline
\hline
Other Food Preparation and Serving Related Workers                    &  0.15  \\
Material Recording, Scheduling, Dispatching, and Distributing Workers &  0.11  \\
Material Moving Workers                                               &  0.10  \\
Computer Occupations                                                  &  0.04  \\
Other Healthc