# Confusion Matrices

In [29]:
import numpy as np
import pandas as pd

from sklearn.metrics import classification_report, confusion_matrix
from validation.dot_data import get_dictionary
from validation.data import indeed_test_data, dot_train_data

def print_confusion_matrices(model_names, preds, y_test, SOC_LEVEL):
    """Write one confusion-matrix CSV per model.

    For every ``(name, predictions)`` pair a confusion matrix of ``y_test``
    against the predictions is computed over the SOC codes present in the
    dictionary returned by ``get_dictionary`` and written to
    ``confusion-matrices/soc-<SOC_LEVEL>/<name>.csv``.

    Args:
        model_names: Iterable of model names; each one becomes a file name.
        preds: Iterable of predicted labels, paired positionally with
            ``model_names``.
        y_test: True labels every prediction set is compared against.
        SOC_LEVEL: SOC level used to load the dictionary, to select the
            ``desc_soc<SOC_LEVEL>`` description column and to build the
            output directory name.
    """
    import os

    dot_dict = get_dictionary('', SOC_LEVEL)
    # Keep the first row of every 'soc' group: one description per SOC code.
    un = dot_dict.groupby('soc').apply(lambda df: df.head(1))
    category_names = un[f'desc_soc{SOC_LEVEL}']
    labels = un.soc
    for name, p in zip(model_names, preds):
        # `labels` is keyword-only in current scikit-learn releases; passing
        # it positionally raises a TypeError there.
        matrix = confusion_matrix(y_test, p, labels=labels)
        df = pd.DataFrame(matrix, index=category_names, columns=category_names)
        filename = f'confusion-matrices/soc-{SOC_LEVEL}/{name}.csv'
        # to_csv does not create missing directories on its own.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        # index=False: rows are written without their labels; they follow
        # the same order as the column headers.
        df.to_csv(filename, index=False)

In [26]:
# Export SOC level 3 confusion matrices for each per-country model.
SOC_LEVEL = 3
for country in ['uk', 'us', 'india']:
    model = f'sentencespace_100_{country}'
    # Load the stored predictions for this model and cast them to int.
    preds = np.load(f'ss_models/{model}/predictions-63.npy').astype(int)
    # Only the second value returned by indeed_test_data is used here.
    _, y_test, _ = indeed_test_data(
        f'data/{country}/everything.csv', 500000, SOC_LEVEL
    )
    print_confusion_matrices([model], [preds], y_test, SOC_LEVEL)

# Make Predictions

In [None]:
def make_predictions_df(df, preds, soc_level=None):
    """Attach predicted SOC codes and their descriptions to ``df``.

    Args:
        df: DataFrame with one row per prediction. A ``content`` column, if
            present, is renamed to ``description`` in the result.
        preds: Predicted SOC codes, one per row of ``df``.
        soc_level: SOC level used for the dictionary lookup and for the
            ``predicted_soc<level>`` / ``desc_soc<level>`` column names.
            When omitted, the module-level ``SOC_LEVEL`` is used, which is
            the behaviour of the original two-argument call.

    Returns:
        A new DataFrame: the columns of ``df`` followed by
        ``predicted_soc<level>`` and ``desc_soc<level>``.

    Raises:
        ValueError: If ``preds`` and ``df`` differ in length.
    """
    if soc_level is None:
        # Backward-compatible fallback for callers relying on the global.
        soc_level = SOC_LEVEL
    key = f'predicted_soc{soc_level}'
    desc = f'desc_soc{soc_level}'
    dot_dict = get_dictionary('', soc_level)
    # One description per SOC code.
    dd = dot_dict.groupby('soc').first()[desc].reset_index()
    found = (pd.DataFrame({key: preds})
             .merge(dd, how='left', left_on=key, right_on='soc')
             .drop(columns='soc'))
    # concat(axis=1) aligns on index labels; reuse df's index so predictions
    # stay attached to their own rows even when df has no default RangeIndex.
    found.index = df.index
    return (pd.concat([df, found], axis=1)
            .rename(columns={'content': 'description'}))


def print_predictions(model, df, outpath, SOC_LEVEL, model_name):
    """Fit ``model`` on the DOT training data and save predictions for ``df``.

    The output is written to
    ``<outpath>/<model_name>-soc<SOC_LEVEL>-predictions.csv``.

    Args:
        model: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        df: DataFrame with a ``content`` column to predict on.
        outpath: Directory the CSV is written to; created if missing.
        SOC_LEVEL: SOC level used for the training data, the description
            lookup and the output file name.
        model_name: Name used as the prefix of the output file.
    """
    import os

    X_train, y_train = dot_train_data(SOC_LEVEL)
    all_preds = model.fit(X_train, y_train).predict(df.content)
    filename = f'{outpath}/{model_name}-soc{SOC_LEVEL}-predictions.csv'
    # Pass the level explicitly so the description lookup matches the level
    # the model was trained on, whatever the global SOC_LEVEL currently is.
    predictions = make_predictions_df(df, all_preds, soc_level=SOC_LEVEL)
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    predictions.to_csv(filename, index=False)

def make_and_print_preds(country, model_name, soc_level=3):
    """Build the embedding + logistic-regression pipeline and save predictions.

    Reads ``data/<country>/everything.csv``, builds a two-step pipeline and
    delegates fitting, predicting and writing to ``print_predictions``, which
    writes into the ``predictions`` directory.

    NOTE(review): ``Pipeline``, ``PreEmbeddedVectorizer`` and
    ``LogisticRegression`` are not imported in the visible part of this
    file; they must already be in scope when this function is called.

    Args:
        country: Sub-directory of ``data/`` holding ``everything.csv``.
        model_name: Name of the model directory under ``./ss_models`` and
            prefix of the output file.
        soc_level: SOC level to train and predict at. Defaults to 3, the
            value that was previously hard-coded.
    """
    df = pd.read_csv(f'data/{country}/everything.csv')
    model_dir = f'./ss_models/{model_name}'
    # The vectorizer receives the embeddings file, the model path, the number
    # of rows in df and the constant 100.
    # NOTE(review): 100 presumably is the embedding dimension - confirm.
    vectorizer = PreEmbeddedVectorizer(f'{model_dir}/embeds.txt',
                                       f'{model_dir}/model',
                                       df.shape[0],
                                       100)
    classifier = LogisticRegression(C=5., solver='newton-cg',
                                    multi_class="multinomial", n_jobs=-1)
    model = Pipeline([(model_name, vectorizer), ('lr', classifier)])

    print_predictions(model, df, 'predictions', soc_level, model_name)

In [None]:
# Notebook-level SOC level setting.
SOC_LEVEL = 6

# Produce a predictions file for each country's sentencespace model.
for country in ('us', 'uk', 'india'):
    make_and_print_preds(country, 'sentencespace_100_{}'.format(country))