In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.dot_data import LemmaTokenizer, get_dictionary
from classification.embedding import PreEmbeddedVectorizer, Embedding, WordEmbeddingVectorizer

import matplotlib.pyplot as plt
# Default size applied to every matplotlib figure drawn in this notebook.
plt.rcParams['figure.figsize'] = [15, 10]

# pandas display options: limit on the text shown per column, and overall display width.
pd.set_option('max_colwidth',50)
pd.set_option('display.width',200)

In [2]:
# SOC level passed to dot_train_data when loading the training sets below.
SOC_LEVEL = 6

In [3]:
# Training texts and labels loaded with task descriptions switched off (include_tasks=False).
X_train, y_train = dot_train_data(SOC_LEVEL, include_tasks=False)

In [66]:
# Same loader with the opposite selection: tasks switched on, DOT switched off.
X_train_tasks, y_train_tasks = dot_train_data(SOC_LEVEL, include_tasks=True, include_dot=False)

In [74]:
# Number of distinct labels in y_train.
y_train.unique().shape

(793,)

In [73]:
# Number of distinct labels in y_train_tasks.
y_train_tasks.unique().shape

(775,)

In [71]:
# How many labels in the task data have at most 5 training examples.
task_label_counts = y_train_tasks.value_counts()
task_label_counts[task_label_counts <= 5].shape

(4,)

In [45]:
# How many labels in y_train have at most 3 training examples.
dot_label_counts = y_train.value_counts()
dot_label_counts[dot_label_counts <= 3].shape

(350,)

In [37]:
# `lookup` used to be defined only in a later cell, so this cell raised NameError
# under Restart & Run All. Build it here so the cell runs on a fresh kernel.
from validation.data import make_desc_lookup

lookup = make_desc_lookup('', 6)

# Show the description attached to one example code.
lookup(211011)

'Substance Abuse and Behavioral Disorder Counselors'

In [32]:
from validation.data import make_desc_lookup
from validation.dot_data import get_dictionary, LemmaTokenizer


# Callable that maps a numeric code to its description string (it is called as lookup(211011) in this notebook).
# NOTE(review): the literal 6 presumably mirrors SOC_LEVEL — confirm before changing either.
lookup = make_desc_lookup('', 6)

In [None]:
# Labels that have at least 3 training examples.
label_counts = y_train.value_counts()
goods = label_counts[label_counts >= 3].index

# Distribution of those labels over the first three characters of their codes.
code_prefixes = [str(code)[:3] for code in goods]
pd.Series(code_prefixes).value_counts()

In [4]:
def _balanced_lr():
    """Return a new LogisticRegression with the settings shared by every pipeline below."""
    return LogisticRegression(C=5., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1)


# (step name, vectorizer) pairs. Their order fixes the positions used to index `models` and `preds`.
_vectorizer_steps = [
    ('glove_100_va', WordEmbeddingVectorizer('../glove-models/glove-va-100.txt', cache_dir='va_glove_embed_cache')),
    ('sentencespace_100_va', PreEmbeddedVectorizer('../ss-models/va-ss-100', cache_dir='va_embed_cache')),
    ('sentencespace_100_indeed', PreEmbeddedVectorizer('../indeed-embeds/model', cache_dir='indeed_embed_cache')),
    ('tfidf', TfidfVectorizer(min_df=3, max_df=.8, ngram_range=(1,2))),
]

# One pipeline per vectorizer, each with its own classifier instance so that
# fitting one pipeline never touches another.
models = [Pipeline([step, ('lr', _balanced_lr())]) for step in _vectorizer_steps]

In [5]:
# WITHIN DOT ACCURACY 
from sklearn.model_selection import cross_val_predict, LeaveOneOut

def in_sample_predict(model, X_train, y_train):
    """Fit ``model`` on the given data and predict on that same data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train : training inputs, also used as the prediction inputs.
    y_train : training labels.

    Returns
    -------
    Whatever ``model.predict(X_train)`` returns.
    """
    model.fit(X_train, y_train)
    in_sample_preds = model.predict(X_train)
    return in_sample_preds

# One prediction slot per pipeline in `models`; each is filled by a cell below.
preds = [None] * 4

In [11]:
# models[0] is the 'glove_100_va' pipeline: fit it and predict on the same data.
preds[0] = in_sample_predict(models[0], X_train, y_train)
# In-sample accuracy (scored on the data the model was fitted on).
accuracy_score(y_train, preds[0])

0.5200250803354495

In [9]:
# models[1] is the 'sentencespace_100_va' pipeline: fit it and predict on the same data.
preds[1] = in_sample_predict(models[1], X_train, y_train)
# In-sample accuracy (scored on the data the model was fitted on).
accuracy_score(y_train, preds[1])

0.27968604289710913

In [6]:
# models[2] is the 'sentencespace_100_indeed' pipeline: fit it and predict on the same data.
preds[2] = in_sample_predict(models[2], X_train, y_train)
# In-sample accuracy (scored on the data the model was fitted on).
accuracy_score(y_train, preds[2])

0.4323904258626049

In [None]:
# models[3] is the 'tfidf' pipeline: fit it and predict on the same data.
preds[3] = in_sample_predict(models[3], X_train, y_train)
# In-sample accuracy (scored on the data the model was fitted on).
accuracy_score(y_train, preds[3])

In [7]:
from validation.scoring import make_code_lookup
from validation.data import make_desc_lookup

# Callable applied to each label and each prediction below to obtain its description.
# NOTE(review): built with the same arguments as `lookup` in an earlier cell — presumably a duplicate; confirm.
desc_lookup = make_desc_lookup('', 6)

In [8]:
# Compare true labels with the predictions of models[2] (slot 2 of `preds`).
model_preds = preds[2]

# One row per training text: the text, both descriptions, and both codes.
comp = pd.DataFrame({
    'content': X_train,
    'dot_desc': list(map(desc_lookup, y_train)),
    'pred_desc': list(map(desc_lookup, model_preds)),
    'dots': y_train,
    'predicted': model_preds,
})

# Mask of misclassified rows, and the matching subset of `comp`.
falses = model_preds != y_train
mistakes = comp[falses]

In [9]:
# Mean character length of the text across all rows.
comp.content.str.len().mean()

674.9191016474977

In [10]:
# Mean character length of the text across misclassified rows only, for comparison with the figure for all rows.
mistakes.content.str.len().mean()

615.4550930996714

In [11]:
# Full text of the misclassified row whose index label is 83.
mistakes.loc[83].content

'performs duties as described under apprentice any industry master title'

In [35]:
# Fraction of misclassified rows whose text contains the boilerplate phrase.
mistakes.content.str.contains('performs duties as described under').sum() / mistakes.shape[0]

0.043674698795180725

In [37]:
# Export every row (correct or not) containing the shorter phrase; the index is not written.
comp[comp.content.str.contains('duties as described under')].to_csv('weird-duties.csv', index=False)

In [38]:
# Export every row containing the longer "performs ..." variant of the phrase; the index is not written.
comp[comp.content.str.contains('performs duties as described under')].to_csv('weird-performs-duties.csv', index=False)

In [14]:
# Raw array of every misclassified text.
# NOTE(review): this displays the whole array; consider a .head()/.sample() view to keep the output small.
mistakes.content.values

array(['directs and coordinates through subordinate supervisors department activities in commercial industrial or service establishment reviews and analyzes reports records and directives and confers with supervisors to obtain data required for planning department activities such as new commitments status of work in progress and problems encountered\tassigns or delegates responsibility for specified work or functional activities and disseminates policy to supervisors\tgives work directions resolves problems prepares schedules and sets deadlines to ensure timely completion of work\tcoordinates activities of department with related activities of other departments to ensure efficiency and economy\tmonitors and analyzes costs and prepares budget using computer\tprepares reports and records on department activities for management using computer\tevaluates current procedures and practices for accomplishing department objectives to develop and implement improved procedures and practices\tmay 

In [31]:
# All training texts whose label equals 119111.
X_train[y_train == 119111].values

array(['administers nursing program in hospital nursing home or other medical facility to maintain standards of patient care and advises medical staff department heads and administrators in matters related to nursing service recommends establishment or revision of policies and develops organizational structure and standards of performance\tinterprets policies and objectives of nursing service to staff and community groups\tpromotes working relationships with community agencies and with other establishment departments\tassists in preparation of departmental budget\testablishes personnel qualification requirements drafts procedure manuals initiates in service programs installs record and reporting system and performs other personnel management tasks\tinitiates studies to evaluate effectiveness of nursing services in relation to their objectives and costs\tmay assist nursing schools with curricular problems',
       'plans with management medical director and legal counsel scope and objec

In [27]:
# Full text of the misclassified row whose index label is 930.
mistakes.loc[930].content

'directs activities of state board of nursing develops procedures relative to licensure by examination or endorsement renewal of licenses certification to other states and discipline of licensees\tassumes responsibility for keeping board informed of relevant matter to aid in policy making and decisions\tdetermines specific needs of program and provides for written plans and statement of policies\tascertains number scope and responsibility of positions on board selects staff and delegates duties\tdirects and supervises activities of staff and evaluates performance\testablishes system of record keeping and maintains up to date comprehensive records of board activities and office procedures\testimates and submits request for budget\tcollects analyzes and prepares for publication data relating to nursing education and licensure\tinitiates and cooperates in research projects\tparticipates in interpreting nursing laws board rules and policies and trends in nursing and nursing education\tissu

In [22]:
# Misclassified rows whose index labels fall between 800 and 1200 (.loc slices by label, both ends inclusive).
mistakes.loc[800:1200]

Unnamed: 0,content,dot_desc,pred_desc,dots,predicted
820,entertains audience in amusement park by exhib...,Actors,Dancers,272011,272031
821,parades across stage to display costumes and p...,Actors,Costume Attendants,272011,393092
832,directs and coordinates activities of workers ...,Accountants and Auditors,"Education Administrators, Postsecondary",132011,119033
843,buys and sells grain on commission for custome...,"Securities, Commodities, and Financial Service...","Purchasing Agents and Buyers, Farm Products",413031,131021
847,sells livestock at stockyards as agent for own...,"Sales Representatives, Wholesale and Manufactu...",Farm and Home Management Advisors,414012,259021
848,contacts landowners to negotiate purchase of c...,"Purchasing Agents and Buyers, Farm Products",Forest and Conservation Workers,131021,454011
857,compiles and analyzes statistical data to dete...,"Business Operations Specialists, All Other","Purchasing Agents, Except Wholesale, Retail, a...",131199,131023
878,directs and coordinates vocational work progra...,Industrial Production Managers,Probation Officers and Correctional Treatment ...,113051,211092
882,recruits independent sales agents insurance in...,First-Line Supervisors/Managers of Non-Retail ...,Insurance Sales Agents,411012,413021
883,develops and conducts training programs for em...,Vocational Education Teachers Postsecondary,Training and Development Managers,251194,113042


In [13]:
def print_confusion_matrices(models, preds, y, path, SOC_LEVEL):
    """Write one confusion-matrix CSV per model.

    Despite the name, nothing is printed: each matrix is saved to disk.

    Parameters
    ----------
    models : sequence of pipelines exposing ``named_steps``; the step names
        joined with '-' form the model name used in the file name.
    preds : sequence of predicted labels, paired positionally with ``models``.
    y : true labels, compared against every entry of ``preds``.
    path : format string filled via ``path.format(SOC_LEVEL, model_name)``.
    SOC_LEVEL : passed to ``get_dictionary`` and used to select the
        ``desc_soc{SOC_LEVEL}`` column for the category names.

    Notes
    -----
    The CSV is written with ``index=False``, so only the column header carries
    the category names; rows follow the same order as the columns.
    """
    import os

    dot_dict = get_dictionary('', SOC_LEVEL)
    model_names = ['-'.join(m.named_steps.keys()) for m in models]

    # Keep the first dictionary row of every distinct `soc` value.
    un = dot_dict.groupby('soc').apply(lambda df: df.head(1))
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]

    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword: scikit-learn made it keyword-only,
        # so the positional form raises TypeError on current releases.
        df = pd.DataFrame(confusion_matrix(y, p, labels=un.soc),
                          index=category_names,
                          columns=category_names)
        filename = path.format(SOC_LEVEL, name)

        # Create the target directory on demand so a fresh checkout does not
        # fail in to_csv because the folder is missing.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        df.to_csv(filename, index=False)

In [14]:
# Save the in-sample confusion matrix for models[2] only.
# NOTE(review): the last argument is the literal 3 while the labels were loaded with SOC_LEVEL = 6 — confirm this level is intended.
print_confusion_matrices([models[2]], [preds[2]], y_train, 'confusion-matrices/insample-{}/{}.csv', 3)