In [1]:
%load_ext autoreload 
%autoreload 2

import pandas as pd
from validation.data import indeed_test_data, dot_train_data, get_soc_n
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from classification.embedding import PreEmbeddedVectorizer

from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
from validation.dot_data import get_dictionary

def make_title_lookup(path, N):
    """Build a function that maps a SOC code to its description.

    NOTE(review): a second ``make_title_lookup`` is defined further down in
    this notebook and shadows this one when cells run top to bottom.

    Args:
        path: Currently unused; ``get_dictionary`` is always called with ``''``.
        N: Selects the ``desc_soc{N}`` description column (presumably the SOC
            digit level -- confirm against ``get_dictionary``).

    Returns:
        A callable ``code -> description``. It raises ``KeyError`` for codes
        that are not present in the dictionary.
    """
    dot_codes = get_dictionary('', N).groupby('soc').first()
    # Materialise the mapping once; building it inside the returned callable
    # would redo the full conversion on every single lookup.
    titles = dot_codes[f'desc_soc{N}'].to_dict()
    return lambda code: titles[code]

def remove_qs(url):
    """Strip the query string from ``url``, keeping only the ``jk`` key.

    Args:
        url: URL to normalise. Values that are not ``str``/``bytes`` (for
            example the ``NaN`` pandas uses for a missing cell, or ``None``)
            are returned unchanged so the function can be mapped over a
            column that contains gaps.

    Returns:
        The URL with every query parameter except ``jk`` removed. If ``jk``
        occurs several times, every occurrence is kept. URLs without ``jk``
        come back with no query string at all.
    """
    # Missing values cannot be parsed as URLs; hand them back untouched
    # instead of raising from inside urlparse.
    if not isinstance(url, (str, bytes)):
        return url

    parts = urlparse(url)
    jk = parse_qs(parts.query).get('jk')
    # parse_qs yields a list per key, hence doseq=True when re-encoding.
    # An empty string (rather than None) is the documented "no query" value.
    query = urlencode({'jk': jk}, doseq=True) if jk else ''
    return urlunparse(parts._replace(query=query))

In [2]:
# Country being processed; interpolated into the data, model, cache and
# output paths used throughout this notebook.
COUNTRY = 'india'

In [36]:
# NOTE(review): this cell has an out-of-sequence execution count (In [36]).
# Within this notebook `everything` is only used by the merge in the
# `df = pd.merge(company, everything, ...)` cell, whose result is later
# overwritten -- confirm whether this load is still needed.
everything = pd.read_csv(f'data/{COUNTRY}/everything.csv')

In [150]:
# NOTE(review): out-of-sequence cell (In [150]). `company` is read again,
# without the URL normalisation below, in a later cell, so on a top-to-bottom
# run this version is overwritten before the final merge.
# company = pd.read_csv(f'matched-company/{COUNTRY}_indeed_matched123.csv', sep='\t', quoting=csv.QUOTE_ALL, encoding='utf8')
company = pd.read_csv(f'matched-company/{COUNTRY}_url_only.csv')
# Reduce each URL to its `jk` key before it is used as a join key.
# NOTE(review): the commented-out lines below guard against missing URLs;
# without a guard, a NaN in `url` is passed straight to remove_qs.
company['url'] = company.url.map(remove_qs)
# idx = ~company.url.isna()
# company.loc[idx, 'url'] = company[idx].url.map(remove_qs)

In [151]:
# Left join: keeps every row of `company`, attaching `everything` columns
# where the URL matches.
# NOTE(review): out-of-sequence cell (In [151]); `df` is reassigned by the
# later merge of `company` with `raw`, so this result is discarded on a
# top-to-bottom run.
df = pd.merge(company, everything, how='left', on='url')

In [3]:
# SOC level passed to dot_train_data for the training labels. Predictions are
# later collapsed to the first 3 characters of each class label in get_pred.
SOC_LEVEL = 6
# presumably X_train holds the texts and y_train the SOC labels -- confirm
# against validation.data.dot_train_data.
X_train, y_train = dot_train_data(SOC_LEVEL)

In [4]:
# Step 1: vectoriser that loads the country-specific sentencespace model
# (second argument 100, matching the "sentencespace_100" model name).
embedder = PreEmbeddedVectorizer(
    f'./ss_models/sentencespace_100_{COUNTRY}/model',
    100,
    cache_dir=f'embed_cache_{COUNTRY}',
)

# Step 2: multinomial logistic regression on top of the embeddings.
classifier = LogisticRegression(
    C=5.,
    solver='newton-cg',
    multi_class="multinomial",
    n_jobs=-1,
)

model = Pipeline([
    ('sentencespace_100', embedder),
    ('lr', classifier),
])

# Kept as the last expression so the fitted pipeline repr is displayed.
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('sentencespace_100', PreEmbeddedVectorizer(cache_dir=None, chunk_size=1000, dims=100,
           model='./ss_models/sentencespace_100_india/model')), ('lr', LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=-1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [5]:
def _get_soc_n(df, n):
    return (df.T
            .reset_index()
            .pipe(lambda df: df.assign(soc = df['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            .drop('index', 1)
            .groupby('soc').sum().T)


def get_pred(model, X, n=3):
    """Predict class probabilities and aggregate them by code prefix.

    Args:
        model: Fitted estimator exposing ``predict_proba`` and ``classes_``.
        X: Inputs passed straight to ``model.predict_proba``.
        n: Number of leading characters of each class label to aggregate on.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        A frame with one row per input and one column per ``n``-character
        prefix, holding the summed probability of all classes sharing it.
    """
    probabilities = pd.DataFrame(model.predict_proba(X), columns=model.classes_)
    return _get_soc_n(probabilities, n)
    
def make_title_lookup(path, N):
    """Build a tolerant function that maps a SOC code to its description.

    Args:
        path: Currently unused; ``get_dictionary`` is always called with ``''``.
            NOTE(review): confirm whether ``path`` was meant to be forwarded.
        N: Selects the ``desc_soc{N}`` description column.

    Returns:
        A callable ``code -> description``. The code is coerced with ``int``
        before the lookup (the dictionary is presumably keyed by integer SOC
        codes). If the code is unknown, or cannot be converted to an integer,
        the code itself is returned unchanged.
    """
    dot_codes = get_dictionary('', N).groupby('soc').first()
    d = dot_codes[f'desc_soc{N}'].to_dict()

    def lookup(code):
        try:
            return d[int(code)]
        except (KeyError, ValueError, TypeError):
            # KeyError: code not in the dictionary.
            # ValueError/TypeError: code is not int-convertible (e.g. a
            # non-numeric string or None). Fall back to the raw code in all
            # cases rather than aborting a whole Series.map.
            return code

    return lookup

In [6]:
from s3fs import S3FileSystem

# S3FileSystem is constructed with no arguments here.
fs = S3FileSystem()
# The scrape is read as JSON lines (one record per line, lines=True).
# NOTE(review): the path is hard-coded to the India crawl, whereas other
# paths in this notebook are derived from COUNTRY.
with fs.open('s3://oecd-scraping/indeed-india-v1/2018-08-02T09-00-19.jl') as f:
    raw = pd.read_json(f, lines=True)

In [7]:
from embed_software.preprocess import claims_processor, readme_processor, Preprocessor
# Text cleaner applied to every job description.
# NOTE(review): the meaning of the second argument (4) is not visible here --
# confirm against embed_software.preprocess.Preprocessor.
preprocessor = Preprocessor(readme_processor, 4)
string_processor = preprocessor.process

# URL normalisation is currently disabled for the scraped data:
# raw['url'] = raw.url.map(remove_qs)

# Keep one row per URL, then derive the cleaned text column in the same chain.
raw = (
    raw
    .drop_duplicates(subset=['url'])
    .assign(content=lambda posts: posts.description.map(string_processor))
)

In [8]:
# Company records to be matched against the scrape; joined on `url` in the
# next cell. URLs are used exactly as stored in the file.
company = pd.read_csv(f'matched-company/{COUNTRY}_url_only.csv')

In [9]:
# Left join: every `company` row is kept; rows whose URL has no match in
# `raw` get NaN in the scraped columns (including `content`).
df = pd.merge(company, raw, how='left', on='url')

In [10]:
# Boolean mask of rows that have text to classify.
to_predict = df['content'].notna()

In [11]:
# Score only the rows that actually have content.
preds = get_pred(model, df.loc[to_predict, 'content'])

In [12]:
import numpy as np

# Column position of the highest aggregated probability in each row.
chosen = preds.values.argmax(axis=1)
# Translate positions into the corresponding column labels (SOC prefixes).
soc = pd.Series(chosen).map(lambda i: preds.columns[i])

# Level 3 here matches the 3-character prefixes produced by get_pred.
lookup = make_title_lookup('', 3)
desc = soc.map(lookup)

# Probability of the winning prefix for each row.
prob = preds.max(axis=1)

In [13]:
# Write the results back only on the rows that were scored. `.values` is used
# so assignment is positional rather than aligned on index labels.
for column, series in (
    ('prediction_code', soc),
    ('prediction_description', desc),
    ('prediction_confidence', prob),
):
    df.loc[to_predict, column] = series.values

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [15]:
import csv

# Export the matched rows with their predictions as a fully-quoted TSV.
# `content` (the preprocessed text) is dropped from the output. `columns=` is
# used because pandas 2.0+ no longer accepts `axis` as a positional argument
# to DataFrame.drop.
(df
 .drop(columns=['content'])
 # `title_x` is the suffix pandas gives the left-hand `title` column when
 # both merged frames contain one.
 .rename(columns={'title_x': 'title'})
 .to_csv(f'matched-company/{COUNTRY}_indeed_matched123_predicted.csv',
         index=False,
         sep='\t',
         quoting=csv.QUOTE_ALL))