# Indeed Machine Learning CodeSprint

Load the important packages:

In [25]:
import numpy as np
import pandas as pd
import sklearn
import nltk

## Data loading

Load the training data:

In [26]:
import csv

def load_train_data(filename):
    X = []
    y = []
    
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')

        # ignore header row
        next(reader, None)
        
        for row in reader:
            X.append(row[1])
            y.append(row[0].split())

    return np.array(X), np.array(y)

X, y = load_train_data('../data/train.tsv')

Show some input and output data:

In [27]:
print 'Input:', X[0]
print
print 'Output:', y[0]

Input: THE COMPANY    Employer is a midstream service provider to the onshore Oil and Gas markets.  It is a a fast growing filtration technology company providing environmentally sound solutions to the E&P’s for water and drilling fluids management and recycling.    THE POSITION    The North Dakota Regional Technical Sales Representative reports directly to the VP of Sales and covers a territory that includes North Dakota and surrounding areas of South Dakota, Wyoming and Montana.  Specific duties for this position include but are not limited to:     Building sales volume within the established territory from existing and new accounts   Set up and maintain a strategic sales plan for the territory   Present technical presentations, product demonstrations & training   Maintain direct contact with customers, distributors and representatives   Prospect new customer contacts and referrals   Gather and record customer & competitor information   Provide accurate and updated forecasts f

## Preprocessing definition

Define input data preprocessor as bag-of-words and tf-idf feature extraction:

- `CountVectorizer`: Transforms text to vector of occurrences for each word found in training set (bag-of-words representation).
- `TfidfTransformer`: Transforms bag-of-words to its relative frequency, removing too frequent or rare words from the final representation.

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

X_preprocessor = Pipeline([
    ('count', CountVectorizer(max_df=0.95, min_df=2)),
    ('tfidf', TfidfTransformer())
])

Define multi-label binarizer for output data. Each target sample will be a binary array: 0 if not present, 1 otherwise.

In [29]:
from sklearn.preprocessing import MultiLabelBinarizer

y_preprocessors = {
    'job': MultiLabelBinarizer(),
    'wage': MultiLabelBinarizer(),
    'degree': MultiLabelBinarizer(),
    'experience': MultiLabelBinarizer(),
    'supervising': MultiLabelBinarizer()
}

## Classifier definition

Define classifier as SVM with one-vs-all strategy for multilabel classification.

In [30]:
# F1 score: 0.422
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

models = {
    'job': OneVsRestClassifier(LinearSVC()),
    'wage': OneVsRestClassifier(LinearSVC()),
    'degree': OneVsRestClassifier(LinearSVC()),
    'experience': OneVsRestClassifier(LinearSVC()),
    'supervising': OneVsRestClassifier(LinearSVC())
}

## Model evaluation

In [31]:
# Separate targets for mutually exclusive tags
def split_exclusive_tags(y):
    split_y = {
        'job': [],
        'wage': [],
        'degree': [],
        'experience': [],
        'supervising': []
    }
    
    for target in y:
        split_y['job'].append(filter(lambda x: x in ['part-time-job', 'full-time-job'], target))
        split_y['wage'].append(filter(lambda x: x in ['hourly-wage', 'salary'], target))
        split_y['degree'].append(filter(lambda x: x in ['associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed'], target))
        split_y['experience'].append(filter(lambda x: x in ['1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed'], target))
        split_y['supervising'].append(filter(lambda x: x in ['supervising-job'], target))
        
    return split_y

In [32]:
def fit_models(models, X_preprocessor, y_preprocessors, X, y):
    print 'Fitting models'
    split_y = split_exclusive_tags(y)

    for tag_type, model in models.items():
        # Learn one preprocessor for each mutually exclusive tag
        X_processed = X_preprocessor.fit_transform(X)
        y_processed = y_preprocessors[tag_type].fit_transform(split_y[tag_type])
        
        # Learn one model for each mutually exclusive tag
        model.fit(X_processed, y_processed)

In [33]:
def predict_models(models, X_preprocessor, y_preprocessors, X):
    print 'Predicting with models'
    
    output = [[] for _ in X]
    
    for tag_type, model in models.items():
        # Preprocess and use model for the given type of tag
        X_processed = X_preprocessor.transform(X)
        model_output = model.predict(X_processed)
        
        # Adjust in case the output is only 0 or 1 instead of an array
        if len(model_output.shape) == 1:
            model_output = model_output.tolist()
            for i in range(len(model_output)):
                model_output[i] = [model_output[i]]
            model_output = np.array(model_output)
        
        tag_type_output = y_preprocessors[tag_type].inverse_transform(model_output)

        # Aggregate outputs for all types of tags in the same array
        for i, out in enumerate(tag_type_output):
            if type(out) in [list, tuple]:
                output[i].extend(out)
            else:
                output[i].append(out)

    return output

In [34]:
def calculate_f1_score(y_test, y_output):
    print 'Calculating F1 score'
    
    tags = ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed',
            'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed',
            '5-plus-years-experience-needed', 'supervising-job']

    true_positive = np.array([0.0 for _ in tags])
    true_negative = np.array([0.0 for _ in tags])
    false_positive = np.array([0.0 for _ in tags])
    false_negative = np.array([0.0 for _ in tags])
    
    for target, output in zip(y_test, y_output):
        for i, tag in enumerate(tags):
            if tag in target and tag in output:
                true_positive[i] += 1
            elif tag not in target and tag not in output:
                true_negative[i] += 1
            elif tag in target and tag not in output:
                false_negative[i] += 1
            elif tag not in target and tag in output:
                false_positive[i] += 1
            else:
                raise Exception('Unknown situation - tag: {} target: {} output: {}'.format(tag, target, output))
                
    tags_precision = np.array([0.0 for _ in tags])
    tags_recall = np.array([0.0 for _ in tags])
    tags_f1_score = np.array([0.0 for _ in tags])
    
    for i, tag in enumerate(tags):
        tags_precision[i] = true_positive[i] / (true_positive[i] + false_positive[i])
        tags_recall[i] = true_positive[i] / (true_positive[i] + false_negative[i])
        tags_f1_score[i] = 2*tags_precision[i]*tags_recall[i] / (tags_precision[i] + tags_recall[i])
        
    min_tags_precision = np.argmin(tags_precision)
    min_tags_recall = np.argmin(tags_recall)
    min_tags_f1_score = np.argmin(tags_f1_score)
    
    print 'Tags:', tags
    print 'Precision:', tags_precision
    print 'Recall:', tags_recall
    print 'F1 score:', tags_f1_score
    print
    
    print 'Worst precision:', tags[min_tags_precision]
    print 'Worst recall:', tags[min_tags_recall]
    print 'Worst F1 score:', tags[min_tags_f1_score]
    print
        
    precision = np.sum(true_positive) / (np.sum(true_positive) + np.sum(false_positive))
    recall = np.sum(true_positive) / (np.sum(true_positive) + np.sum(false_negative))
    f1_score = 2*precision*recall / (precision + recall)
    
    print 'General:'
    print 'Precision: {0:.3f}'.format(precision)
    print 'Recall: {0:.3f}'.format(recall)
    print 'F1 score: {0:.3f}'.format(f1_score)
    
    return f1_score

Calculate F1 score with cross-validation:

In [35]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

scores = []
k_fold = KFold(n_splits=5)

for i, (train, validation) in enumerate(k_fold.split(X)):
    X_train, X_validation, y_train, y_validation = X[train], X[validation], y[train], y[validation]

    fit_models(models, X_preprocessor, y_preprocessors, X_train, y_train)
    y_output = predict_models(models, X_preprocessor, y_preprocessors, X_validation)
    
    score = calculate_f1_score(y_validation, y_output)
    scores.append(score)
    print '#{}: {}'.format(i, score)
    print
    
f1_score = np.mean(scores)
    
print 'Total F1 score: {0:.3f}'.format(f1_score)

Fitting models
Predicting with models
Calculating F1 score
Tags: ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed', 'supervising-job']
Precision: [ 0.625       0.67948718  0.75        0.65333333         nan  0.77294686
  1.          0.66666667  0.77777778  0.53488372  0.56043956  0.7254902 ]
Recall: [ 0.2173913   0.31736527  0.34177215  0.33793103  0.          0.64
  0.04545455  0.36190476  0.12068966  0.27710843  0.34931507  0.43786982]
F1 score: [ 0.32258065  0.43265306  0.46956522  0.44545455         nan  0.70021882
  0.08695652  0.4691358   0.20895522  0.36507937  0.43037975  0.54612546]

Worst precision: associate-needed
Worst recall: associate-needed
Worst F1 score: associate-needed

General:
Precision: 0.673
Recall: 0.363
F1 score: 0.471
#0: 0.471359860079

Fitting models




Predicting with models
Calculating F1 score
Tags: ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed', 'supervising-job']
Precision: [ 0.88235294  0.59782609  0.82926829  0.64912281  0.33333333  0.74556213
         nan  0.69387755  0.          0.52941176  0.66216216  0.65934066]
Recall: [ 0.24590164  0.3125      0.45333333  0.29365079  0.02941176  0.63        0.
  0.30088496  0.          0.29166667  0.34507042  0.38461538]
F1 score: [ 0.38461538  0.41044776  0.5862069   0.40437158  0.05405405  0.68292683
         nan  0.41975309         nan  0.3761194   0.4537037   0.48582996]

Worst precision: ms-or-phd-needed
Worst recall: ms-or-phd-needed
Worst F1 score: ms-or-phd-needed

General:
Precision: 0.660
Recall: 0.343
F1 score: 0.451
#1: 0.451213707758

Fitting models




Predicting with models
Calculating F1 score
Tags: ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed', 'supervising-job']
Precision: [ 0.85185185  0.6627907   0.86666667  0.72727273         nan  0.77348066
         nan  0.59259259  1.          0.49640288  0.65714286  0.68571429]
Recall: [ 0.31506849  0.33333333  0.39795918  0.25        0.          0.68627451
  0.          0.36781609  0.03225806  0.33333333  0.36507937  0.51798561]
F1 score: [ 0.46        0.44357977  0.54545455  0.37209302         nan  0.72727273
         nan  0.45390071  0.0625      0.39884393  0.46938776  0.59016393]

Worst precision: associate-needed
Worst recall: associate-needed
Worst F1 score: associate-needed

General:
Precision: 0.680
Recall: 0.379
F1 score: 0.486
#2: 0.486460807601

Fitting models
Predicting with models
Calculating F1 s

## Model usage

Load the data:

In [36]:
def load_test_data(filename):
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')
        next(reader, None) # ignore header row
        X = [row[0] for row in reader]

    return np.array(X)

X_train, y_train = load_train_data('../data/train.tsv')
X_test = load_test_data('../data/test.tsv')

Train the model with all training data:

In [37]:
fit_models(models, X_preprocessor, y_preprocessors, X_train, y_train)

Fitting models


Predict output from test data:

In [38]:
y_output = predict_models(models, X_preprocessor, y_preprocessors, X_validation)

Predicting with models


Show some output data:

In [39]:
print y_output[:10]

[[], ['salary', 'supervising-job', '2-4-years-experience-needed', 'bs-degree-needed'], ['hourly-wage', 'full-time-job'], ['part-time-job'], ['5-plus-years-experience-needed'], [], ['hourly-wage', 'part-time-job'], ['hourly-wage', 'full-time-job'], ['hourly-wage'], []]


Save output data:

In [40]:
def save_output(filename, output):
    with open(filename, 'w') as fd:
        fd.write('tags\n')
        
        for i, tags in enumerate(output):
            fd.write(' '.join(tags))
            
            if i < len(output) - 1:
                fd.write('\n')
            
save_output('../data/tags.tsv', y_output)

Save preprocessors and model:

In [41]:
import pickle

def save(filename, obj):
    pickle.dump(obj, open(filename, 'w'))

save('../models/X_preprocessor.pickle', X_preprocessor)
save('../models/y_preprocessor.pickle', y_preprocessors)
save('../models/clf_{0:.3f}_f1_score.pickle'.format(f1_score), models)

## Load saved model

In [42]:
def load(filename):
    return pickle.load(open(filename))

models = load('../models/clf_0.461_f1_score.pickle')
X_preprocessors = load('../models/X_preprocessor.pickle')
y_preprocessors = load('../models/y_preprocessor.pickle')