# Indeed Machine Learning CodeSprint

Load the important packages:

In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

## Data loading

Load the training data:

In [2]:
import csv

def load_train_data(filename):
    X = []
    y = []
    
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')

        # ignore header row
        next(reader, None)
        
        for row in reader:
            X.append(row[1])
            y.append(row[0].split())

    return np.array(X), np.array(y)

X, y = load_train_data('../data/train.tsv')

Show some input and output data:

In [3]:
print 'Input:', X[0]
print
print 'Output:', y[0]

Input: THE COMPANY    Employer is a midstream service provider to the onshore Oil and Gas markets.  It is a a fast growing filtration technology company providing environmentally sound solutions to the E&P’s for water and drilling fluids management and recycling.    THE POSITION    The North Dakota Regional Technical Sales Representative reports directly to the VP of Sales and covers a territory that includes North Dakota and surrounding areas of South Dakota, Wyoming and Montana.  Specific duties for this position include but are not limited to:     Building sales volume within the established territory from existing and new accounts   Set up and maintain a strategic sales plan for the territory   Present technical presentations, product demonstrations & training   Maintain direct contact with customers, distributors and representatives   Prospect new customer contacts and referrals   Gather and record customer & competitor information   Provide accurate and updated forecasts f

## Preprocessing definition

Define input data preprocessor as bag-of-words and tf-idf feature extraction:

- `CountVectorizer`: Transforms text to vector of occurrences for each word found in training set (bag-of-words representation).
- `TfidfTransformer`: Transforms bag-of-words to its relative frequency, removing too frequent or rare words from the final representation.

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

X_preprocessor = Pipeline([
    ('count', CountVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer())
])

Define multi-label binarizer for output data. Each target sample will be a binary array: 0 if not present, 1 otherwise.

In [5]:
from sklearn.preprocessing import LabelEncoder

y_preprocessors = {
    'job': LabelEncoder(),
    'wage': LabelEncoder(),
    'degree': LabelEncoder(),
    'experience': LabelEncoder(),
    'supervising': LabelEncoder()
}

## Classifier definition

Define classifier as SVM with one-vs-all strategy for multilabel classification.

In [6]:
# F1 score: 0.422
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

models = {
    'job': OneVsRestClassifier(LinearSVC()),
    'wage': OneVsRestClassifier(LinearSVC()),
    'degree': OneVsRestClassifier(LinearSVC()),
    'experience': OneVsRestClassifier(LinearSVC()),
    'supervising': OneVsRestClassifier(LinearSVC())
}

## Model evaluation

In [7]:
# Separate targets for mutually exclusive tags
def split_exclusive_tags(y):
    split_y = {
        'job': [],
        'wage': [],
        'degree': [],
        'experience': [],
        'supervising': []
    }
    
    for target in y:
        split_y['job'].append(filter(lambda x: x in ['part-time-job', 'full-time-job'], target))
        split_y['wage'].append(filter(lambda x: x in ['hourly-wage', 'salary'], target))
        split_y['degree'].append(filter(lambda x: x in ['associate-needed', 'bs-degree-needed', 'ms-or-phd-needed', 'licence-needed'], target))
        split_y['experience'].append(filter(lambda x: x in ['1-year-experience-needed', '2-4-years-experience-needed', '5-plus-years-experience-needed'], target))
        split_y['supervising'].append(filter(lambda x: x in ['supervising-job'], target))
        
    return split_y

In [8]:
# Calculate the number of label occurrences for each tag type
def calculate_tag_type_count(split_y, tag_type):
    count = {}
    
    for y_type in split_y[tag_type]:
        y_type = y_type[0] if y_type else ''
        
        if y_type not in count:
            count[y_type] = 0

        count[y_type] += 1

    return count

In [9]:
# Generate stratified indicies considering the minimum number of label occurrences for each tag type
# For instance, given the data y = [0, 1, 1, 2, 3, 3, 3, 3, 2, 2, 1, 0], the minimum number of label occurrences is 2
# for label "0". Hence, the output will be something like [1, 2, 3, 4, 5, 6, 9, 11], where each class will have only
# two occurrences.
def stratify_classes(X, split_y, tag_type):
    tag_type_count = calculate_tag_type_count(split_y, tag_type)
    min_tag_type_count = min([count for _, count in tag_type_count.items()])
    
    stratified_indices = []
    stratification_count = {}
    
    for i, y in enumerate(split_y[tag_type]):
        index = y[0] if y else ''
        
        if index not in stratification_count:
            stratification_count[index] = 0
        else:
            stratification_count[index] += 1
        
        if stratification_count[index] < min_tag_type_count:
            stratified_indices.append(i)
    
    stratified_X = X[stratified_indices]
    stratified_y = np.array([y for i, y in enumerate(split_y[tag_type]) if i in stratified_indices])
    
    return stratified_X, stratified_y

In [10]:
def fit_models(models, X_preprocessor, y_preprocessors, X, y):
    print 'Fitting models'
    split_y = split_exclusive_tags(y)

    X_processed = X_preprocessor.fit_transform(X)
    
    for tag_type, model in models.items():
        stratified_X, stratified_y = stratify_classes(X, split_y, tag_type)
        
        # Learn one preprocessor for each mutually exclusive tag
        X_processed = X_preprocessor.transform(stratified_X)
        y_processed = y_preprocessors[tag_type].fit_transform(stratified_y)
        
        # Learn one model for each mutually exclusive tag
        model.fit(X_processed, y_processed)

In [11]:
def predict_models(models, X_preprocessor, y_preprocessors, X):
    print 'Predicting with models'
    
    output = [[] for _ in X]
    
    for tag_type, model in models.items():
        # Preprocess and use model for the given type of tag
        X_processed = X_preprocessor.transform(X)
        model_output = model.predict(X_processed)
        
        tag_type_output = y_preprocessors[tag_type].inverse_transform(model_output)

        # Aggregate outputs for all types of tags in the same array
        for i, out in enumerate(tag_type_output):
            if type(out) in [list, tuple]:
                output[i].extend(out)
            else:
                output[i].append(out)

    return output

In [12]:
def calculate_f1_score(y_test, y_output):
    print 'Calculating F1 score'
    
    tags = ['part-time-job', 'full-time-job', 'hourly-wage', 'salary', 'associate-needed', 'bs-degree-needed',
            'ms-or-phd-needed', 'licence-needed', '1-year-experience-needed', '2-4-years-experience-needed',
            '5-plus-years-experience-needed', 'supervising-job']

    true_positive = np.array([0.0 for _ in tags])
    true_negative = np.array([0.0 for _ in tags])
    false_positive = np.array([0.0 for _ in tags])
    false_negative = np.array([0.0 for _ in tags])
    
    for target, output in zip(y_test, y_output):
        for i, tag in enumerate(tags):
            if tag in target and tag in output:
                true_positive[i] += 1
            elif tag not in target and tag not in output:
                true_negative[i] += 1
            elif tag in target and tag not in output:
                false_negative[i] += 1
            elif tag not in target and tag in output:
                false_positive[i] += 1
            else:
                raise Exception('Unknown situation - tag: {} target: {} output: {}'.format(tag, target, output))
                
    tags_precision = np.array([0.0 for _ in tags])
    tags_recall = np.array([0.0 for _ in tags])
    tags_f1_score = np.array([0.0 for _ in tags])
    
    for i, tag in enumerate(tags):
        tags_precision[i] = true_positive[i] / (true_positive[i] + false_positive[i])
        tags_recall[i] = true_positive[i] / (true_positive[i] + false_negative[i])
        tags_f1_score[i] = 2*tags_precision[i]*tags_recall[i] / (tags_precision[i] + tags_recall[i])
        
    min_tags_precision = np.argmin(tags_precision)
    min_tags_recall = np.argmin(tags_recall)
    min_tags_f1_score = np.argmin(tags_f1_score)
    
    print
    print '{:30s} | {:5s} | {:5s} | {:5s}'.format('Tag', 'Prec.', 'Rec. ', 'F1')
    for i in range(len(tags)):
        print '{:30s} | {:.3f} | {:.3f} | {:.3f}'.format(
            tags[i], tags_precision[i], tags_recall[i], tags_f1_score[i])
    print
    
    print 'Worst precision:', tags[min_tags_precision]
    print 'Worst recall:', tags[min_tags_recall]
    print 'Worst F1 score:', tags[min_tags_f1_score]
    print
        
    precision = np.sum(true_positive) / (np.sum(true_positive) + np.sum(false_positive))
    recall = np.sum(true_positive) / (np.sum(true_positive) + np.sum(false_negative))
    f1_score = 2*precision*recall / (precision + recall)
    
    print 'General:'
    print 'Precision: {:.3f}'.format(precision)
    print 'Recall: {:.3f}'.format(recall)
    print 'F1 score: {:.3f}'.format(f1_score)
    
    return f1_score

Calculate F1 score with cross-validation:

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

scores = []
k_fold = KFold(n_splits=5)

for i, (train, validation) in enumerate(k_fold.split(X)):
    X_train, X_validation, y_train, y_validation = X[train], X[validation], y[train], y[validation]

    fit_models(models, X_preprocessor, y_preprocessors, X_train, y_train)
    y_output = predict_models(models, X_preprocessor, y_preprocessors, X_validation)
    
    score = calculate_f1_score(y_validation, y_output)
    scores.append(score)
    print '#{0} F1 score: {1:.3f}'.format(i, score)
    print
    
f1_score = np.mean(scores)
    
print 'Total F1 score: {0:.3f}'.format(f1_score)

Fitting models
Predicting with models
Calculating F1 score

Tag                            | Prec. | Rec.  | F1   
part-time-job                  | 0.283 | 0.739 | 0.410
full-time-job                  | 0.315 | 0.599 | 0.413
hourly-wage                    | 0.418 | 0.772 | 0.542
salary                         | 0.382 | 0.779 | 0.512
associate-needed               | 0.148 | 0.500 | 0.228
bs-degree-needed               | 0.624 | 0.584 | 0.603
ms-or-phd-needed               | 0.215 | 0.636 | 0.322
licence-needed                 | 0.362 | 0.600 | 0.452
1-year-experience-needed       | 0.155 | 0.517 | 0.239
2-4-years-experience-needed    | 0.443 | 0.297 | 0.356
5-plus-years-experience-needed | 0.376 | 0.623 | 0.469
supervising-job                | 0.469 | 0.757 | 0.579

Worst precision: associate-needed
Worst recall: 2-4-years-experience-needed
Worst F1 score: associate-needed

General:
Precision: 0.367
Recall: 0.592
F1 score: 0.453
#0 F1 score: 0.453

Fitting models
Predicting with models


## Model usage

Load the data:

In [14]:
def load_test_data(filename):
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')
        next(reader, None) # ignore header row
        X = [row[0] for row in reader]

    return np.array(X)

X_train, y_train = load_train_data('../data/train.tsv')
X_test = load_test_data('../data/test.tsv')

Train the model with all training data:

In [15]:
fit_models(models, X_preprocessor, y_preprocessors, X_train, y_train)

Fitting models


Predict output from test data:

In [16]:
y_output = predict_models(models, X_preprocessor, y_preprocessors, X_validation)

Predicting with models


Show some output data:

In [17]:
print y_output[:10]

[['hourly-wage', 'part-time-job', 'supervising-job', '1-year-experience-needed'], ['supervising-job', '5-plus-years-experience-needed', 'bs-degree-needed'], ['hourly-wage', 'part-time-job', 'bs-degree-needed'], ['hourly-wage', 'part-time-job'], ['2-4-years-experience-needed', 'bs-degree-needed'], ['hourly-wage', 'part-time-job'], ['hourly-wage', 'part-time-job', '1-year-experience-needed'], ['hourly-wage', '5-plus-years-experience-needed'], ['hourly-wage'], []]


Save output data:

In [18]:
def save_output(filename, output):
    with open(filename, 'w') as fd:
        fd.write('tags\n')
        
        for i, tags in enumerate(output):
            fd.write(' '.join(tags))
            
            if i < len(output) - 1:
                fd.write('\n')
            
save_output('../data/tags.tsv', y_output)

Save preprocessors and model:

In [19]:
import pickle

def save(filename, obj):
    pickle.dump(obj, open(filename, 'w'))

save('../models/X_preprocessor.pickle', X_preprocessor)
save('../models/y_preprocessor.pickle', y_preprocessors)
save('../models/clf_{0:.3f}_f1_score.pickle'.format(f1_score), models)

## Load saved model

In [20]:
def load(filename):
    return pickle.load(open(filename))

models = load('../models/clf_0.461_f1_score.pickle')
X_preprocessors = load('../models/X_preprocessor.pickle')
y_preprocessors = load('../models/y_preprocessor.pickle')