# Indeed Machine Learning CodeSprint

Load the important packages:

In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk

## Data loading

Load the training data:

In [2]:
import csv

def load_train_data(filename):
    """Read the tab-separated training file into two parallel arrays.

    The first row is treated as a header and skipped. In every following
    row, column 0 holds the whitespace-separated tags and column 1 holds
    the text those tags belong to.

    Parameters
    ----------
    filename : str
        Path of the TSV file to read.

    Returns
    -------
    X : numpy.ndarray
        1-D array with the text of each row.
    y : numpy.ndarray
        1-D object array; element ``i`` is the list of tags of row ``i``
        (an empty list when the tag column is blank).
    """
    X = []
    tag_lists = []

    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')

        # ignore header row
        next(reader, None)

        for row in reader:
            X.append(row[1])
            tag_lists.append(row[0].split())

    # Fill the label array element by element. np.array(list_of_lists)
    # produces a 2-D string array when every row happens to have the same
    # number of tags, and recent NumPy releases reject ragged input; an
    # explicit object array always holds exactly one tag list per sample.
    y = np.empty(len(tag_lists), dtype=object)
    for index, tags in enumerate(tag_lists):
        y[index] = tags

    return np.array(X), y

# Load the training texts (X) and their tag lists (y) from the training TSV file.
X, y = load_train_data('../data/train.tsv')

Show some input and output data:

In [3]:
# Preview the first three samples. Every print(...) call takes exactly one
# argument, so it prints the same text whether it is parsed as a Python 2
# print statement or as the Python 3 print function.
for i in range(3):
    print('Data #{}'.format(i))
    print('')
    print('Input: {}'.format(X[i]))
    print('')
    print('Output: {}'.format(y[i]))
    print('')

Data #0

Input: THE COMPANY    Employer is a midstream service provider to the onshore Oil and Gas markets.  It is a a fast growing filtration technology company providing environmentally sound solutions to the E&P’s for water and drilling fluids management and recycling.    THE POSITION    The North Dakota Regional Technical Sales Representative reports directly to the VP of Sales and covers a territory that includes North Dakota and surrounding areas of South Dakota, Wyoming and Montana.  Specific duties for this position include but are not limited to:     Building sales volume within the established territory from existing and new accounts   Set up and maintain a strategic sales plan for the territory   Present technical presentations, product demonstrations & training   Maintain direct contact with customers, distributors and representatives   Prospect new customer contacts and referrals   Gather and record customer & competitor information   Provide accurate and updated fo

## Preprocessing definition

Define input data preprocessor as bag-of-words and tf-idf feature extraction:

- `CountVectorizer`: Transforms text to vector of occurrences for each word found in training set (bag-of-words representation).
- `TfidfTransformer`: Re-weights the bag-of-words counts with tf-idf (term frequency × inverse document frequency), so that words appearing in many documents contribute less to the final representation.

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Text feature extraction, applied in order to the raw input strings.
X_preprocessor = Pipeline(steps=[
    ('count', CountVectorizer()),   # bag-of-words token counts
    ('tfidf', TfidfTransformer()),  # tf-idf re-weighting of those counts
])

Define a multi-label binarizer for the output data. Each target sample becomes a binary array with one entry per tag: 1 if the tag is present, 0 otherwise.

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

# Converts each sample's collection of tags to a binary indicator row and back.
y_preprocessor = MultiLabelBinarizer()

## Classifier definition

Define classifier as SVM with one-vs-all strategy for multilabel classification.

In [6]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# One independent linear SVM per tag (one-vs-rest), wrapped in a single-step
# pipeline.
clf = Pipeline(steps=[
    ('svc', OneVsRestClassifier(estimator=LinearSVC())),
])

## Model evaluation

Calculate the micro-averaged F1 score with 5-fold cross-validation:

In [7]:
from sklearn.model_selection import KFold
from sklearn import metrics

# Micro-averaged F1 score of each validation fold.
scores = []
k_fold = KFold(n_splits=5)

for train, validation in k_fold.split(X):
    X_train, X_validation, y_train, y_validation = X[train], X[validation], y[train], y[validation]

    # Fit the preprocessors on the training fold only, then apply the
    # fitted transformation to the validation fold.
    X_train_processed = X_preprocessor.fit_transform(X_train)
    X_validation_preprocessed = X_preprocessor.transform(X_validation)

    y_train_processed = y_preprocessor.fit_transform(y_train)
    y_validation_processed = y_preprocessor.transform(y_validation)

    clf.fit(X_train_processed, y_train_processed)
    y_output = clf.predict(X_validation_preprocessed)

    # The metric is called through the `metrics` module so that the name
    # `f1_score` below refers only to the resulting number and never
    # shadows the scoring function.
    scores.append(metrics.f1_score(y_validation_processed, y_output, average='micro'))

# Mean score over all folds; this value is reused later in the notebook to
# build the file name of the saved classifier.
f1_score = np.mean(scores)

print('F1 score: {0:.3f}'.format(f1_score))

F1 score: 0.461


## Model usage

Load the training and test data:

In [8]:
def load_test_data(filename):
    """Read the first column of a tab-separated file into an array.

    The first row is treated as a header and skipped.

    Parameters
    ----------
    filename : str
        Path of the TSV file to read.

    Returns
    -------
    numpy.ndarray
        1-D array with the first column of every data row, in file order.
        A blank line is returned as an empty string so that positions in
        the array keep matching the rows of the file.
    """
    with open(filename) as fd:
        reader = csv.reader(fd, delimiter='\t')
        next(reader, None)  # ignore header row
        # csv.reader yields an empty list for a blank line; indexing it
        # would raise IndexError, so map such rows to ''.
        X = [row[0] if row else '' for row in reader]

    return np.array(X)

# Reload the complete training set and read the test texts. X_train and
# y_train replace the fold-local values left over from cross-validation.
X_train, y_train = load_train_data('../data/train.tsv')
X_test = load_test_data('../data/test.tsv')

Train the model with all training data:

In [9]:
# Fit the text preprocessor on the full training set, then reuse the fitted
# transformation for the test texts.
X_train_processed = X_preprocessor.fit_transform(X_train)
X_test_processed = X_preprocessor.transform(X_test)
# Fit the label binarizer on every training tag list.
y_train_processed = y_preprocessor.fit_transform(y_train)
# Last expression of the cell: the fitted pipeline is shown as the cell output.
clf.fit(X_train_processed, y_train_processed)

Pipeline(steps=[('svc', OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))])

Predict output from test data:

In [10]:
# Predict the binary tag indicators for the test texts.
y_output_processed = clf.predict(X_test_processed)
# Map every indicator row back to its tag names. The loop variable is not
# called `y` on purpose: in Python 2 a list-comprehension variable leaks into
# the enclosing scope and would overwrite the global training labels `y`.
y_output = [list(predicted_tags) for predicted_tags in y_preprocessor.inverse_transform(y_output_processed)]

Show some output data:

In [11]:
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(y_output[:10])

[['licence-needed'], ['hourly-wage'], [], [], ['2-4-years-experience-needed'], [], [], ['2-4-years-experience-needed', 'bs-degree-needed'], ['bs-degree-needed'], []]


Save output data:

In [12]:
def save_output(filename, output):
    """Write the predicted tags to ``filename``, one sample per line.

    The file starts with a ``tags`` header line. Each following line holds
    the space-separated tags of one sample (an empty line when the sample
    has no tags). No newline is written after the last sample.
    """
    with open(filename, 'w') as fd:
        fd.write('tags\n')
        # Joining with '\n' puts a separator between samples only, so the
        # file does not end with a trailing newline.
        fd.write('\n'.join(' '.join(sample_tags) for sample_tags in output))
            
# Write the predicted tags of the test set to the output file.
save_output('../data/tags.tsv', y_output)

Save preprocessors and model:

In [13]:
import pickle

def save(filename, obj):
    """Pickle ``obj`` into the file ``filename``, replacing any existing file.

    The file is opened in binary mode, which pickle requires on Python 3
    and which avoids newline translation on Windows, and it is closed
    deterministically once the dump has finished.
    """
    with open(filename, 'wb') as fd:
        pickle.dump(obj, fd)

# Persist both fitted preprocessors and the fitted classifier. The
# classifier's file name embeds the cross-validated F1 score, formatted with
# three decimals.
save('../models/X_preprocessor.pickle', X_preprocessor)
save('../models/y_preprocessor.pickle', y_preprocessor)
save('../models/clf_{0:.3f}_f1_score.pickle'.format(f1_score), clf)

## Load saved model

In [14]:
def load(filename):
    """Return the object stored in the pickle file ``filename``.

    The file is opened in binary mode, which pickle requires on Python 3,
    and it is closed as soon as the object has been read.

    WARNING: unpickling can execute arbitrary code. Only load files that
    were produced by a trusted source.
    """
    with open(filename, 'rb') as fd:
        return pickle.load(fd)

# Restore the classifier and both preprocessors from disk.
# NOTE(review): the classifier file name hard-codes the 0.461 score, while the
# file is saved under the score computed at run time — the file is only found
# if cross-validation produced exactly that rounded value; confirm before
# re-running.
clf = load('../models/clf_0.461_f1_score.pickle')
X_preprocessor = load('../models/X_preprocessor.pickle')
y_preprocessor = load('../models/y_preprocessor.pickle')