In [67]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import pickle

## Basic model

Explain how load_files extract the individual datasets and categories from the folder structure

In [2]:
job_descriptions = load_files('data/training')

Exaplin the struscture returned by load_files

In [3]:
job_descriptions.data[0]

b"Software Developer- C#, Visual Studio, .Net, MVC- Chorley\n\nAbout the role\n\nRapid growth, expansion into new areas and demand from our customers has led to the need to expand our in-house software development team. We are now seeking a Software Developer to join the team. You will be responsible for playing a critical role in the development of new and existing applications and integrations in an agile environment.\n\nAbout Capita- Parking Eye\n\nWe are Capita, the UK's leading provider of business process management and integrated professional support service solutions. Through bespoke, quality solutions, we've helped countless organisations unlock value and maximise their potential. With access to our range of unique and diverse opportunities, offering real career advancement and progression, we can unlock your potential too.\n\nParkingEye (part of Capita) is the market leading car park management company. ParkingEye not only provide full circle car park management services but 

In [4]:
print(job_descriptions.target[0])
print(job_descriptions.target_names[job_descriptions.target[0]])

1
technical


In [5]:
job_descriptions.data[31]

b'Redbooth Ltd require roofing joiners immediately \n\nRedbooth Ltd require roofing tilers immediately for works on new build housing projects around the stockton and north east areas. Must be time served experienced and proficient with slating and concrete roof tiling. \n\nPlease note this is an immediate start so please call us on 07510439507.\n'

In [6]:
print(job_descriptions.target[31])
print(job_descriptions.target_names[job_descriptions.target[31]])

0
non_technical


In [7]:
X_train, X_test, y_train, y_test = train_test_split(job_descriptions.data, job_descriptions.target, test_size=0.3, random_state=0)

In [8]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(33, 2475)

In [9]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(33, 2475)

Training code extracted from http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [10]:
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [11]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
y_pred = clf.predict(X_test_tfidf)

In [12]:
print(classification_report(y_test, y_pred, labels=None, target_names=job_descriptions.target_names))

               precision    recall  f1-score   support

non_technical       1.00      0.38      0.55         8
    technical       0.58      1.00      0.74         7

  avg / total       0.81      0.67      0.63        15



## Using pipelines

In [13]:
pip_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [14]:
pip_clf.fit(X_train, y_train) 

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [15]:
y_pred = pip_clf.predict(X_test)

In [16]:
print(classification_report(y_test, y_pred, labels=None, target_names=job_descriptions.target_names))

               precision    recall  f1-score   support

non_technical       1.00      0.38      0.55         8
    technical       0.58      1.00      0.74         7

  avg / total       0.81      0.67      0.63        15



## Model selection: cross validation

In [17]:
cv_scores = cross_val_score(pip_clf, job_descriptions.data, job_descriptions.target, cv=10)
print([np.mean(cv_scores), np.std(cv_scores)])

[0.80000000000000004, 0.13540064007726602]


## Model selection: grid search

Introducing the tf_max and tf_min parameters

In [47]:
class TfidTransformerOptional(TfidfTransformer):
    def __init__(self, activate=True):
        super().__init__()
        self.activate = activate
    
    def fit_transform(self, X, y=None):
        if self.activate:
            return super().fit_transform(X, y)
        else:
            return X
        
    def transform(self, X, y=None):
        if self.activate:
            return super().transform(X, y)
        else:
            return X

In [56]:
pip_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidTransformerOptional()),
                    ('clf', MultinomialNB()),
])

param_grid = [
    {
        'vect__min_df': np.arange(0,0.6,0.1),
        'vect__max_df': np.arange(0.6,1.1,0.1),
        'tfidf__activate': [True, False],
        'clf': [MultinomialNB()],
    },
    {
        'vect__min_df': np.arange(0,0.6,0.1),
        'vect__max_df': np.arange(0.6,1.1,0.1),
        'tfidf__activate': [True, False],
        'clf': [SVC()],
        'clf__C': [0.5, 1, 1.5]
    },
]

grid = GridSearchCV(pip_clf, cv=10, n_jobs=3, param_grid=param_grid)
grid.fit(job_descriptions.data, job_descriptions.target)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...formerOptional(activate=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid=[{'vect__min_df': array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5]), 'vect__max_df': array([ 0.6,  0.7,  0.8,  0.9,  1. ,  1.1]), 'tfidf__activate': [True, False], 'clf': [MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)]}, {'vect__min_df': array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5]),...ty=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)], 'clf__C': [0.5, 1, 1.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [64]:
mean_scores = list(grid.cv_results_['mean_test_score'])
print(mean_scores)
index_max_score = mean_scores.index(max(mean_scores))
print(mean_scores[index_max_score])
print(grid.cv_results_['param_vect__min_df'][index_max_score])
print(grid.cv_results_['param_vect__max_df'][index_max_score])
print(grid.cv_results_['param_clf'][index_max_score])
print(grid.cv_results_['param_clf__C'][index_max_score])
print(grid.cv_results_['param_tfidf__activate'][index_max_score])

[0.875, 0.85416666666666663, 0.85416666666666663, 0.79166666666666663, 0.60416666666666663, 0.58333333333333337, 0.875, 0.85416666666666663, 0.85416666666666663, 0.8125, 0.625, 0.5625, 0.875, 0.83333333333333337, 0.83333333333333337, 0.8125, 0.66666666666666663, 0.625, 0.83333333333333337, 0.85416666666666663, 0.85416666666666663, 0.8125, 0.70833333333333337, 0.6875, 0.85416666666666663, 0.85416666666666663, 0.8125, 0.75, 0.6875, 0.66666666666666663, 0.8125, 0.875, 0.79166666666666663, 0.75, 0.66666666666666663, 0.66666666666666663, 0.875, 0.85416666666666663, 0.875, 0.83333333333333337, 0.47916666666666669, 0.47916666666666669, 0.875, 0.85416666666666663, 0.89583333333333337, 0.8125, 0.6875, 0.64583333333333337, 0.875, 0.85416666666666663, 0.85416666666666663, 0.8125, 0.66666666666666663, 0.625, 0.875, 0.85416666666666663, 0.85416666666666663, 0.79166666666666663, 0.72916666666666663, 0.70833333333333337, 0.85416666666666663, 0.85416666666666663, 0.875, 0.79166666666666663, 0.70833333

Using all the training data to traing a model using the best parameters, and storing it into disk

In [81]:
pip_clf = Pipeline([('vect', CountVectorizer(min_df = 0.2, max_df = 0.7)),
                    ('tfidf', TfidTransformerOptional(activate=False)),
                    ('clf', MultinomialNB()),
])
pip_clf.fit(job_descriptions.data, job_descriptions.target) 

pickle.dump([pip_clf, job_descriptions.target_names], open('models/model.p', 'wb'))

## Testing our data with the validation model

In [82]:
files = ['non_technical_01.txt', 'non_technical_02.txt', 'non_technical_03.txt',
         'technical_01.txt', 'technical_02.txt', 'technical_03.txt']
for fil in files:
    with open('data/validation/' + fil, 'r') as f:
        text = [' '.join(f.readlines())] # The input to the pipeline has to be an array of documents
                                           # even if we are only planning to process a single document
        pred = pip_clf.predict(text)
        
        h = '------------------------------------------'
        print(h + ' ' + fil + ' ' + h)
        print('Predicted class: ' + job_descriptions.target_names[pred[0]])
        print(h + h)
        print(' ')
        print(text[0])

------------------------------------------ non_technical_01.txt ------------------------------------------
Predicted class: non_technical
------------------------------------------------------------------------------------
 
WE HAVE A VACANCY FOR A CAR MECHANIC AT OUR CAR SALES IN LLANTWIT FARDRE.EXPERIENCE IS ESSENTIAL AS WILL BE DOING ALL ASPECTS OF MECHANICS INCLUDING HEAD GASKETS GEARBOX CHANGE AND CLUTCHES.DUTIES ALSO INCLUDE SERVICING CHANGING TYRES AND WORKING AS PART OF A TEAM TO REACH OUR TARGETS.A FULL DRIVING LICENSE IS ALSO VITAL AS THE JOB INVOLVES DRIVING BETWEEN OUR CAR SALES AND TO THE MOT STATION.THE POSITION IS FULL TIME AND PERMANENT HOURS AND WAGES TO BE DISCUSSED AT INTERVIEW PROCESS.PLEASE CALL REVOLUTION CARS FOR MORE DETAILS

------------------------------------------ non_technical_02.txt ------------------------------------------
Predicted class: technical
------------------------------------------------------------------------------------
 
Over our 25 years o

## Conclusions

Several things to improve. Didn't do proper model selection with the Bayes classifier (parameter alpha). Didn't tokenize. Dataset is very small. We could label automatically basically on searching skills on universal jobmatch, rather than going document per document. Biased (Lift engineer as technical, should be non-techinical, but many techincal jobs have the word engineer in it, whereas non-technical jobs in the training dataset didnt have it.) The java developer is non-technical maybe due to the short length and many of the technology words probably discarded. 