# ESCO Occupation Classifier

Explore our dataset with SQL

In [0]:
%sql
select * from default.esco_4occupations


title,idesco_level_4,esco_level_4


Now check the distribution of ESCO OCCUPATION in the dataset

In [0]:
%sql
select esco_level_4, count(*) from default.esco_4occupations group by esco_level_4

esco_level_4,count(1)
Advertising and marketing professionals,5000
Software developers,5000
"Mathematicians, actuaries and statisticians",4976
2120,22
"and Evaluation Expert""""",1
Industrial and production engineers,5000
"Lön: 400 kr""",1


Read data from database (lowercase, alias, ...)

In [0]:
dataset = spark.sql("select regexp_replace(lower(title), '[0-9]', ' ') as title_cleaned,title,idesco_level_4 as target, esco_level_4 as target_label from default.esco_4occupations")
display(dataset.select("*"))

title_cleaned,title,target,target_label
b -c softwareentwickler c++ und c#/.net (m/w),B93-C04 Softwareentwickler C++ und C#/.NET (m/w),2512,Software developers
gezocht: oracle developer #freelance #pands #jobs #vacatures (req: –loc:bxl),Gezocht: Oracle Developer #Freelance #PandS #Jobs #Vacatures (Req:9096–Loc:Bxl),2512,Software developers
senior (gxp process excellence) engineer,Senior (GXP Process Excellence) Engineer,2512,Software developers
software-entwickler (m/w/d) buildsystem / integration,Software-Entwickler (m/w/d) Buildsystem / Integration,2512,Software developers
business intelligence developer,Business Intelligence Developer,2512,Software developers
microsoft dynamics nav functional consultant,Microsoft Dynamics NAV Functional Consultant,2512,Software developers
senior ecotoxicological expert,Senior Ecotoxicological Expert,2512,Software developers
senior physical layer software engineer -,Senior Physical Layer Software Engineer - 729984,2512,Software developers
junior of senior inkoper,Junior of Senior Inkoper,2512,Software developers
software engineer .net (se ),Software Engineer .Net (SE01),2512,Software developers


### Pre-processing (clean text and reduce features)

In [0]:
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

regexTokenizer = RegexTokenizer(inputCol="title_cleaned", outputCol="words", pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = regexTokenizer.transform(dataset)
tokenized.select("title_cleaned", "words").withColumn("tokens", countTokens(col("words"))).show(truncate=False)


+-------------------------------------------------------------------------------+------------------------------------------------------------------------------+------+
|title_cleaned                                                                  |words                                                                         |tokens|
+-------------------------------------------------------------------------------+------------------------------------------------------------------------------+------+
|b  -c   softwareentwickler c++ und c#/.net (m/w)                               |[b, c, softwareentwickler, c, und, c, net, m, w]                              |9     |
|gezocht: oracle developer #freelance #pands #jobs #vacatures (req:    –loc:bxl)|[gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl]|10    |
|senior (gxp process excellence) engineer                                       |[senior, gxp, process, excellence, engineer]                                  |

In [0]:
from pyspark.sql.functions import explode, desc
tokens = tokenized.select(explode(col("words")).alias("word")).groupBy(col("word")).count().orderBy(desc("count"))
tokens.show()

We have to filter shor sentences, stopwords, ...

Start short string (len < 3) with a user defined funtion

In [0]:
from pyspark.sql.functions import udf, explode, desc, col
from pyspark.sql.types import ArrayType, StringType
def filter_by_len(words):
  filtered = [word for word in words if len(word) >= 3]
  return filtered

filter_by_len_udf = udf(filter_by_len, ArrayType(StringType()))

filtered = tokenized.withColumn("filtered", filter_by_len_udf(col("words")))
filtered.show()

+--------------------+--------------------+------+-------------------+--------------------+--------------------+
|       title_cleaned|               title|target|       target_label|               words|            filtered|
+--------------------+--------------------+------+-------------------+--------------------+--------------------+
|b  -c   softwaree...|B93-C04 Softwaree...|  2512|Software developers|[b, c, softwareen...|[softwareentwickl...|
|gezocht: oracle d...|Gezocht: Oracle D...|  2512|Software developers|[gezocht, oracle,...|[gezocht, oracle,...|
|senior (gxp proce...|Senior (GXP Proce...|  2512|Software developers|[senior, gxp, pro...|[senior, gxp, pro...|
|software-entwickl...|Software-Entwickl...|  2512|Software developers|[software, entwic...|[software, entwic...|
|business intellig...|Business Intellig...|  2512|Software developers|[business, intell...|[business, intell...|
|microsoft dynamic...|Microsoft Dynamic...|  2512|Software developers|[microsoft, dynam...|[micr

In [0]:
filtered.printSchema()

root
 |-- title_cleaned: string (nullable = true)
 |-- title: string (nullable = true)
 |-- target: string (nullable = true)
 |-- target_label: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
tokens = filtered.select(explode(col("filtered")).alias("word")).groupBy(col("word")).count().orderBy(desc("count"))
tokens.show()

+-----------+-----+
|       word|count|
+-----------+-----+
|   engineer| 2699|
|     senior| 2696|
|    manager| 2071|
|  marketing| 2060|
|  developer| 1813|
|   business| 1506|
|   software| 1305|
| consultant| 1283|
|development| 1102|
|    analyst| 1021|
|  analytics|  805|
|        and|  793|
|     supply|  793|
|      chain|  772|
| specialist|  732|
|    actuary|  682|
|  actuarial|  677|
|       java|  573|
|        job|  573|
|     junior|  540|
+-----------+-----+
only showing top 20 rows



Now we remove the stopwords...

In [0]:
from pyspark.ml.feature import StopWordsRemover


remover = StopWordsRemover(inputCol="filtered", outputCol="cleaned")
cleaned = remover.transform(filtered)
cleaned.show()

+--------------------+--------------------+------+-------------------+--------------------+--------------------+--------------------+
|       title_cleaned|               title|target|       target_label|               words|            filtered|             cleaned|
+--------------------+--------------------+------+-------------------+--------------------+--------------------+--------------------+
|b  -c   softwaree...|B93-C04 Softwaree...|  2512|Software developers|[b, c, softwareen...|[softwareentwickl...|[softwareentwickl...|
|gezocht: oracle d...|Gezocht: Oracle D...|  2512|Software developers|[gezocht, oracle,...|[gezocht, oracle,...|[gezocht, oracle,...|
|senior (gxp proce...|Senior (GXP Proce...|  2512|Software developers|[senior, gxp, pro...|[senior, gxp, pro...|[senior, gxp, pro...|
|software-entwickl...|Software-Entwickl...|  2512|Software developers|[software, entwic...|[software, entwic...|[software, entwic...|
|business intellig...|Business Intellig...|  2512|Software dev

In [0]:
tokens = cleaned.select(explode(col("cleaned")).alias("word")).groupBy(col("word")).count().orderBy(desc("count"))
tokens.show()

N-grams creation....

In [0]:
from pyspark.ml.feature import NGram
ngrams2 = NGram(n=2, inputCol="cleaned", outputCol="ngrams_2")
ngrams3 = NGram(n=3, inputCol="cleaned", outputCol="ngrams_3")
ngrams4 = NGram(n=4, inputCol="cleaned", outputCol="ngrams_4")


ngrams = ngrams2.transform(cleaned)
ngrams = ngrams3.transform(ngrams)
ngrams = ngrams4.transform(ngrams)






display(ngrams)

title_cleaned,title,target,target_label,words,filtered,cleaned,ngrams_2,ngrams_3,ngrams_4
b -c softwareentwickler c++ und c#/.net (m/w),B93-C04 Softwareentwickler C++ und C#/.NET (m/w),2512,Software developers,"List(b, c, softwareentwickler, c, und, c, net, m, w)","List(softwareentwickler, und, net)","List(softwareentwickler, und, net)","List(softwareentwickler und, und net)",List(softwareentwickler und net),List()
gezocht: oracle developer #freelance #pands #jobs #vacatures (req: –loc:bxl),Gezocht: Oracle Developer #Freelance #PandS #Jobs #Vacatures (Req:9096–Loc:Bxl),2512,Software developers,"List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl)","List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl)","List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl)","List(gezocht oracle, oracle developer, developer freelance, freelance pands, pands jobs, jobs vacatures, vacatures req, req loc, loc bxl)","List(gezocht oracle developer, oracle developer freelance, developer freelance pands, freelance pands jobs, pands jobs vacatures, jobs vacatures req, vacatures req loc, req loc bxl)","List(gezocht oracle developer freelance, oracle developer freelance pands, developer freelance pands jobs, freelance pands jobs vacatures, pands jobs vacatures req, jobs vacatures req loc, vacatures req loc bxl)"
senior (gxp process excellence) engineer,Senior (GXP Process Excellence) Engineer,2512,Software developers,"List(senior, gxp, process, excellence, engineer)","List(senior, gxp, process, excellence, engineer)","List(senior, gxp, process, excellence, engineer)","List(senior gxp, gxp process, process excellence, excellence engineer)","List(senior gxp process, gxp process excellence, process excellence engineer)","List(senior gxp process excellence, gxp process excellence engineer)"
software-entwickler (m/w/d) buildsystem / integration,Software-Entwickler (m/w/d) Buildsystem / Integration,2512,Software developers,"List(software, entwickler, m, w, d, buildsystem, integration)","List(software, entwickler, buildsystem, integration)","List(software, entwickler, buildsystem, integration)","List(software entwickler, entwickler buildsystem, buildsystem integration)","List(software entwickler buildsystem, entwickler buildsystem integration)",List(software entwickler buildsystem integration)
business intelligence developer,Business Intelligence Developer,2512,Software developers,"List(business, intelligence, developer)","List(business, intelligence, developer)","List(business, intelligence, developer)","List(business intelligence, intelligence developer)",List(business intelligence developer),List()
microsoft dynamics nav functional consultant,Microsoft Dynamics NAV Functional Consultant,2512,Software developers,"List(microsoft, dynamics, nav, functional, consultant)","List(microsoft, dynamics, nav, functional, consultant)","List(microsoft, dynamics, nav, functional, consultant)","List(microsoft dynamics, dynamics nav, nav functional, functional consultant)","List(microsoft dynamics nav, dynamics nav functional, nav functional consultant)","List(microsoft dynamics nav functional, dynamics nav functional consultant)"
senior ecotoxicological expert,Senior Ecotoxicological Expert,2512,Software developers,"List(senior, ecotoxicological, expert)","List(senior, ecotoxicological, expert)","List(senior, ecotoxicological, expert)","List(senior ecotoxicological, ecotoxicological expert)",List(senior ecotoxicological expert),List()
senior physical layer software engineer -,Senior Physical Layer Software Engineer - 729984,2512,Software developers,"List(senior, physical, layer, software, engineer)","List(senior, physical, layer, software, engineer)","List(senior, physical, layer, software, engineer)","List(senior physical, physical layer, layer software, software engineer)","List(senior physical layer, physical layer software, layer software engineer)","List(senior physical layer software, physical layer software engineer)"
junior of senior inkoper,Junior of Senior Inkoper,2512,Software developers,"List(junior, of, senior, inkoper)","List(junior, senior, inkoper)","List(junior, senior, inkoper)","List(junior senior, senior inkoper)",List(junior senior inkoper),List()
software engineer .net (se ),Software Engineer .Net (SE01),2512,Software developers,"List(software, engineer, net, se)","List(software, engineer, net)","List(software, engineer, net)","List(software engineer, engineer net)",List(software engineer net),List()


In [0]:

# union of the results

def union_ngrams(c1,c2,c3,c4):
  return c1 + c2 + c3 + c4

union_ngrams_udf = udf(union_ngrams, ArrayType(StringType()))

ngrams_final = ngrams.filter("cleaned is not Null").withColumn("ngrams", union_ngrams_udf(col("cleaned"), col("ngrams_2"), col("ngrams_3"), col("ngrams_4")))
display(ngrams_final)



title_cleaned,title,target,target_label,words,filtered,cleaned,ngrams_2,ngrams_3,ngrams_4,ngrams
b -c softwareentwickler c++ und c#/.net (m/w),B93-C04 Softwareentwickler C++ und C#/.NET (m/w),2512,Software developers,"List(b, c, softwareentwickler, c, und, c, net, m, w)","List(softwareentwickler, und, net)","List(softwareentwickler, und, net)","List(softwareentwickler und, und net)",List(softwareentwickler und net),List(),"List(softwareentwickler, und, net, softwareentwickler und, und net, softwareentwickler und net)"
gezocht: oracle developer #freelance #pands #jobs #vacatures (req: –loc:bxl),Gezocht: Oracle Developer #Freelance #PandS #Jobs #Vacatures (Req:9096–Loc:Bxl),2512,Software developers,"List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl)","List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl)","List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl)","List(gezocht oracle, oracle developer, developer freelance, freelance pands, pands jobs, jobs vacatures, vacatures req, req loc, loc bxl)","List(gezocht oracle developer, oracle developer freelance, developer freelance pands, freelance pands jobs, pands jobs vacatures, jobs vacatures req, vacatures req loc, req loc bxl)","List(gezocht oracle developer freelance, oracle developer freelance pands, developer freelance pands jobs, freelance pands jobs vacatures, pands jobs vacatures req, jobs vacatures req loc, vacatures req loc bxl)","List(gezocht, oracle, developer, freelance, pands, jobs, vacatures, req, loc, bxl, gezocht oracle, oracle developer, developer freelance, freelance pands, pands jobs, jobs vacatures, vacatures req, req loc, loc bxl, gezocht oracle developer, oracle developer freelance, developer freelance pands, freelance pands jobs, pands jobs vacatures, jobs vacatures req, vacatures req loc, req loc bxl, gezocht oracle developer freelance, oracle developer freelance pands, developer freelance pands jobs, freelance pands jobs vacatures, pands jobs vacatures req, jobs vacatures req loc, vacatures req loc bxl)"
senior (gxp process excellence) engineer,Senior (GXP Process Excellence) Engineer,2512,Software developers,"List(senior, gxp, process, excellence, engineer)","List(senior, gxp, process, excellence, engineer)","List(senior, gxp, process, excellence, engineer)","List(senior gxp, gxp process, process excellence, excellence engineer)","List(senior gxp process, gxp process excellence, process excellence engineer)","List(senior gxp process excellence, gxp process excellence engineer)","List(senior, gxp, process, excellence, engineer, senior gxp, gxp process, process excellence, excellence engineer, senior gxp process, gxp process excellence, process excellence engineer, senior gxp process excellence, gxp process excellence engineer)"
software-entwickler (m/w/d) buildsystem / integration,Software-Entwickler (m/w/d) Buildsystem / Integration,2512,Software developers,"List(software, entwickler, m, w, d, buildsystem, integration)","List(software, entwickler, buildsystem, integration)","List(software, entwickler, buildsystem, integration)","List(software entwickler, entwickler buildsystem, buildsystem integration)","List(software entwickler buildsystem, entwickler buildsystem integration)",List(software entwickler buildsystem integration),"List(software, entwickler, buildsystem, integration, software entwickler, entwickler buildsystem, buildsystem integration, software entwickler buildsystem, entwickler buildsystem integration, software entwickler buildsystem integration)"
business intelligence developer,Business Intelligence Developer,2512,Software developers,"List(business, intelligence, developer)","List(business, intelligence, developer)","List(business, intelligence, developer)","List(business intelligence, intelligence developer)",List(business intelligence developer),List(),"List(business, intelligence, developer, business intelligence, intelligence developer, business intelligence developer)"
microsoft dynamics nav functional consultant,Microsoft Dynamics NAV Functional Consultant,2512,Software developers,"List(microsoft, dynamics, nav, functional, consultant)","List(microsoft, dynamics, nav, functional, consultant)","List(microsoft, dynamics, nav, functional, consultant)","List(microsoft dynamics, dynamics nav, nav functional, functional consultant)","List(microsoft dynamics nav, dynamics nav functional, nav functional consultant)","List(microsoft dynamics nav functional, dynamics nav functional consultant)","List(microsoft, dynamics, nav, functional, consultant, microsoft dynamics, dynamics nav, nav functional, functional consultant, microsoft dynamics nav, dynamics nav functional, nav functional consultant, microsoft dynamics nav functional, dynamics nav functional consultant)"
senior ecotoxicological expert,Senior Ecotoxicological Expert,2512,Software developers,"List(senior, ecotoxicological, expert)","List(senior, ecotoxicological, expert)","List(senior, ecotoxicological, expert)","List(senior ecotoxicological, ecotoxicological expert)",List(senior ecotoxicological expert),List(),"List(senior, ecotoxicological, expert, senior ecotoxicological, ecotoxicological expert, senior ecotoxicological expert)"
senior physical layer software engineer -,Senior Physical Layer Software Engineer - 729984,2512,Software developers,"List(senior, physical, layer, software, engineer)","List(senior, physical, layer, software, engineer)","List(senior, physical, layer, software, engineer)","List(senior physical, physical layer, layer software, software engineer)","List(senior physical layer, physical layer software, layer software engineer)","List(senior physical layer software, physical layer software engineer)","List(senior, physical, layer, software, engineer, senior physical, physical layer, layer software, software engineer, senior physical layer, physical layer software, layer software engineer, senior physical layer software, physical layer software engineer)"
junior of senior inkoper,Junior of Senior Inkoper,2512,Software developers,"List(junior, of, senior, inkoper)","List(junior, senior, inkoper)","List(junior, senior, inkoper)","List(junior senior, senior inkoper)",List(junior senior inkoper),List(),"List(junior, senior, inkoper, junior senior, senior inkoper, junior senior inkoper)"
software engineer .net (se ),Software Engineer .Net (SE01),2512,Software developers,"List(software, engineer, net, se)","List(software, engineer, net)","List(software, engineer, net)","List(software engineer, engineer net)",List(software engineer net),List(),"List(software, engineer, net, software engineer, engineer net, software engineer net)"


Train a Word2Vec model...

In [0]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=300, minCount=10, inputCol="ngrams", outputCol="features")
model = word2Vec.fit(ngrams_final)

Try our model

In [0]:
model.findSynonyms("java", 10).show(truncate=False)
model.findSynonyms("senior developer", 10).show(truncate=False)

+-----------------------+------------------+
|word                   |similarity        |
+-----------------------+------------------+
|jee                    |0.9312806129455566|
|backend                |0.9187647700309753|
|php                    |0.9084030985832214|
|java backend           |0.9025614857673645|
|scala                  |0.8990962505340576|
|softwareentwickler java|0.8912721872329712|
|java entwickler        |0.8900978565216064|
|fullstack              |0.8858669996261597|
|junior senior          |0.884949803352356 |
|senior java            |0.8828367590904236|
+-----------------------+------------------+

+----------------------+------------------+
|word                  |similarity        |
+----------------------+------------------+
|developer junior      |0.9705222249031067|
|learn                 |0.9683489799499512|
|python developer      |0.9672991037368774|
|junior senior         |0.9660220742225647|
|angular               |0.9651327133178711|
|developer contra

Now apply our model to the data...

In [0]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

ngrams_featured = model.transform(ngrams_final)
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(ngrams_featured)
scaledData = scalerModel.transform(ngrams_featured)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()


Features scaled to range: [0.000000, 1.000000]
+--------------------+--------------------+
|            features|      scaledFeatures|
+--------------------+--------------------+
|[-0.0357836397985...|[0.35636445123551...|
|[-0.0044207780293...|[0.41421106729170...|
|[-0.0028463451391...|[0.41711499919610...|
|[-0.0447060678154...|[0.33990765365186...|
|[-0.0217656299161...|[0.38221969497910...|
|[-0.0028701475821...|[0.41707109724634...|
|[0.02844163667759...|[0.47482350439064...|
|[-0.0141771302691...|[0.39621615450986...|
|[0.03355969898014...|[0.48426341415225...|
|[-0.0800628668318...|[0.27469449827317...|
|[-0.0173679931710...|[0.39033082993297...|
|[-0.0077159447957...|[0.40813336149489...|
|[0.00419744849205...|[0.43010678638569...|
|[-0.0315386446192...|[0.36419404944318...|
|[-0.0061797209084...|[0.41096681955246...|
|[0.05712341765562...|[0.52772505349836...|
|[-0.0026739795825...|[0.41743291547069...|
|[0.02378323125756...|[0.46623139976226...|
|[-0.0431319226821...|[0.3428

Convert target label to index

In [0]:
from pyspark.ml.feature import IndexToString, StringIndexer

indexer = StringIndexer(inputCol="target", outputCol="label")
indexer_model = indexer.fit(scaledData)
indexed = indexer_model.transform(scaledData)

converter = IndexToString(inputCol="prediction", outputCol="prediction_category", labels=indexer_model.labels)


Prepare train and test

In [0]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

train, test = indexed.randomSplit([0.8, 0.2], seed=12345)

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label", featuresCol="scaledFeatures")
paramGrid = ParamGridBuilder()\
    .addGrid(nb.smoothing, [0.1, 0.5, 1.0]) \
    .build()

tvs = TrainValidationSplit(estimator=nb,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           trainRatio=0.8)

In [0]:
model = tvs.fit(train)
predictions = model.transform(test).select("title", "label", "prediction")
display(predictions)

Compute accuracy on the test set

In [0]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Test set weighted precision = " + str(precision))