In [32]:
import pandas as pd
import numpy as np
from core.helpers import yakeHelperFunctions as yhf
from sklearn.cluster import SpectralClustering
import itertools

In [2]:
# Load the raw job postings table from disk.
JOB_CSV_PATH = "dataset/job.csv"
data_job = pd.read_csv(JOB_CSV_PATH)

In [3]:
# Load the word2vec text export: one token per row followed by its vector
# components, separated by whitespace. The first line is skipped (presumably the
# "<vocab_size> <dim>" header of the word2vec text format -- TODO confirm).
# `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated since
# pandas 2.2 and documented as equivalent.
vectors = pd.read_csv("word2vec.model", sep=r"\s+", skiprows=[0], header=None)


In [4]:
# Use the words (column 0) as the index instead of keeping them as a data column.
# The guard keeps the cell idempotent: once column 0 has been moved into the
# index, re-running the cell is a no-op instead of raising a KeyError.
if 0 in vectors.columns:
    vectors = vectors.set_index(0)

In [5]:
# Dimensions of the raw job table as (rows, columns).
data_job.shape

(8165, 1)

# Take a 15% sample of the original data

In [6]:
# Draw a reproducible 15% sample of the `job` column and cast every entry to str.
sampled_jobs = data_job["job"].sample(frac=0.15, random_state=45)
samples_data = sampled_jobs.astype(str)

# extract keywords & tokenize

In [7]:
# Keyword extraction and tokenization are delegated to the project's YAKE helpers;
# each job's tokens are then collapsed into a set, dropping duplicate tokens.
keywords = yhf.keywords_yake(samples_data)
keywords_tokenized = yhf.tokenizing_after_yake(keywords)
keywords_set = list(map(set, keywords_tokenized))

In [20]:
# Drop the jobs whose keyword set is empty.
keywords_set_no_empty = list(filter(None, keywords_set))

# Get word vectors for the keywords in the job descriptions

In [8]:
# Preview the first rows of the embedding table (index: word, columns: vector components).
vectors.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,291,292,293,294,295,296,297,298,299,300
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
de,-0.189453,0.181641,0.335938,-0.024048,-0.069824,0.075195,0.128906,-0.341797,-0.074707,-0.101074,...,0.000546,-0.017822,-0.060059,0.34375,-0.18457,0.112305,0.148438,0.116699,0.056885,0.105469
et,1.476465,0.650921,1.125034,-0.641027,2.13323,1.00961,0.522009,0.281066,-1.012411,1.352205,...,0.992428,0.372222,1.381234,-0.745683,0.519218,0.505964,-0.132756,0.549315,1.261921,0.157704
des,0.074707,0.112793,0.251953,-0.172852,-0.011658,0.233398,0.349609,-0.219727,-0.060059,0.287109,...,-0.023071,0.152344,0.111816,0.064453,-0.237305,0.000614,0.107422,0.141602,-0.036377,0.337891
la,2.521357,0.398546,0.327319,-0.384584,0.468277,0.268836,0.918718,1.271349,-2.448235,0.402296,...,1.211048,-0.172852,1.216892,1.828425,0.227053,0.490466,0.163657,0.464512,0.553507,1.263063
les,-0.120117,0.095703,0.168945,-0.030273,0.119141,0.147461,0.086426,-0.168945,0.051758,0.308594,...,0.182617,0.168945,-0.104492,0.033203,-0.488281,-0.053223,0.134766,-0.027832,0.087891,0.289062


In [9]:
def vectors_from_job(job, embeddings=None):
    """Return the embedding rows for the words of one job.

    Parameters
    ----------
    job : iterable of str
        Words (keywords) of a single job description.
    embeddings : pandas.DataFrame, optional
        Table indexed by word. Defaults to the notebook-level ``vectors`` table.

    Returns
    -------
    pandas.DataFrame
        Rows of ``embeddings`` whose index label is one of the words in ``job``,
        in the table's own row order. Words missing from the index are
        silently ignored, so the result is empty when no word is known.
    """
    if embeddings is None:
        embeddings = vectors
    # Materialize once: `job` may be a set or a one-shot iterator.
    return embeddings[embeddings.index.isin(list(job))]

In [10]:
# Naive document embedding: the plain average of a job's word vectors.
def doc_embed_from_job(job):
    """Return the column-wise mean of the embedding rows matched by `job`."""
    return vectors_from_job(job).mean()


In [26]:
# Build one document embedding per job (one row per job, one column per vector component).
num_of_jobs = len(keywords_set_no_empty)
# Take the width from the embedding table instead of hard-coding 300.
embedding_dim = vectors.shape[1]
doc_embeddings = np.zeros([num_of_jobs, embedding_dim])
# TODO: handle jobs where all words are out of vocabulary
for i, job_keywords in enumerate(keywords_set_no_empty):
    embeddings = np.asarray(doc_embed_from_job(job_keywords))
    # Rows start as zeros, so a job whose mean contains NaN (e.g. when none of
    # its keywords is in the embedding table) simply keeps its zero row.
    if not np.isnan(embeddings).any():
        doc_embeddings[i, :] = embeddings



In [27]:
# Expected: (number of non-empty jobs, embedding dimension).
doc_embeddings.shape

(1225, 300)

# clustering

In [30]:
# Spectral clustering of the document embeddings. Labels are assigned with the
# 'discretize' strategy because the k-means label assignment didn't work well.
spectral = SpectralClustering(assign_labels='discretize', random_state=42)
clustering = spectral.fit(doc_embeddings)

In [31]:
# Cluster label assigned to each row of doc_embeddings (one label per job).
clustering.labels_

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
# Browse the jobs' keyword sets cluster by cluster.
jobs_subset = keywords_set_no_empty[:num_of_jobs]


def get_keyword_set_by_cluster_number(number):
    """Return the keyword sets of the jobs whose cluster label equals `number`."""
    return [
        job_keywords
        for job_keywords, label in zip(jobs_subset, clustering.labels_)
        if label == number
    ]

In [36]:
# Inspect the keyword sets of the jobs assigned to cluster 3.
get_keyword_set_by_cluster_number(3)

[{'antenne',
  'antennes',
  'assurer',
  'dans',
  'des',
  'deutsche',
  'environnement',
  'finance',
  'finances',
  'foret',
  'für',
  'gesellschaft',
  'internationale',
  'l',
  'les',
  'pour',
  'projet',
  'responsable',
  'zusammenarbeit'},
 {'bon',
  'cameroun',
  'communautaire',
  'conseiller',
  'dans',
  'des',
  'description',
  'deutsche',
  'developpement',
  'economique',
  'endogene',
  'für',
  'gesellschaft',
  'giz',
  'gmbh',
  'innovations',
  'internationale',
  'le',
  'les',
  'numeriques',
  'pour',
  'projet',
  'promouvoir',
  'technique',
  'zusammenarbeit'},
 {'action',
  'agraire',
  'agraires',
  'appui',
  'coordination',
  'croix',
  'd',
  'dans',
  'des',
  'diagnostic',
  'diagnostics',
  'diplômés',
  'due',
  'est',
  'etude',
  'européenne',
  'financement',
  'française',
  'internationale',
  'jeunes',
  'l',
  'les',
  'par',
  'première',
  'pro',
  'projet',
  'rapport',
  'resilience',
  'rouge',
  'références',
  'régions',
  'union',

In [37]:
# Inspect the keyword sets of the jobs assigned to cluster 0.
get_keyword_set_by_cluster_number(0)

[{'au',
  'communication',
  'conduire',
  'd',
  'dans',
  'des',
  'direction',
  'délivrées',
  'développer',
  'emploi',
  'est',
  'et',
  'expertise',
  'garant',
  'l',
  'les',
  'marketing',
  'mission',
  'mode',
  'normes',
  'offres',
  'place',
  'prestations',
  'principale',
  'projet',
  'promouvoir',
  'respect',
  'réseaux',
  'sein',
  'stockage',
  'votre',
  'wise'},
 {'appareils',
  'aptitudes',
  'assigné',
  'aux',
  'charge',
  'd',
  'différentes',
  'défectueuses',
  'esprit',
  'essences',
  'etre',
  'famille',
  'fois',
  'forte',
  'grumes',
  'les',
  'missions',
  'opérations',
  'ouverture',
  'pannes',
  'pièces',
  'principales',
  'qualifications',
  'requises',
  'réaliser',
  'sans',
  'sciées',
  'secondaires',
  'serez',
  'sur',
  'toutes',
  'tâches',
  'une',
  'vous'},
 {'concevoir',
  'convertir',
  'css',
  'des',
  'design',
  'designer',
  'documenter',
  'graphiques',
  'html',
  'maquettes',
  'nous',
  'pouvant',
  'professionnels',
 

In [38]:
# Inspect the keyword sets of the jobs assigned to cluster 2.
get_keyword_set_by_cluster_number(2)

[{'a',
  'analysts',
  'breathe',
  'creative',
  'crystal',
  'digital',
  'engine',
  'forming',
  'group',
  'house',
  'marketing',
  'media',
  'optimization',
  'platforms',
  'ranging',
  'search',
  'social'},
 {'aids',
  'applications',
  'dans',
  'date',
  'des',
  'description',
  'données',
  'due',
  'elizabeth',
  'end',
  'foundation',
  'glaser',
  'job',
  'les',
  'number',
  'of',
  'pediatric',
  'positions',
  'programme',
  'reports',
  'start',
  'sud',
  'title'},
 {'accountability',
  'announcement',
  'assistance',
  'demonstrated',
  'experience',
  'gis',
  'highly',
  'integrated',
  'job',
  'manager',
  'managing',
  'mastery',
  'meal',
  'mechanism',
  'monitoring',
  'motivated',
  'of',
  'organization',
  'program',
  'project',
  'qualified',
  'services',
  'significant',
  'skills',
  'social',
  'system',
  'systems',
  'years'},
 {'aids',
  'announcement',
  'contract',
  'duration',
  'elizabeth',
  'foundation',
  'glaser',
  'hiv',
  'ict',
