In [76]:
import joblib
from scipy import spatial
import numpy as np
from transformers import pipeline, AutoTokenizer, TFPreTrainedModel  
from tqdm import tqdm
from gensim.models import Word2Vec


df = joblib.load("../raw_data/ip_2021-03-03.joblib")

## Baseline Words2vec model

In [11]:
### instanciate model


w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=20,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     )

### building vocab with tokenized words
w2v_model.build_vocab(df["job_text_tokenized"], progress_per=10000) 


###training the model on the dataset
w2v_model.train(df["job_text_tokenized"], total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)


### most similar words example
w2v_model.wv.most_similar(["python"])


[('r', 0.9423992037773132),
 ('java', 0.9318829774856567),
 ('scala', 0.9018955826759338),
 ('pyspark', 0.8928499221801758),
 ('scripting', 0.8636457920074463),
 ('bash', 0.8596066236495972),
 ('proficient', 0.8571724891662598),
 ('sas', 0.8519904017448425),
 ('css', 0.8489394187927246),
 ('html', 0.8456547260284424)]

## Training a Words2vec model with bi-gram parser

### Preprocessing

In [71]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from nltk.tokenize import sent_tokenize
'''
Preprocessing for the Job descriptions in paresed in senteneces.
Modified form the other preprocessing pipeline
'''
def to_lower(text):
    return text.lower()


def remove_number(text):
    text = ''.join(word for word in text if not word.isdigit())
    
    return text


def remove_stopwords_mod(text):

    stop_words = set(stopwords.words('english')) 
    new_text = []
    for sentence in text:
        new_sentence = tokenize(sentence)
        new_sentence = [w for w in sentence if not w in stop_words and w] 
        new_text.append(new_sentence)
  
    return new_text


def remove_punctuation_mod(text):
    
    punct = string.punctuation.replace(".","")
    for punctuation in punct:
        text = text.replace(punctuation, '')
    
    return text


def remove_stopwords(text):

    stop_words = set(stopwords.words('english'))
    new_text = []
    for sent in text:
        sent = word_tokenize(sent) 
        sent = [w for w in sent if w not in stop_words and w not in string.punctuation and w]  
        new_text.append(sent)
    return new_text


def lemmatize_words(text):
    
    lemmatizer = WordNetLemmatizer()
    for sentence in text:
        for word in sentence:
            word = lemmatizer.lemmatize(word)
        
    return text



In [80]:
### filter out german offers
df = df[df["tag_language"] == "en"]

In [81]:
### Apply preprocessing to df
df["job_text_sent"]= df["job_text"].apply(to_lower).apply(remove_number)\
                                    .apply(lambda x : x.replace('\n',' '))\
                                    .apply(remove_punctuation_mod)\
                                    .apply(lambda x: sent_tokenize(x))\
                                    .apply(remove_stopwords)\
                                    .apply(lemmatize_words)




In [82]:
df["job_text_sent"][0]

[['responsible',
  'improvement',
  'taledo',
  '’',
  'search',
  'matching',
  'engine',
  'candidates',
  'jobs',
  'business',
  'drivers',
  'data',
  'science'],
 ['develop',
  'compare',
  'different',
  'algorithmic',
  'approaches',
  'andor',
  'ml',
  'models'],
 ['monitor', 'production', 'performance', 'measure', 'success', 'work'],
 ['update', 'outdated', 'models'],
 ['research',
  'discuss',
  'algorithmical',
  'well',
  'model',
  'improvements',
  'regularly'],
 ['knowledgeable',
  'developed',
  'ai',
  'community',
  'propose',
  '’',
  'applicable',
  'taledo'],
 ['expect', 'curious', 'nature'],
 ['like',
  'solve',
  'challenging',
  'problems',
  'proficient',
  'python',
  'worked',
  'relevant',
  'libraries',
  'know',
  'use',
  'data',
  'handling',
  'numpy',
  'pandas',
  'dask',
  'psycopg',
  'mldl',
  'scikitlearn',
  'xgboost',
  'keras',
  'pytorch',
  'spacy',
  'visualization',
  'seaborn',
  'matplotlib',
  'experience',
  'evaluating',
  'different

### Parse the sentences

In [83]:
### Turn df into list of sentences
sentences = df["job_text_sent"].tolist()

### reduce the nesting of the list to fit the format of the Phrases module
sentence = []
for second in sentences:
    for first in second:
        sentence.append(first)
        

In [84]:
### Train the phraser to detect bi-grams
from gensim.models.phrases import Phrases

phrases = Phrases(sentence, min_count=30, progress_per=10000)
### transform the list of sentences to detect bigrams
sent = []
for phrase in phrases[sentence]:
    sent.append(phrase)

### Word2vec model v2
    

In [85]:
from gensim.models import Word2Vec

w2v_model2 = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     )

### building vocab with tokenized words
w2v_model2.build_vocab(sent, progress_per=10000) 


###training the model on the dataset
w2v_model2.train(sent, total_examples=w2v_model2.corpus_count, epochs=30, report_delay=1)


### most similar words example
w2v_model2.wv.most_similar(["python"])


[('scala', 0.9750685095787048),
 ('java', 0.9682307243347168),
 ('programming_languages', 0.966687798500061),
 ('r', 0.9658426642417908),
 ('programming', 0.9524619579315186),
 ('sql', 0.9322541952133179),
 ('javascript', 0.9288321733474731),
 ('proficiency', 0.9236559867858887),
 ('least_one', 0.9181228876113892),
 ('proficient', 0.9148669242858887)]

In [94]:
w2v_model2.wv.most_similar(["science","python","r"])

[('hands', 0.9449827075004578),
 ('programming_languages', 0.9349161386489868),
 ('similar', 0.9235848784446716),
 ('least_one', 0.923090934753418),
 ('software_engineering', 0.9210233688354492),
 ('programming', 0.9171253442764282),
 ('java', 0.9100304841995239),
 ('scala', 0.9072116613388062),
 ('solid', 0.9025342464447021),
 ('theoretical', 0.8921900391578674)]

## Vectorizer

In [19]:
def vectorizer(text):
    '''
    Replace the text with the respective vectors if there are in the model vocabulary
    '''
    new_text = []
    for word in text:
        if word in w2v_model.wv.vocab:
            vector = w2v_model.wv.__getitem__(word)
            new_text.append(vector)
    
    return new_text
    

In [22]:
df["vectorized_jobs"] = df["job_text_tokenized"].apply(vectorizer)

## Quick test for Translation pipeline and saving code for posterity


In [7]:
### quick test for transformer

from transformers import pipeline

summarizer = pipeline("summarization")

summarizer(df["job_text"][10],min_length=20, max_length=60)

All model checkpoint layers were used when initializing TFT5Model.

All the layers of TFT5Model were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5Model for predictions without further training.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
Token indices sequence length is longer than the specified maximum sequence length for this model (1091 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': 'TD Reply is an innovation and marketing consultancy and part of the Reply Group . we are working on international data science projects for our clients such as Audi, Adidas, Coca-Cola, Miele, Telefonica, and BMW . you will collaborate with an experienced and enthusiastic'}]

In [None]:
### vector extraction from transfomer model



feature_extractor = pipeline("feature-extraction", model = "distilbert-base-cased")

def similarity(s1, s2):
    return  1 - spatial.distance.cosine(feature_extractor(s1)[0][-1], feature_extractor(s2)[0][-1])

def get_features(s):
    return feature_extractor(s)[0][-1]

sentance1 = "no one loves sushi"
sentance2 = "I use java for backend stuff and I'm important"
sentance3 = "I use html and css for be the frontend guy there is"

print(similarity(sentance1, sentance2))
print(similarity(sentance2, sentance3))


# modeling
from sklearn.cluster import KMeans
model  = KMeans(n_clusters=2)
X= np.array([get_features(sentance1),get_features(sentance2),get_features(sentance3)])
model.fit(X)
model.predict(X)
tokens = s.lower().replace('  ',' ').replace('\n',' ').split(' ')
threshold = 0.8
for token in tqdm(set(tokens)):
    if threshold < similarity(token.lower(), 'Skills'.lower()):
        print(token)
