In [6]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk
import os
import gensim
import gzip

from gensim.models import word2vec, KeyedVectors
from gensim.test.utils import common_texts, get_tmpfile

from nltk.stem import PorterStemmer
from nltk import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [8]:
os.chdir('/Users/patrickrs/Documents/GitLab/patrick-steiner/revealapp/00_exploration/Pat')

data = pd.read_csv('/Users/patrickrs/Documents/Gitlab/patrick-steiner/Exercises/data/job_ads_eng.csv')  # .sample(50000, random_state=23)
data_original = data

In [7]:
#Function for cleaning and stemming the data
nltk.download('stopwords')
STOP_WORDS = nltk.corpus.stopwords.words('english')

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = re.sub("xxxx", "", sentence)
    sentence = re.sub("xxx", "", sentence)
    sentence = re.sub("xx", "", sentence)
    sentence = re.sub("\s\s+", " ", sentence)
       
    # stemming of words (seems not to affect accuracy, but should make things faster
   # porter = PorterStemmer()
   # words = word_tokenize(sentence)
   # sentence = " ".join([porter.stem(word) for word in words])
      
    sentence = sentence.split(" ")
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
    sentence = " ".join(sentence)
    
    return sentence


def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data[data['Content'] == data['Content']]  # removes nan since nan == nan -> False
    
    for col in ['Content']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
data = clean_dataframe(data)
data = data['Content']

In [10]:
#Build Corpus:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for content in data:
        corpus_temp = nltk.word_tokenize(content)
        corpus.append(corpus_temp)
    return corpus

corpus = build_corpus(data)              

# Word2Vec Implementation
This section compares a pre-trained model to a model trained on the data alone.

In [67]:
# Model 1 is trained only on the available data
model_1 = word2vec.Word2Vec(corpus, size=300, min_count=1)

In [18]:
# Importing pre-trained model, updating vocab to include only words present in current dataset.
# and training the model (takes long to run)
model_2 = word2vec.Word2Vec(size=300, min_count=1)
model_2.build_vocab(corpus)
total_examples = model_2.corpus_count
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/patrickrs/Documents/Gitlab/patrick-steiner/revealapp/Playground/Tag/GoogleNews-vectors-negative300.bin', binary=True)
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format('/Users/patrickrs/Documents/Gitlab/patrick-steiner/revealapp/Playground/Tag/GoogleNews-vectors-negative300.bin', binary=True, lockf=1.0)
# intersect_word2vec_format() will let you bring vectors from an external file into a model that's already had its own vocabulary initialized
# see https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.intersect_word2vec_format.html
model_2.train(corpus, total_examples=total_examples, epochs=model_2.iter)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/patrickrs/Documents/Gitlab/patrick-steiner/revealapp/Playground/Tag/GoogleNews-vectors-negative300.bin'

In [70]:
# Let's save this model to disk:
wordvectors = model_2.wv
path = get_tmpfile("wordvectors.kv")
model_2.wv.save(path)
wv = KeyedVectors.load("model_2.wv", mmap='r')

FileNotFoundError: [Errno 2] No such file or directory: 'model_2.wv'

In [115]:
swords = model_2.most_similar('system', restrict_vocab=None)

  """Entry point for launching an IPython kernel.


In [116]:
swords

[('systems', 0.9117391705513),
 ('maintenance', 0.8235427737236023),
 ('integration', 0.8211653232574463),
 ('etc', 0.8199548125267029),
 ('control', 0.8183053731918335),
 ('software', 0.8153436779975891),
 ('operation', 0.812597393989563),
 ('monitoring', 0.8113700151443481),
 ('implementation', 0.8096862435340881),
 ('component', 0.8049724102020264)]

In [117]:
swords = [word[0] for word in swords]
swords

['systems',
 'maintenance',
 'integration',
 'etc',
 'control',
 'software',
 'operation',
 'monitoring',
 'implementation',
 'component']

In [118]:
model_1.most_similar('system', restrict_vocab=None)

  """Entry point for launching an IPython kernel.


[('systems', 0.7568676471710205),
 ('configuration', 0.6172717809677124),
 ('onmandatory', 0.6038140058517456),
 ('workflow', 0.5684077739715576),
 ('upgrades', 0.5452710390090942),
 ('operation', 0.5377562046051025),
 ('maintenance', 0.533307671546936),
 ('security', 0.5248088836669922),
 ('storage', 0.5245387554168701),
 ('deployment', 0.5200053453445435)]

In [None]:
#model['system'] # word vectror with cosine distance

In [78]:
#model.wv.word_vec('system', use_norm = True) # Word vector with euclidian distances

In [126]:
def return_similar_rows(sim_words, corpus):
    indexes = []
    for word in sim_words:
        for i in range(len(corpus)):
            if str(corpus[i]).find(word) > (-1):
                indexes.append(i)
    similar_rows = data_original['Content'][indexes]
    return similar_rows

In [129]:
similar_rows = return_similar_rows(sim_words = swords, corpus = corpus)
similar_rows

1       Ihre Herausforderung  You plan and implement A...
2         Since 1989 - more than a quarter of a centur...
8       Website Innovation Manager       ABOUT LA PRAI...
11      Senior Software Engineer Senior Software Engin...
13      Two Scientific researchers in Photovoltaic Mod...
                              ...                        
9945    Global Medical Director   Back to Previous Pag...
9948    Control Systems Engineer/Programmer (m/f)Dutie...
9975    SPS ENGINEER AUTOMATION (m/w) - 100%RHI Magnes...
9978    Strategic Product Portfolio Architect - Workfo...
9986    SWIFT Alliance / Financial Messaging Specialis...
Name: Content, Length: 20721, dtype: object

# Applying word2vec to whole sentences
This section applies word2vec to whole sentences by simply using the average of the word vectors.

In [134]:
def avg_sentence_vector(sentence, model):
    #function to average all words vectors in a given paragraph/sentence
    words = word_tokenize(sentence)
    sentenceVec = model[words[0]]
    for i in range(1, len(words)):
        sentenceVec = sentenceVec + model[words[i]]
    return sentenceVec

In [142]:
sentenceVec1 = avg_sentence_vector("hello people", model = model_2)
sentenceVec2 = avg_sentence_vector("howdy guys", model = model_2)
sen1_sen2_similarity =  cosine_similarity(sentenceVec1.reshape(1, -1),sentenceVec2.reshape(1, -1))
sen1_sen2_similarity

  after removing the cwd from sys.path.
  


array([[0.28418714]], dtype=float32)

In [148]:
sentenceVec1 = avg_sentence_vector(data[0], model = model_2)
sentenceVec2 = avg_sentence_vector(data[1], model = model_2)
sen1_sen2_similarity =  cosine_similarity(sentenceVec1.reshape(1, -1),sentenceVec2.reshape(1, -1))
sen1_sen2_similarity

  after removing the cwd from sys.path.
  


array([[0.95182484]], dtype=float32)

 This clearly doesnt work for long sentences or paragraphs because of the LLN.... Will have to find alternate solution

In [153]:
data[1]

'ihre herausforderung plan implement automation systems pharmaceutical production develop solutions expansions product changes optimize automation processes accordingly coordinate monitor control system regard function performance costs deadlines full responsibility project planning till commissioning ensure machine safety optimized maintenance supervise support interdiscliplinary team engineers automation engineer deltavihre kompetenz bsc degree electrical engineering automation process technology relevant educational background several years practical experience similar tasks automation control systems plc architecture engineer expertise distributed control system dcs emerson deltav fluency english least knowledge german must ihre perspektiven innovative international working environment attractive employment conditions beautiful swiss scenery über spring professional ihre karriere voranbringen eine stelle finden die ihnen gefällt oder eine neue chance zur beruflichen weiterentwicklu

# Applying RNN for similarity calculation
inspired by: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

In [3]:
from time import time
import itertools
import datetime

from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.callbacks import ModelCheckpoint

In [19]:
s2n =[None] * len(data)  # s2n -> sentence to numbers representation
for i in range(len(data)):
        s2n[i]= [model_2[word] for word in word_tokenize(data[i])]
# this huge thing is created super fast

  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
# Define max length of sentence so we can pad shorter ones later. 
max_seq_length = max(map(len, s2n))
max_seq_length

3329

In [37]:
padded = pad_sequences(s2n[0], maxlen=max_seq_length)
padded

IndexError: index 3000 is out of bounds for axis 0 with size 378

In [30]:
s2n[0]

[array([-1.57953496e-03,  1.33766013e-03, -1.49923225e-03,  1.14570127e-03,
        -3.81482940e-04, -3.24279128e-04,  1.49553397e-03, -9.20949446e-04,
        -5.22417016e-04, -4.60400071e-04, -1.00061174e-04,  6.40882470e-04,
         1.63471757e-03, -1.12980793e-04,  1.45942916e-03, -1.57988836e-06,
        -9.40448197e-04,  4.91925923e-04, -8.56906292e-04, -2.02084804e-04,
        -1.15820020e-03,  1.60412083e-03, -1.19461201e-03, -1.56961801e-03,
        -1.09670882e-03,  1.25995337e-03, -7.24765123e-04, -1.66348647e-04,
         1.65672204e-03, -1.00963388e-03,  2.68728909e-04,  1.83009994e-04,
        -1.37240742e-03, -4.80467803e-04,  1.21182027e-04,  4.41803306e-04,
        -1.00395863e-03, -1.03433093e-03, -7.18812458e-04,  3.93619179e-04,
         1.21864211e-03,  1.31638383e-03,  5.27424912e-04,  2.00315990e-04,
        -9.37373319e-04, -8.03021947e-04,  1.21537806e-03,  3.08612216e-05,
        -1.64588995e-03,  2.65361101e-04, -1.01845211e-03,  1.23467948e-03,
        -5.7

['system',
 'engineerjob',
 'description',
 'overview',
 'business',
 'area',
 'project',
 'resource',
 'work',
 'strategic',
 'programs',
 'enterprise',
 'services',
 'department',
 'entire',
 'chief',
 'technology',
 'office',
 'within',
 'bank',
 'key',
 'deliveries',
 'based',
 'servicenow',
 'platform',
 'midserver',
 'infrastructure',
 'consists',
 'configuration',
 'management',
 'database',
 'cmdb',
 'multiple',
 'automatic',
 'semiautomatic',
 'integrations',
 'various',
 'tools',
 'applications',
 'key',
 'responsibilities',
 'exciting',
 'opportunity',
 'lead',
 'servicenow',
 'data',
 'integration',
 'engineering',
 'stream',
 'including',
 'development',
 'maintenance',
 'servicenow',
 'midserver',
 'infrastructure',
 'executing',
 'changes',
 'analysis',
 'incidents',
 'problem',
 'tickets',
 'role',
 'includes',
 'deep',
 'troubleshooting',
 'complex',
 'issues',
 'code',
 'level',
 'open',
 'position',
 'responsibility',
 'subject',
 'matter',
 'expert',
 'lead',
 'poin