# Steps
## To do:
* Clean Text
* Tokenize
* Build vocab
* Generate Vectors
    * Probar una con industria y otra sin industria (quedarse con la probab

<img src="./img/bow.png" width="900"> 

In [168]:
import pandas as pd 
import numpy as np
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline 
from collections import Counter
from spacy.lemmatizer import Lemmatizer
# from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES

In [83]:
# Load the small English spaCy pipeline with its default components.
nlp = spacy.load('en_core_web_sm') 
# Raise the maximum text length (in characters) the pipeline accepts in a single call.
nlp.max_length = 15000000

In [65]:
# Read the raw job-postings CSV; the path is relative to the notebook's working directory.
df_data_raw = pd.read_csv('data/fake_job_postings.csv')

In [183]:
# The free-text columns are handled as two groups:
#   group 1 -> title, requirements, benefits
#   group 2 -> company_profile, description
l_grupo1 = [
    'title',
    'requirements',
    'benefits',
]
l_grupo2 = [
    'company_profile',
    'description',
]

# Select each group of columns and turn missing values into empty strings.
df_categorical_g1 = df_data_raw.loc[:, l_grupo1].fillna(value='')
df_categorical_g2 = df_data_raw.loc[:, l_grupo2].fillna(value='')

df_categorical_g1.head()

Unnamed: 0,title,requirements,benefits
0,Marketing Intern,Experience with content management systems a m...,
1,Customer Service - Cloud Video Production,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...
2,Commissioning Machinery Assistant (CMA),Implement pre-commissioning and commissioning ...,
3,Account Executive - Washington DC,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...
4,Bill Review Manager,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered


# title, requirements, benefits

In [184]:
# Flatten group 1 into one string: the rows of each column are joined with
# single spaces, then the per-column strings are joined with single spaces.
str_bow_raw = ' '.join(
    ' '.join(df_categorical_g1[col_name].astype(str))
    for col_name in df_categorical_g1.columns
)

In [185]:
def clean_text(str_text_raw):
    """Return a lower-cased copy of ``str_text_raw``.

    Only the case is normalised; punctuation is left in place (an earlier
    ``str.translate``-based punctuation stripping step was disabled).

    Parameters
    ----------
    str_text_raw : str
        Text to normalise.

    Returns
    -------
    str
        ``str_text_raw`` converted to lower case.
    """
    # Use the argument rather than the notebook-level ``str_bow_raw`` variable,
    # so the function works for whatever text it is given.
    return str_text_raw.lower()
# Lower-cased version of the concatenated group-1 text.
str_bow = clean_text(str_bow_raw)

In [186]:
# Optional: keep only the first 1,000,000 characters for a quicker trial run.
# str_bow = str_bow[0:1000000]
# Length of the text in characters.
len(str_bow)

14850392

In [187]:
%%time
# Alternative ways of obtaining a pipeline, left disabled:
# nlp = English()
# nlp = en_core_web_sm.load()
# Reload the model with the 'parser' and 'ner' components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) 
# The text processed below is 14,850,392 characters long, so the limit is set just above that.
nlp.max_length = 15000000
# Run the pipeline over the whole lower-cased text as one single Doc.
doc = nlp(str_bow)
# Timings recorded in earlier runs:
#   str_bow[0:1000000]        -> Wall time: 43.7 s
#   nlp.max_length = 15000000 -> Wall time: 4min 38s

Wall time: 3min 57s


## Tokenize words
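* text: The original (verbatim) text of the token.
* lemma: The base form of the token (e.g. "months" → "month").
* orth: The hash value of the token's verbatim text.
* is_alpha: Does the token consist of alphabetic characters?
* is_digit: Does the token consist of digits?
* is_title: Is the token in title case?
* lang: The language of the token's vocabulary (e.g. `en`).
* tag: Fine-grained part-of-speech tag (Penn Treebank): https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
* pos: Coarse-grained part-of-speech tag (e.g. `NOUN`, `VERB`).
* has_vector: Does the token have a vector associated with it?
* vector_norm: The L2 norm of the token's vector.
* is_oov: Is the token out-of-vocabulary?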

In [224]:
%%time
# One feature row per token of `doc`; punctuation, whitespace and stop-word
# tokens are left out.
l_token = [
    [
        token.text,
        token.lemma_,
        token.orth,
        token.is_alpha,
        token.is_digit,
        token.is_title,
        token.lang_,
        token.tag_,
        token.pos_,
        token.has_vector,
        token.vector_norm,
        token.is_oov,
    ]
    for token in doc
    if not (token.is_punct or token.is_space or token.is_stop)
]
print(l_token[:5])

[['marketing', 'market', 5624371593702924111, True, False, False, 'en', 'VBG', 'VERB', True, 21.006645, True], ['intern', 'intern', 6051314601965010107, True, False, False, 'en', 'NN', 'NOUN', True, 19.317013, True], ['customer', 'customer', 14759225161440374483, True, False, False, 'en', 'NN', 'NOUN', True, 19.821066, True], ['service', 'service', 208172016153456603, True, False, False, 'en', 'NN', 'NOUN', True, 19.358261, True], ['cloud', 'cloud', 13981346438767540862, True, False, False, 'en', 'NN', 'NOUN', True, 20.506468, True]]
Wall time: 22.3 s


In [225]:
# Tabulate the token rows; the labels are given in the same order as the
# values stored in each row of l_token.
pd_token = pd.DataFrame(
    data=l_token,
    columns=[
        'text',
        'lemme',
        'orth',
        'is_alpha',
        'is_digit',
        'is_title',
        'language',
        'tag',
        'part_of_speech',
        'has_vector',
        'vector_norm',
        'is_oov',
    ],
)
pd_token.head()

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov
0,marketing,market,5624371593702924111,True,False,False,en,VBG,VERB,True,21.006645,True
1,intern,intern,6051314601965010107,True,False,False,en,NN,NOUN,True,19.317013,True
2,customer,customer,14759225161440374483,True,False,False,en,NN,NOUN,True,19.821066,True
3,service,service,208172016153456603,True,False,False,en,NN,NOUN,True,19.358261,True
4,cloud,cloud,13981346438767540862,True,False,False,en,NN,NOUN,True,20.506468,True


In [226]:
# Check whether the lemma already maps plural nouns (tags NNS / NNPS) to
# their singular form.
(
    pd_token
    .loc[lambda frame: frame['tag'].isin(['NNPS', 'NNS'])]
    .drop_duplicates()
    .head(5)
)

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov
46,months,month,11199438407386692767,True,False,False,en,NNS,NOUN,True,20.87306,True
50,applications,application,7746610524179417841,True,False,False,en,NNS,NOUN,True,18.486517,True
53,installers,installer,15921962764723844213,True,False,False,en,NNS,NOUN,True,21.151781,True
58,sales,sale,14348989930891670846,True,False,False,en,NNS,NOUN,True,20.751493,True
61,hands,hand,1689680727489136653,True,False,False,en,NNS,NOUN,True,19.578394,True


In [228]:
# Add 'text_to_singular': plural nouns (tags NNS / NNPS) take their lemma,
# every other token keeps its original text.
is_plural_noun = pd_token['tag'].isin(['NNPS', 'NNS'])
pd_token['text_to_singular'] = np.where(
    is_plural_noun,
    pd_token['lemme'],
    pd_token['text'],
)
pd_token[is_plural_noun].drop_duplicates().head(5)

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov,text_to_singular
46,months,month,11199438407386692767,True,False,False,en,NNS,NOUN,True,20.87306,True,month
50,applications,application,7746610524179417841,True,False,False,en,NNS,NOUN,True,18.486517,True,application
53,installers,installer,15921962764723844213,True,False,False,en,NNS,NOUN,True,21.151781,True,installer
58,sales,sale,14348989930891670846,True,False,False,en,NNS,NOUN,True,20.751493,True,sale
61,hands,hand,1689680727489136653,True,False,False,en,NNS,NOUN,True,19.578394,True,hand


# Bag of words with lemmas

In [229]:
%%time
# Position 1 of every l_token row holds the token's lemma. Read that position
# directly instead of transposing all columns with zip(*l_token) just to keep
# one of them; `words` stays a tuple, as before.
words = tuple(row[1] for row in l_token)  # 1330127 tokens
# 88450 unique words without lemmatisation, 85060 unique lemmatised words.
word_freq = Counter(words)
print(f'Tamaño de bag of words: {len(words)}\nTamaño de palabras únicas: {len(word_freq)}')

Tamaño de bag of words: 1330127
Tamaño de palabras únicas: 85060
Wall time: 4.38 s


In [230]:
# The 20 most frequent entries of word_freq as (word, count) pairs, most frequent first.
common_words = word_freq.most_common(20)
print(common_words)

[('experience', 22383), ('work', 20253), ('skill', 11335), ('year', 10607), ('team', 8290), ('ability', 7455), ('service', 6629), ('customer', 6356), ('knowledge', 6241), ('communication', 6230), ('include', 5951), ('business', 5733), ('require', 5627), ('sale', 5610), ('company', 5580), ('management', 5520), ('development', 5453), ('time', 5442), ('amp', 4939), ('degree', 4848)]


# Bag of words with text and singular nouns

In [231]:
%%time
# Count frequencies over the 'text_to_singular' column of pd_token.
words = pd_token['text_to_singular'].tolist()
word_freq = Counter(words)
print(f'Tamaño de bag of words: {len(words)}\nTamaño de palabras únicas: {len(word_freq)}')

Tamaño de bag of words: 1330127
Tamaño de palabras únicas: 86681
Wall time: 564 ms


In [232]:
# Inspect the tokens whose lemma is 'experience'.
# NOTE(review): drop_duplicates() compares every column, including vector_norm,
# so rows with the same text can still appear more than once.
(
    pd_token
    .loc[pd_token['lemme'].eq('experience')]
    .drop_duplicates()
    .head(40)
)

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov,text_to_singular
1227,experience,experience,12090100147684300169,True,False,False,en,NN,NOUN,True,15.244206,True,experience
1404,experienced,experience,2982658188723080757,True,False,False,en,VBD,VERB,True,21.208199,True,experienced
1496,experience,experience,12090100147684300169,True,False,False,en,NN,NOUN,True,19.107916,True,experience
1707,experienced,experience,2982658188723080757,True,False,False,en,VBD,VERB,True,22.714855,True,experienced
2593,experience,experience,12090100147684300169,True,False,False,en,NN,NOUN,True,17.400497,True,experience
3458,experience,experience,12090100147684300169,True,False,False,en,NN,NOUN,True,19.395607,True,experience
5107,experienced,experience,2982658188723080757,True,False,False,en,VBD,VERB,True,21.271852,True,experienced
5661,experienced,experience,2982658188723080757,True,False,False,en,VBD,VERB,True,20.718897,True,experienced
6015,experienced,experience,2982658188723080757,True,False,False,en,VBN,VERB,True,21.645729,True,experienced
6165,experience,experience,12090100147684300169,True,False,False,en,NN,NOUN,True,20.246967,True,experience


In [233]:
# The 20 most frequent entries of word_freq as (word, count) pairs, most frequent first.
common_words = word_freq.most_common(20)
print(common_words)

[('experience', 22062), ('work', 14187), ('skill', 10888), ('year', 10575), ('team', 8287), ('ability', 7455), ('service', 6597), ('working', 6562), ('customer', 6353), ('knowledge', 6241), ('communication', 6230), ('business', 5733), ('sale', 5607), ('company', 5580), ('management', 5518), ('development', 5453), ('time', 5427), ('amp', 4939), ('required', 4887), ('degree', 4848)]


In [234]:
# Look up the string "experience" in the vocab's string store; the printed
# integer equals the `orth` value listed for the token 'experience' in pd_token.
print(doc.vocab.strings["experience"])

12090100147684300169


In [235]:
# Count how many token rows have each value of `has_vector`.
pd_token['has_vector'].value_counts()

True    1330127
Name: has_vector, dtype: int64