# Steps
## To do : 
* Clean Text
    * Pasar plural a singular texto base
* Tokenize
* Build vocab
* Generate Vectors
    * Probar una con industria y otra sin industria (quedarse con la probab

<img src="./img/bow.png" width="900"> 

In [70]:
import pandas as pd 
import numpy as np
import re
import itertools
import spacy
import string
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline 
from spacy.lemmatizer import Lemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [32]:
nlp = spacy.load('en_core_web_sm') 
nlp.max_length = 15000000

In [33]:
df_data_raw = pd.read_csv('data/fake_job_postings.csv')

In [34]:
# Grupo 1: title, requirements, benefits
# Grupo 2: company_profile, description 
l_grupo1 = ['title', 'requirements', 'benefits']
l_grupo2 = ['company_profile', 'description']

df_categorical_g1 = df_data_raw[l_grupo1].fillna('')
df_categorical_g2 = df_data_raw[l_grupo2].fillna('')

df_categorical_g1.head()

Unnamed: 0,title,requirements,benefits
0,Marketing Intern,Experience with content management systems a m...,
1,Customer Service - Cloud Video Production,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...
2,Commissioning Machinery Assistant (CMA),Implement pre-commissioning and commissioning ...,
3,Account Executive - Washington DC,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...
4,Bill Review Manager,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered


# Group 1: title, requirements, benefits

In [35]:
str_bow_raw = ' '.join([' '.join(df_categorical_g1[x].astype(str).tolist()) for x in df_categorical_g1.columns])

In [36]:
def clean_text(str_text_raw):
#     translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
#     return(str_bow_raw.lower().translate(translator))
    return(str_bow_raw.lower())
str_bow = clean_text(str_bow_raw)

In [37]:
# str_bow = str_bow[0:1000000]
len(str_bow)

14850392

In [38]:
%%time
# nlp = English()
# nlp = en_core_web_sm.load()
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner']) 
nlp.max_length = 15000000
doc = nlp(str_bow)
#str_bow[0:1000000] Wall time: 43.7 s
# nlp.max_length = 15000000 Wall time: 4min 38s

Wall time: 4min 48s


## Tokenize words
* Text: The original text of the lexeme.
* Lemme: Lexeme.
* Orth: The hash value of the lexeme.
* is alpha: Does the lexeme consist of alphabetic characters?
* is digit: Does the lexeme consist of digits?
* is_title:
* lang:
* tag: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html spacy.explain("RB")
* pos:

In [39]:
%%time
l_token = [[token.text, token.lemma_, token.orth, token.is_alpha, token.is_digit, 
           token.is_title, token.lang_, token.tag_,token.pos_, token.has_vector, token.vector_norm, token.is_oov] 
           for token in doc if not token.is_punct | token.is_space | token.is_stop]
print(l_token[0:5])

[['marketing', 'market', 5624371593702924111, True, False, False, 'en', 'VBG', 'VERB', True, 21.006645, True], ['intern', 'intern', 6051314601965010107, True, False, False, 'en', 'NN', 'NOUN', True, 19.317013, True], ['customer', 'customer', 14759225161440374483, True, False, False, 'en', 'NN', 'NOUN', True, 19.821066, True], ['service', 'service', 208172016153456603, True, False, False, 'en', 'NN', 'NOUN', True, 19.358261, True], ['cloud', 'cloud', 13981346438767540862, True, False, False, 'en', 'NN', 'NOUN', True, 20.506468, True]]
Wall time: 41.2 s


In [40]:
pd_token = pd.DataFrame(l_token, columns=['text', 'lemme', 'orth', 'is_alpha', 'is_digit', 'is_title', 'language',
                                          'tag', 'part_of_speech', 'has_vector', 'vector_norm', 'is_oov'])
pd_token.head()

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov
0,marketing,market,5624371593702924111,True,False,False,en,VBG,VERB,True,21.006645,True
1,intern,intern,6051314601965010107,True,False,False,en,NN,NOUN,True,19.317013,True
2,customer,customer,14759225161440374483,True,False,False,en,NN,NOUN,True,19.821066,True
3,service,service,208172016153456603,True,False,False,en,NN,NOUN,True,19.358261,True
4,cloud,cloud,13981346438767540862,True,False,False,en,NN,NOUN,True,20.506468,True


In [41]:
# Revisar si el singular y el plural se está contemplando en lemma
pd_token[pd_token['tag'].isin(['NNPS', 'NNS'])].drop_duplicates().head(5)

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov
46,months,month,11199438407386692767,True,False,False,en,NNS,NOUN,True,20.87306,True
50,applications,application,7746610524179417841,True,False,False,en,NNS,NOUN,True,18.486517,True
53,installers,installer,15921962764723844213,True,False,False,en,NNS,NOUN,True,21.151781,True
58,sales,sale,14348989930891670846,True,False,False,en,NNS,NOUN,True,20.751493,True
61,hands,hand,1689680727489136653,True,False,False,en,NNS,NOUN,True,19.578394,True


In [42]:
#Convert plural text to singular)
pd_token['text_to_singular'] = np.where(pd_token['tag'].isin(['NNPS', 'NNS']), pd_token['lemme'], pd_token['text'])
pd_token[pd_token['tag'].isin(['NNPS', 'NNS'])].drop_duplicates().head(5)

Unnamed: 0,text,lemme,orth,is_alpha,is_digit,is_title,language,tag,part_of_speech,has_vector,vector_norm,is_oov,text_to_singular
46,months,month,11199438407386692767,True,False,False,en,NNS,NOUN,True,20.87306,True,month
50,applications,application,7746610524179417841,True,False,False,en,NNS,NOUN,True,18.486517,True,application
53,installers,installer,15921962764723844213,True,False,False,en,NNS,NOUN,True,21.151781,True,installer
58,sales,sale,14348989930891670846,True,False,False,en,NNS,NOUN,True,20.751493,True,sale
61,hands,hand,1689680727489136653,True,False,False,en,NNS,NOUN,True,19.578394,True,hand


# Bag of words with lemme

In [None]:
word_freq[word_freq]

In [64]:
%%time
words = list(pd_token['lemme']) #1330127
cv = CountVectorizer(words)
# show resulting vocabulary; the numbers are not counts, they are the position in the sparse vector.
count_vector=cv.fit_transform(words)
word_freq = Counter(words)
print(f'Tamaño de bag of words: {count_vector.shape}\nTamaño de palabras únicas: {len(word_freq)}')

Tamaño de bag of words: (1330127, 57319)
Tamaño de palabras únicas: 85060
Wall time: 10.2 s


In [65]:
common_words = word_freq.most_common(20)
print(common_words)

[('experience', 22383), ('work', 20253), ('skill', 11335), ('year', 10607), ('team', 8290), ('ability', 7455), ('service', 6629), ('customer', 6356), ('knowledge', 6241), ('communication', 6230), ('include', 5951), ('business', 5733), ('require', 5627), ('sale', 5610), ('company', 5580), ('management', 5520), ('development', 5453), ('time', 5442), ('amp', 4939), ('degree', 4848)]


# Bag of words with text and singular nouns

In [73]:
%%time
words = list(pd_token['text_to_singular'])
cv = CountVectorizer(words)
# show resulting vocabulary; the numbers are not counts, they are the position in the sparse vector.
count_vector=cv.fit_transform(words)
word_freq = Counter(words)
print(f'Tamaño de bag of words: {count_vector.shape}\nTamaño de palabras únicas: {len(word_freq)}')

Tamaño de bag of words: (1330127, 58487)
Tamaño de palabras únicas: 86681
Wall time: 10.2 s


In [76]:
#Imprime el vector
# print(cv.vocabulary_)
print(count_vector.shape)
print(type(count_vector))
print(count_vector[0:10].toarray())

(1330127, 58487)
<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 ... 0 0 0]]


In [67]:
common_words = word_freq.most_common(20)
print(common_words)

[('experience', 22383), ('work', 20253), ('skill', 11335), ('year', 10607), ('team', 8290), ('ability', 7455), ('service', 6629), ('customer', 6356), ('knowledge', 6241), ('communication', 6230), ('include', 5951), ('business', 5733), ('require', 5627), ('sale', 5610), ('company', 5580), ('management', 5520), ('development', 5453), ('time', 5442), ('amp', 4939), ('degree', 4848)]


# Word Frequencies

In [82]:
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(words)
# encode document
vector = vectorizer.transform(words)

In [81]:
# show vocab
# print(vectorizer.vocabulary_)
print(vectorizer.idf_)
#Imprime el vector
# print(cv.vocabulary_)
print(vector.shape)
print(type(vector))
print(count_vector[0:10].toarray())

[ 9.45533884  8.57869294 14.00217345 ... 14.40763856 14.40763856
 14.40763856]
(1330127, 58487)
<class 'scipy.sparse.csr.csr_matrix'>
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
