In [40]:
import pandas as pd 
import numpy as np
import spacy
from nltk.stem.snowball import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
nlp = spacy.load('en_core_web_sm')

In [2]:
data = pd.read_csv('data/fake_job_postings.csv')

In [17]:
categorical = data.select_dtypes(include = 'object')
row = categorical.apply(lambda row: ' '.join(row.values.astype(str)), axis=0)
row

title                  Marketing Intern Customer Service - Cloud Vide...
location               US, NY, New York NZ, , Auckland US, IA, Wever ...
department             Marketing Success nan Sales nan nan ANDROIDPIT...
salary_range           nan nan nan nan nan nan 20000-28000 nan nan na...
company_profile        We're Food52, and we've created a groundbreaki...
description            Food52, a fast-growing, James Beard Award-winn...
requirements           Experience with content management systems a m...
benefits               nan What you will get from usThrough being par...
employment_type        Other Full-time nan Full-time Full-time nan Fu...
required_experience    Internship Not Applicable nan Mid-Senior level...
required_education     nan nan nan Bachelor's Degree Bachelor's Degre...
industry               nan Marketing and Advertising nan Computer Sof...
function               Marketing Customer Service nan Sales Health Ca...
dtype: object

In [18]:
text = row.str.cat(sep = ' ')

In [21]:
prueba = row.loc['title']
prueba

'Marketing Intern Customer Service - Cloud Video Production Commissioning Machinery Assistant (CMA) Account Executive - Washington DC Bill Review Manager Accounting Clerk Head of Content (m/f) Lead Guest Service Specialist    HP BSM SME Customer Service Associate - Part Time  ASP.net Developer Job opportunity at United States,New Jersey Talent Sourcer (6 months fixed-term contract) Applications Developer, Digital  Installers Account Executive - Sydney VP of Sales - Vault Dragon Hands-On QA Leader  Southend-on-Sea Traineeships Under NAS 16-18 Year Olds Only Visual Designer Process Controls Engineer - DCS PLC MS Office - PA Marketing Assistant Front End Developer    Engagement Manager  Vice President, Sales and Sponsorship (Businessfriend.com) Customer Service  H1B SPONSOR FOR L1/L2/OPT Marketing Exec HAAD/DHA Licensed Doctors Opening in UAE Talent Management Process Manager Customer Service Associate  Customer Service Technical Specialist  Software Applications Specialist Craftsman Asso

In [35]:
prueba_nlp = nlp(prueba)


In [37]:
tokens = [token.orth_.lower() for token in prueba_nlp if not token.is_punct | token.is_space | token.is_stop]
tokens = [t.lower() for t in tokens if t.isalpha()]

In [38]:
lemmatizacion = nlp(' '.join(tokens))

tokens_lemma = [token.lemma_ for token in lemmatizacion]

In [45]:
tokens_lemma

['market',
 'intern',
 'customer',
 'service',
 'cloud',
 'video',
 'production',
 'commissioning',
 'machinery',
 'assistant',
 'cma',
 'account',
 'executive',
 'washington',
 'dc',
 'bill',
 'review',
 'manager',
 'accounting',
 'clerk',
 'head',
 'content',
 'm',
 'f',
 'lead',
 'guest',
 'service',
 'specialist',
 'hp',
 'bsm',
 'sme',
 'customer',
 'service',
 'associate',
 'time',
 'developer',
 'job',
 'opportunity',
 'united',
 'states',
 'new',
 'jersey',
 'talent',
 'sourcer',
 'month',
 'fix',
 'term',
 'contract',
 'application',
 'developer',
 'digital',
 'installer',
 'account',
 'executive',
 'sydney',
 'vp',
 'sale',
 'vault',
 'dragon',
 'hand',
 'qa',
 'leader',
 'southend',
 'sea',
 'traineeships',
 'nas',
 'year',
 'old',
 'visual',
 'designer',
 'process',
 'control',
 'engineer',
 'dcs',
 'plc',
 'ms',
 'office',
 'pa',
 'marketing',
 'assistant',
 'end',
 'developer',
 'engagement',
 'manager',
 'vice',
 'president',
 'sale',
 'sponsorship',
 'customer',
 'servi

In [41]:
stemmer = SnowballStemmer(language='english')
tokens_lemma_stemming = [stemmer.stem(token) for token in tokens_lemma]

In [43]:
tokens_lemma_stemming

['market',
 'intern',
 'custom',
 'servic',
 'cloud',
 'video',
 'product',
 'commiss',
 'machineri',
 'assist',
 'cma',
 'account',
 'execut',
 'washington',
 'dc',
 'bill',
 'review',
 'manag',
 'account',
 'clerk',
 'head',
 'content',
 'm',
 'f',
 'lead',
 'guest',
 'servic',
 'specialist',
 'hp',
 'bsm',
 'sme',
 'custom',
 'servic',
 'associ',
 'time',
 'develop',
 'job',
 'opportun',
 'unit',
 'state',
 'new',
 'jersey',
 'talent',
 'sourcer',
 'month',
 'fix',
 'term',
 'contract',
 'applic',
 'develop',
 'digit',
 'instal',
 'account',
 'execut',
 'sydney',
 'vp',
 'sale',
 'vault',
 'dragon',
 'hand',
 'qa',
 'leader',
 'southend',
 'sea',
 'traineeship',
 'nas',
 'year',
 'old',
 'visual',
 'design',
 'process',
 'control',
 'engin',
 'dcs',
 'plc',
 'ms',
 'offic',
 'pa',
 'market',
 'assist',
 'end',
 'develop',
 'engag',
 'manag',
 'vice',
 'presid',
 'sale',
 'sponsorship',
 'custom',
 'servic',
 'sponsor',
 'opt',
 'market',
 'exec',
 'haad',
 'dha',
 'licens',
 'doctor

In [44]:
dit = {}
for word in tokens_lemma_stemming:
    if word not in dit:
        dit[word] = 1
    else:
        dit[word] += 1
        
sorted(dit.items(), key=lambda x: x[1], reverse=True)
    

[('manag', 2473),
 ('develop', 2272),
 ('engin', 1804),
 ('sale', 1349),
 ('servic', 1038),
 ('senior', 989),
 ('custom', 979),
 ('market', 858),
 ('english', 786),
 ('teacher', 779),
 ('design', 745),
 ('assist', 707),
 ('account', 707),
 ('abroad', 692),
 ('associ', 669),
 ('softwar', 652),
 ('web', 560),
 ('specialist', 544),
 ('busi', 537),
 ('repres', 527),
 ('execut', 503),
 ('product', 496),
 ('analyst', 489),
 ('director', 488),
 ('time', 423),
 ('intern', 411),
 ('administr', 410),
 ('consult', 393),
 ('offic', 392),
 ('lead', 389),
 ('project', 362),
 ('support', 361),
 ('oper', 357),
 ('end', 305),
 ('year', 295),
 ('old', 284),
 ('graduat', 264),
 ('technic', 257),
 ('junior', 255),
 ('posit', 254),
 ('system', 248),
 ('nas', 244),
 ('convers', 239),
 ('technician', 239),
 ('entri', 237),
 ('apprenticeship', 236),
 ('digit', 235),
 ('sr', 230),
 ('java', 223),
 ('coordin', 221),
 ('home', 198),
 ('datum', 196),
 ('need', 195),
 ('ux', 189),
 ('data', 188),
 ('applic', 184),