In [20]:
import json
import pandas as pd
import numpy as np


from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.util import ngrams

import re
import string

stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

In [43]:
from collections import Counter

In [3]:
Path = "Data/Jobs/"

In [31]:
def read_file(path):
    with open(path, encoding='utf8') as file:
        data = json.load(file)
    file.close()
    return data

In [21]:
def clean_string(text : str) -> str:

    text = text.lower().strip()
    spec_chars =  "([" + string.punctuation + "\° ].*?)"
    text = re.sub(spec_chars, ' ', text)
    
    word_tokens = word_tokenize(text) 
    
    return " ".join([porter.stem(w) for w in word_tokens if not w in stop_words])

In [32]:
data_frame = pd.DataFrame(columns=['title', 'company', 'description'])

In [33]:
for i in range(1,201):
    data = read_file(Path+"data scientist_"+str(i)+".json")
    data_frame = data_frame.append(data,ignore_index=True)

In [37]:
data_frame['title'] = data_frame['title'].apply(lambda title:clean_string(title))
data_frame['description'] = data_frame['description'].apply(lambda description:clean_string(description))

In [38]:
data_frame

Unnamed: 0,title,company,description
0,junior data scientist apprenticeship,IBM,inform age data valuabl resourc describ power ...
1,associ data scientist snkr,Nike,becom part nike inc team nike inc outfit world...
2,data scientist,Deloitte,deloitt servic lp includ intern support area s...
3,data scientist analyt univ grad,Facebook,facebook mission give peopl power build commun...
4,data scientist experi,Codecademy,hello world codecademi help 45 million peopl a...
5,data scientist,Butterfly Network,job descript look highli motiv experienc data ...
6,data scientist,AETNA,descript look opportun use cut edg technolog a...
7,data scientist,Source Enterprises,sourc system invit brightest creativ passion m...
8,data scientist analyt infer,Codecademy,hello world codecademi help 45 million peopl a...
9,data scientist basketb integr,the NBA,nba ’ passion grow celebr game basketbal inten...


# Title dictionnary 

In [61]:
title_dictionnary = list(data_frame.title)
descr_dictionnary = list(data_frame.description)

In [45]:
len(title_dictionnary)

200

In [46]:
counter_titles = Counter(title_dictionnary)

In [48]:
counter_titles.most_common(5)

[('data scientist', 55),
 ('senior data scientist', 11),
 ('data scientist experi', 2),
 ('associ data scientist', 2),
 ('data scientist ml engin', 2)]

In [59]:
counter_vocab_title = Counter((" ".join(title_dictionnary)).split(" "))

In [60]:
counter_vocab_title.most_common(10)

[('data', 201),
 ('scientist', 176),
 ('senior', 29),
 ('analyt', 23),
 ('scienc', 18),
 ('–', 12),
 ('associ', 11),
 ('machin', 11),
 ('learn', 11),
 ('engin', 10)]

In [52]:
tokens = [token for token in (" ".join(title_dictionnary)).split(" ") if token!=""]

In [57]:
bi_grams_title = Counter(list(ngrams(tokens, 2)))

In [58]:
bi_grams_title.most_common(10)

[(('data', 'scientist'), 173),
 (('scientist', 'data'), 61),
 (('senior', 'data'), 27),
 (('data', 'scienc'), 15),
 (('scientist', 'senior'), 14),
 (('machin', 'learn'), 11),
 (('analyt', 'data'), 9),
 (('scientist', '–'), 9),
 (('scientist', 'analyt'), 7),
 (('analyst', 'data'), 7)]

In [62]:
counter_vocab_desc = Counter((" ".join(descr_dictionnary)).split(" "))

In [63]:
counter_vocab_desc.most_common(10)

[('data', 2268),
 ('experi', 996),
 ('work', 748),
 ('team', 711),
 ('model', 637),
 ('busi', 625),
 ('’', 579),
 ('learn', 578),
 ('scienc', 563),
 ('analyt', 555)]

In [64]:
tokens = [token for token in (" ".join(descr_dictionnary)).split(" ") if token!=""]

In [65]:
bi_grams_descr = Counter(list(ngrams(tokens, 2)))

In [66]:
bi_grams_descr.most_common(10)

[(('machin', 'learn'), 371),
 (('data', 'scienc'), 363),
 (('data', 'scientist'), 356),
 (('data', 'set'), 145),
 (('e', 'g'), 140),
 (('new', 'york'), 136),
 (('comput', 'scienc'), 119),
 (('data', 'analysi'), 80),
 (('experi', 'work'), 78),
 (('experi', 'data'), 78)]

In [67]:
tri_grams_descr = Counter(list(ngrams(tokens, 3)))

In [68]:
tri_grams_descr.most_common()

[(('larg', 'data', 'set'), 67),
 (('data', 'scientist', 'respons'), 45),
 (('machin', 'learn', 'model'), 44),
 (('equal', 'opportun', 'employ'), 42),
 (('machin', 'learn', 'algorithm'), 38),
 (('without', 'regard', 'race'), 34),
 (('race', 'color', 'religion'), 33),
 (('machin', 'learn', 'techniqu'), 33),
 (('data', 'scientist', 'join'), 32),
 (('e', 'shaw', 'group'), 32),
 (('sexual', 'orient', 'gender'), 29),
 (('natur', 'languag', 'process'), 29),
 (('bachelor', '’', 'degre'), 29),
 (('statist', 'machin', 'learn'), 28),
 (('regard', 'race', 'color'), 28),
 (('orient', 'gender', 'ident'), 28),
 (('data', 'scientist', 'work'), 28),
 (('senior', 'data', 'scientist'), 28),
 (('data', 'scienc', 'team'), 26),
 (('new', 'york', 'citi'), 26),
 (('year', 'experi', 'data'), 25),
 (('look', 'data', 'scientist'), 24),
 (('data', 'scienc', 'analyt'), 24),
 (('experi', 'data', 'scienc'), 24),
 (('staff', 'data', 'scientist'), 24),
 (('affirm', 'action', 'employ'), 23),
 (('scienc', 'machin', 'lea