# Initial Cleaning

In [1]:
from imports import *

In [2]:
df = pd.read_csv('data_science.csv')

In [3]:
df.drop(columns = 'link', inplace = True)

In [4]:
# Checking null values
df.isnull().sum()

company         0
location        0
mode            0
type            0
level           0
role            0
requirements    0
edu_bachelor    0
edu_master      0
edu_phd         0
edu_other       0
skills          0
dtype: int64

### Create Dummy

In [None]:
level_dummy = pd.get_dummies(df[['level']], dummy_na=False, drop_first=False)
df = pd.concat([df, level_dummy], axis=1)

### Rename Columns

In [None]:
df.rename(columns = {'level_Associate':'associate', 'level_Entry':'entry',
                              'level_Mid-Senior':'mid_senior'}, inplace = True)

## Summarize Initial Cleaning

In [None]:
def prep_data(df):
    '''
    This function takes in a dataframe and return the dataframe with meaningless column dropped,
    and dummy variables for categorical feature concatenated.
    '''
    df.drop(columns = 'link', inplace = True)
    level_dummy = pd.get_dummies(df[['level']], dummy_na=False, drop_first=False)
    df = pd.concat([df, level_dummy], axis=1)
    df.rename(columns = {'level_Associate':'associate', 'level_Entry':'entry',
                              'level_Mid-Senior':'mid_senior'}, inplace = True)
    return df

# Text Preparation

- Convert text to all lower case for normalcy.
- Remove any accented characters, non-ASCII characters.
- Remove special characters.
- Stem or lemmatize the words.
- Remove stopwords.
- Store the clean text and the original text for use in future notebooks.

### Convert text to all lower case

In [5]:
df.requirements = df.requirements.str.lower()

In [6]:
df.requirements

0      bachelor degree\nminimum years of experience: 4 year(s)\ndemonstrates thorough abiliti...
1      identify and execute on predictive models to help internal teams at masterworks unders...
2      self-motivated, highly disciplined, and passionate about discovering the right therape...
3      2 years + experience with python, java, or other object-oriented programming languages...
4      2+ years of work experience doing quantitative analysis to tackle business problems\ns...
                                                 ...                                            
195    3 to 5 years of experience in applied data science in a retail marketing or operations...
196    bs in engineering, computer science, data science or equivalent\n2+ years of experienc...
197    4+ years of proven experience in an analytics role\nability to communicate the results...
198    3-7 years professional experience in data analysis or practical experience building cu...
199    develop and implement s

### Removing accented characters

In [5]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # Removing white space
    string = re.sub(r'\s+', ' ',   string)
    # Removing anything that is not a-z, 0-9, a single quote, or whitespace
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    return string

In [8]:
df.requirements = df.requirements.apply(basic_clean)

In [7]:
df.requirements

0      achelor egree inimum ears f xperience 4 years emonstrates thorough abilities andor a p...
1      dentify and execute on predictive models to help internal teams at asterworks understa...
2      elfmotivated highly disciplined and passionate about discovering the right therapeutic...
3      2 years  experience with ython ava or other objectoriented programming languages andso...
4      2 years of work experience doing quantitative analysis to tackle business problems tro...
                                                 ...                                            
195    3 to 5 years of experience in applied data science in a retail marketing or operations...
196     in ngineering omputer cience ata cience or quivalent 2 years of experience in ngineer...
197    4 years of proven experience in an analytics role bility to communicate the results of...
198    37 years professional experience in data analysis or practical experience building cus...
199    evelop and implement sc

### Tokenization

In [6]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()

    # Use tokenizer
    string = tokenizer.tokenize(string, return_str = True)

    return string

In [11]:
df.requirements = df.requirements.apply(tokenize)

In [12]:
df.requirements

0      bachelor degree minimum years of experience 4 years demonstrates thorough abilities an...
1      identify and execute on predictive models to help internal teams at masterworks unders...
2      selfmotivated highly disciplined and passionate about discovering the right therapeuti...
3      2 years experience with python java or other objectoriented programming languages hand...
4      2 years of work experience doing quantitative analysis to tackle business problems str...
                                                 ...                                            
195    3 to 5 years of experience in applied data science in a retail marketing or operations...
196    bs in engineering computer science data science or equivalent 2 years of experience in...
197    4 years of proven experience in an analytics role ability to communicate the results o...
198    37 years professional experience in data analysis or practical experience building cus...
199    develop and implement s

### Stemming

In [7]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

### Lemmatization

In [9]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()

    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]

    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [31]:
df.requirements = df.requirements.apply(lemmatize)

and           19
data           7
of             7
a              4
for            4
analytical     4
eg             3
etc            3
using          3
nlp            3
dtype: int64
to            9
market        8
and           8
in            6
the           5
experience    4
data          4
a             4
artist        4
for           3
dtype: int64
and           5
a             3
or            3
experience    3
learning      3
machine       3
of            3
the           2
right         2
with          2
dtype: int64
experience        4
a                 4
and               3
with              2
etc               2
year              2
objectoriented    2
programming       2
algorithm         2
data              2
dtype: int64
experience    4
to            4
with          3
and           3
a             2
or            2
of            2
analysis      2
business      2
in            2
dtype: int64
and           13
in             9
data           9
of             7
or             6


dtype: int64
and         11
in           7
data         6
to           6
skill        6
the          6
with         5
ability      4
analysis     4
of           3
dtype: int64
and            8
experience     4
application    3
a              3
security       3
in             3
with           3
software       2
to             2
year           2
dtype: int64
a             5
and           5
of            4
data          4
experience    4
with          3
in            3
product       2
role          2
business      2
dtype: int64
learning      6
with          5
model         5
experience    5
in            5
machine       4
and           4
or            4
to            4
science       4
dtype: int64
and           11
experience     9
with           8
of             8
to             7
strong         4
familiar       4
data           4
solution       4
in             3
dtype: int64
and              10
with              5
experience        4
familiarity       3
optimization      3
eg          

In [32]:
df.requirements

0      bachelor degree minimum year of experience 4 year demonstrates thorough ability andor ...
1      identify and execute on predictive model to help internal team at masterworks understa...
2      selfmotivated highly disciplined and passionate about discovering the right therapeuti...
3      2 year experience with python java or other objectoriented programming language handso...
4      2 year of work experience doing quantitative analysis to tackle business problem stron...
                                                 ...                                            
195    3 to 5 year of experience in applied data science in a retail marketing or operation e...
196    b in engineering computer science data science or equivalent 2 year of experience in e...
197    4 year of proven experience in an analytics role ability to communicate the result of ...
198    37 year professional experience in data analysis or practical experience building cust...
199    develop and implement s

### Removing Stopwords

In [33]:
stopwords_list = stopwords.words('english')

In [34]:
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))

    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [36]:
df.requirements = df.requirements.apply(remove_stopwords)

In [37]:
df.requirements

0      bachelor degree minimum year experience 4 year demonstrates thorough ability andor pro...
1      identify execute predictive model help internal team masterworks understand artist mar...
2      selfmotivated highly disciplined passionate discovering right therapeutic right patien...
3      2 year experience python java objectoriented programming language handson experience u...
4      2 year work experience quantitative analysis tackle business problem strong analytical...
                                                 ...                                            
195    3 5 year experience applied data science retail marketing operation environment 5 year...
196    b engineering computer science data science equivalent 2 year experience engineering d...
197    4 year proven experience analytics role ability communicate result analysis clearly te...
198    37 year professional experience data analysis practical experience building customer f...
199    develop implement scala

### Text Cleaning Summarize

In [11]:
def prep_text(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df[column] = df[column].str.lower()
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['clean', 'stemmed', 'lemmatized']]

In [12]:
df[['clean', 'stemmed', 'lemmatized']] = prep_data(df, 'requirements')

In [13]:
df

Unnamed: 0,company,location,mode,type,level,role,requirements,edu_bachelor,edu_master,edu_phd,edu_other,skills,clean,stemmed,lemmatized
0,PwC,"Austin, TX",Remote,Full time,Entry,Data Science & Machine Learning SA w/ Conversational AI,bachelor degree\nminimum years of experience: 4 year(s)\ndemonstrates thorough abiliti...,22,44,11,23,Python (Programming Language)\nMachine Learning\nSQL\nData Science\nDeep Learning\nDat...,bachelor degree minimum years experience 4 years demonstrates thorough abilities andor...,bachelor degre minimum year experi 4 year demonstr thorough abil andor proven record s...,bachelor degree minimum year experience 4 year demonstrates thorough ability andor pro...
1,Mastersworks,New York City Metropolitan Area,On-site,Full time,Mid-Senior,"VP, Data Science",identify and execute on predictive models to help internal teams at masterworks unders...,23,40,28,9,SQL\nArtificial Intelligence (AI)\nJavaScript\nEntrepreneurship\nAgile Methodologies\n...,identify execute predictive models help internal teams masterworks understand artists ...,identifi execut predict model help intern team masterwork understand artist market seg...,identify execute predictive model help internal team masterworks understand artist mar...
2,ReviveMed,"Cambridge, MA",Hybrid,Full time,Mid-Senior,Biological Data Science,"self-motivated, highly disciplined, and passionate about discovering the right therape...",10,55,31,4,SQL\nJavaScript\nMachine Learning\nLeadership\nPython (Programming Language)\nProgramm...,selfmotivated highly disciplined passionate discovering right therapeutics right patie...,selfmotiv highli disciplin passion discov right therapeut right patient phd ms comput ...,selfmotivated highly disciplined passionate discovering right therapeutic right patien...
3,IBM,"Baton Rouge, LA",On-site,Full time,Entry,Data Engineer,"2 years + experience with python, java, or other object-oriented programming languages...",0,100,0,0,SQL\nGitHub\nLeadership\nOracle Database\nGoogle Cloud Platform (GCP)\nApache Phoenix\...,2 years experience python java objectoriented programming languages handson experience...,2 year experi python java objectori program languag handson experi understand objector...,2 year experience python java objectoriented programming language handson experience u...
4,Expedia Group,"Austin, TX",Hybrid,Full time,Entry,"Data Scientist II, Product Analytics",2+ years of work experience doing quantitative analysis to tackle business problems\ns...,13,72,7,8,SQL\nJavaScript\nLeadership\nMicrosoft SQL Server\nCascading Style Sheets (CSS)\nC#\nO...,2 years work experience quantitative analysis tackle business problems strong analytic...,2 year work experi quantit analysi tackl busi problem strong analyt skill includ abil ...,2 year work experience quantitative analysis tackle business problem strong analytical...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,HD Supply,"Atlanta, GA",On-site,Full time,Entry,Data Scientist,3 to 5 years of experience in applied data science in a retail marketing or operations...,38,44,9,9,SQL\nPython (Programming Language)\nMachine Learning\nTableau\nData Analysis\nMATLAB\n...,3 5 years experience applied data science retail marketing operations environment 5 ye...,3 5 year experi appli data scienc retail market oper environ 5 year quantit analyt exp...,3 5 year experience applied data science retail marketing operation environment 5 year...
196,Optum,"San Antonio, TX",On-site,Full time,Entry,Data Engineer,"bs in engineering, computer science, data science or equivalent\n2+ years of experienc...",50,50,0,0,SQL\nGit\nMicrosoft SQL Server\nLinux System Administration\nData Administration\nData...,bs engineering computer science data science equivalent 2 years experience engineering...,bs engin comput scienc data scienc equival 2 year experi engin data scienc travel 50 t...,b engineering computer science data science equivalent 2 year experience engineering d...
197,Grindr,"Chicago, IL",Remote,Full time,Entry,Data Scientist,4+ years of proven experience in an analytics role\nability to communicate the results...,23,62,9,6,Python (Programming Language)\nSQL\nMachine Learning\nMicrosoft PowerPoint\nLeadership...,4 years proven experience analytics role ability communicate results analyses clearly ...,4 year proven experi analyt role abil commun result analys clearli technic nontechn st...,4 year proven experience analytics role ability communicate result analysis clearly te...
198,1010data,United States,Remote,Full time,Entry,Data Scientist,3-7 years professional experience in data analysis or practical experience building cu...,33,67,0,0,Python (Programming Language)\nMicrosoft Excel\nSQL\nMicrosoft PowerPoint\nMachine Lea...,37 years professional experience data analysis practical experience building customer ...,37 year profession experi data analysi practic experi build custom face analyt product...,37 year professional experience data analysis practical experience building customer f...
