# Data Preparation

- Convert text to all lower case for normalization.
- Remove accented characters and any other non-ASCII characters.
- Remove special characters.
- Stem or lemmatize the words.
- Remove stopwords.
- Store the clean text and the original text for use in future notebooks.

In [15]:
import re
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from imports import *

In [2]:
df = pd.read_csv('data_science.csv')

In [3]:
df.drop(columns = 'link', inplace = True)

In [4]:
df.isnull().sum()

company         0
location        0
mode            0
type            0
level           0
role            0
requirements    0
edu_bachelor    0
edu_master      0
edu_phd         0
edu_other       0
skills          0
dtype: int64

## Convert text to all lower case

In [5]:
df.requirements = df.requirements.str.lower()

In [6]:
df.requirements

0      bachelor degree\nminimum years of experience: 4 year(s)\ndemonstrates thorough abiliti...
1      identify and execute on predictive models to help internal teams at masterworks unders...
2      self-motivated, highly disciplined, and passionate about discovering the right therape...
3      2 years + experience with python, java, or other object-oriented programming languages...
4      2+ years of work experience doing quantitative analysis to tackle business problems\ns...
                                                 ...                                            
195    3 to 5 years of experience in applied data science in a retail marketing or operations...
196    bs in engineering, computer science, data science or equivalent\n2+ years of experienc...
197    4+ years of proven experience in an analytics role\nability to communicate the results...
198    3-7 years professional experience in data analysis or practical experience building cu...
199    develop and implement s

## Removing accented characters

In [7]:
def basic_clean(string):
    '''
    This function takes in a string and returns the string normalized:
    accented characters are reduced to their ASCII base letters, other
    non-ASCII characters are dropped, every character that is not a-z, 0-9,
    '+', a single quote, or whitespace is removed, and runs of whitespace
    are collapsed to a single space with leading/trailing whitespace trimmed.

    The string is expected to be lower case already: upper-case letters are
    not matched by the character filter and are removed. Hyphens are removed
    without inserting a space, so 'self-motivated' becomes 'selfmotivated'.
    '''
    # NFKD splits accented characters into base letter + combining mark, so the
    # ASCII encode step can drop the marks and any other non-ASCII characters.
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # Removing anything that is not a-z, 0-9, '+', a single quote, or whitespace
    string = re.sub(r"[^a-z0-9+'\s]", '', string)
    # Collapsing whitespace last, so the gaps left behind by removed characters
    # (e.g. 'a & b') do not survive as double spaces.
    string = re.sub(r'\s+', ' ', string).strip()
    return string

In [8]:
df.requirements = df.requirements.apply(basic_clean)

In [9]:
df.requirements

0      bachelor degree minimum years of experience 4 years demonstrates thorough abilities an...
1      identify and execute on predictive models to help internal teams at masterworks unders...
2      selfmotivated highly disciplined and passionate about discovering the right therapeuti...
3      2 years + experience with python java or other objectoriented programming languages ha...
4      2+ years of work experience doing quantitative analysis to tackle business problems st...
                                                 ...                                            
195    3 to 5 years of experience in applied data science in a retail marketing or operations...
196    bs in engineering computer science data science or equivalent 2+ years of experience i...
197    4+ years of proven experience in an analytics role ability to communicate the results ...
198    37 years professional experience in data analysis or practical experience building cus...
199    develop and implement s

## Tokenization

In [10]:
def tokenize(string):
    '''
    Takes in a string, runs it through NLTK's ToktokTokenizer, and
    returns the tokenized text as a single string.
    '''
    # return_str=True asks the tokenizer for a string rather than a token list.
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(string, return_str=True)

In [None]:
df.requirements = df.requirements.apply(tokenize)

## Lemmatization

In [11]:
def lemmatize(string):
    '''
    Takes in a string and returns a string in which every
    whitespace-separated word has been lemmatized.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Lemmatize word by word, then stitch the results back together with spaces.
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [None]:
df.requirements = df.requirements.apply(lemmatize)

## Removing Stopwords

In [16]:
stopwords_list = stopwords.words('english')

In [None]:
stopwords_list

In [None]:
def remove_stopwords(string, extra_words = None, exclude_words = None):
    '''
    This function takes in a string and returns the string with English
    stopwords removed.

    extra_words: optional iterable of additional words to treat as stopwords.
    exclude_words: optional iterable of words to keep in the text even though
    they appear in the standard stopword list.

    Both default to None, which is treated as "no words". exclude_words is
    applied before extra_words, so a word given in both is removed from the text.
    '''
    # None sentinels instead of mutable list defaults shared between calls.
    extra_words = () if extra_words is None else extra_words
    exclude_words = () if exclude_words is None else exclude_words

    # Start from the standard English stopword list.
    stopword_set = set(stopwords.words('english'))

    # Remove 'exclude_words' from the stopwords to keep these in the text.
    stopword_set -= set(exclude_words)

    # Add in 'extra_words' to the stopwords.
    stopword_set |= set(extra_words)

    # Keep only the words that are not stopwords and join them back into a string.
    return ' '.join(word for word in string.split() if word not in stopword_set)

In [18]:
def remove_stopwords(string, extra_words = None, exclude_words = None):
    '''
    This function takes in a string and returns the string with English
    stopwords removed.

    extra_words: optional iterable of additional words to treat as stopwords.
    exclude_words: optional iterable of words to keep in the text even though
    they appear in the standard stopword list.

    Both default to None, which is treated as "no words". exclude_words is
    applied before extra_words, so a word given in both is removed from the text.
    '''
    # NOTE: this cell redefines the function from the previous cell; the later
    # definition is the one in effect once both cells have run.

    # None sentinels instead of mutable list defaults shared between calls.
    extra_words = () if extra_words is None else extra_words
    exclude_words = () if exclude_words is None else exclude_words

    # Start from the standard English stopword list.
    stopword_set = set(stopwords.words('english'))

    # Remove 'exclude_words' from the stopwords to keep these in the text.
    stopword_set -= set(exclude_words)

    # Add in 'extra_words' to the stopwords.
    stopword_set |= set(extra_words)

    # Keep only the words that are not stopwords and join them back into a string.
    return ' '.join(word for word in string.split() if word not in stopword_set)