In [34]:
import spacy
import pandas as pd
import re
import os

# Directory that holds the input CSV and receives the output CSV.
# Override it with the JOB_ADS_DATA_DIR environment variable instead of editing
# the notebook; the default is the original author's location.
DATA_DIR = os.environ.get("JOB_ADS_DATA_DIR", "C:/Users/fib0/Desktop")

# Only change directory when the target exists, so the notebook still runs from
# the current working directory on machines that do not have this path.
if os.path.isdir(DATA_DIR):
    os.chdir(DATA_DIR)

# spaCy pipeline shared by every text-processing step below.
nlp = spacy.load('en_core_web_lg')

In [35]:
# Strip stop words from a piece of text
def stop_word_remover(text):
    """Return ``text`` with every token spaCy flags as a stop word removed.

    The input is coerced with ``str`` and tokenized by the module-level
    ``nlp`` pipeline. The surviving token texts are joined back together
    with single spaces.
    """
    kept_tokens = []
    for token in nlp(str(text)):
        if not token.is_stop:
            kept_tokens.append(token.text)
    return ' '.join(kept_tokens)

# Replace every token by its lemma
def lemmatizer(text):
    """Return ``text`` with each token replaced by its lemma.

    The input is coerced with ``str`` and processed by the module-level
    ``nlp`` pipeline. Tokens whose lemma is the ``'-PRON-'`` placeholder keep
    their original text. The results are joined with single spaces.
    """
    lemmas = []
    for token in nlp(str(text)):
        lemma = token.lemma_
        # '-PRON-' carries no lexical content, so fall back to the surface form.
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

In [36]:
# Load the raw job postings
data = pd.read_csv("job_descriptions_dataset.csv")

# Exclude the scam advertisements: keep only rows flagged fraudulent == 'f'.
# reset_index() returns a fresh frame (the old labels are kept in an "index"
# column, as before), so the column assignment below does not write into a
# filtered slice of the raw frame.
data = data[data.fraudulent == 'f'].reset_index()

# Concatenate the "description" and "requirements" columns into one text per
# advertisement. Note that `+` propagates missing values: a posting that lacks
# either field yields NaN here rather than a partial text.
job_advertisements = data["description"] + " " + data["requirements"]

# Expose the combined text as the "Description" column that the following
# cells read. It is added as a column (instead of replacing `data`) so the
# remaining columns of the dataset stay available.
data["Description"] = job_advertisements

In [22]:
# Normalise the advertisement text in a single pass:
#   1. lower-case everything,
#   2. replace double newlines with a space,
#   3. replace each remaining newline with '. ',
#   4. trim leading and trailing whitespace.
data["Description"] = (
    data["Description"]
    .str.lower()
    .str.replace("\n\n", ' ')
    .str.replace("\n", '. ')
    .str.strip()
)

In [5]:
# Create list of sentences (spaCy spans) from every advertisement
sentences_list = []
skipped_descriptions = 0

# Iterate over the values directly instead of looking rows up by label, so the
# loop does not depend on the frame having a contiguous 0..n-1 index.
for description in data["Description"]:
    # Missing texts (NaN) are not strings and cannot be parsed; skip them
    # explicitly instead of relying on an exception.
    if not isinstance(description, str):
        skipped_descriptions += 1
        continue

    try:
        doc = nlp(description)
    except ValueError:
        # Best effort: spaCy rejects some inputs with ValueError (for example
        # texts longer than nlp.max_length). Skip those, but let any other
        # error (and KeyboardInterrupt) surface.
        skipped_descriptions += 1
        continue

    # extend() appends in place; rebuilding the list with `+` on every
    # iteration would copy all previously collected sentences each time.
    sentences_list.extend(doc.sents)

if skipped_descriptions:
    print(f"Skipped {skipped_descriptions} advertisements that could not be parsed")

In [6]:
# Clean every sentence and drop the ones that end up empty.
# Per sentence: convert to str, replace each run of characters outside
# A-Z / a-z / 0-9 with one space, collapse repeated spaces, and trim
# leading and trailing whitespace.
sentences_list = [
    cleaned
    for cleaned in (
        re.sub(' +', ' ', re.sub('[^A-Za-z0-9]+', ' ', str(raw_sentence))).strip()
        for raw_sentence in sentences_list
    )
    if cleaned != ''
]

In [7]:
# One row per cleaned sentence, stored in a single "sentence" column
sentences = pd.DataFrame({"sentence": sentences_list})

In [8]:
# Lemmatize each sentence; the result is stored next to the original text
sentences["lemmatized"] = sentences["sentence"].map(lemmatizer)

In [9]:
# Remove stop words from the already lemmatized sentences
sentences["lemmatized_without_stopwords"] = sentences["lemmatized"].map(stop_word_remover)

In [10]:
# Persist the preprocessed sentences; the row index is not written to the file
sentences.to_csv(
    'sentences_preprocessed.csv',
    index=False,
)