In [None]:
pip install bs4


In [None]:
pip install nltk


In [33]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

### Step-by-Step Methodology for Cleaning the Job Description

1. **Remove HTML Tags (if present)**:
   - Job descriptions scraped from LinkedIn may contain HTML tags. We use `BeautifulSoup` to remove these tags while preserving the text content for cleaner analysis.

2. **Remove Unnecessary Line Breaks and Whitespace**:
   - Multiple line breaks or extra spaces can clutter the description. We use regular expressions to replace them with single spaces for a cleaner and more readable description.

3. **Lowercase Transformation**:
   - Converting all text to lowercase ensures uniformity, especially important when matching skills, keywords, or performing text analysis.

4. **Remove Special Characters and Punctuation**:
   - We remove unnecessary special characters (e.g., #, $, @) while retaining meaningful punctuation like colons and commas to maintain sentence structure without adding noise.

5. **Remove Stopwords**:
   - Stopwords like "the," "and," or "in" do not provide significant meaning, so we remove them to focus on the key information in the text.

6. **Lemmatization**:
   - We apply lemmatization to reduce words to their root form (e.g., "running" to "run") for better consistency when analyzing or matching words in the description.


### `clean_description(description)` Function

This function performs text preprocessing on job descriptions to clean and standardize the content. The steps in the cleaning process are outlined below:

1. **HTML Tag Removal**:
   - Uses `BeautifulSoup` to remove any HTML tags present in the description, leaving only the text.

2. **Whitespace Cleanup**:
   - Removes unnecessary line breaks, multiple spaces, and trims any leading or trailing whitespace using regular expressions.

3. **Lowercase Transformation**:
   - Converts all text to lowercase to ensure consistency in further text processing.

4. **Special Character Removal**:
   - Removes special characters and punctuation, but retains essential punctuation like colons, commas, and periods to preserve the structure of the text.

5. **Stopword Removal**:
   - Filters out common stopwords (like "the", "is", "and") using the NLTK stopwords list, reducing noise in the description.

6. **Lemmatization**:
   - Applies lemmatization using `WordNetLemmatizer` to reduce words to their base form, ensuring that different forms of a word (e.g., "running", "ran") are treated as the same word.

This process cleans, simplifies, and standardizes the job descriptions, making them more suitable for text analysis or machine learning tasks.


In [34]:
def clean_description(description):
    
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    # 1. Remove HTML Tags
    soup = BeautifulSoup(description, "html.parser")
    cleaned_description = soup.get_text()

    # 2. Remove Unnecessary Line Breaks and Whitespace
    cleaned_description = re.sub(r'\s+', ' ', cleaned_description).strip()

    # 3. Lowercase Transformation
    cleaned_description = cleaned_description.lower()


    # 4. Remove Special Characters and Punctuation (retain colons, commas, and periods)
    cleaned_description = re.sub(r'[^\w\s.,:]', '', cleaned_description)

    # 5. Remove Stopwords
    cleaned_description = ' '.join([word for word in cleaned_description.split() if word not in stop_words])

    # 6. Lemmatization
    cleaned_description = ' '.join([lemmatizer.lemmatize(word) for word in cleaned_description.split()])

    return cleaned_description

In [35]:
data  = pd.read_csv(r"linkedin_jobs_filtered.csv")

In [36]:
data.head()

Unnamed: 0,url,title,location,description,city,state,place_of_work,country
0,https://www.linkedin.com/jobs/view/4030963908/...,Director Search Engine Optimization,"Wayne, PA (Hybrid)",About the job\nSEO Director - Technical Focus\...,Wayne,PA,Hybrid,USA
1,https://www.linkedin.com/jobs/view/4034610554/...,SAP Hana Project Manager,"Naperville, IL (On-site)",About the job\nThe Business Systems Project Ma...,Naperville,IL,On-site,USA
2,https://www.linkedin.com/jobs/view/4030614251/...,Tech-Savvy Financial Planner,"Las Vegas, NV (Hybrid)",About the job\nJob Title: Tech-Savvy Financial...,Las Vegas,NV,Hybrid,USA
3,https://www.linkedin.com/jobs/view/4026434663/...,Class C Licensed Groundwater Plant Operator,"Houston, TX (On-site)",About the job\nWe are seeking an experienced C...,Houston,TX,On-site,USA
4,https://www.linkedin.com/jobs/view/4022898839/...,ILI Specialist,"Houston, TX (Hybrid)",About the job\nCompany Description\nD2 Integri...,Houston,TX,Hybrid,USA


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            1000 non-null   object
 1   title          1000 non-null   object
 2   location       1000 non-null   object
 3   description    1000 non-null   object
 4   city           1000 non-null   object
 5   state          1000 non-null   object
 6   place_of_work  946 non-null    object
 7   country        1000 non-null   object
dtypes: object(8)
memory usage: 62.6+ KB


In [38]:
data['description'].head()

0    About the job\nSEO Director - Technical Focus\...
1    About the job\nThe Business Systems Project Ma...
2    About the job\nJob Title: Tech-Savvy Financial...
3    About the job\nWe are seeking an experienced C...
4    About the job\nCompany Description\nD2 Integri...
Name: description, dtype: object

### Purpose of Sentence Splitting

This function splits a job description into individual sentences, preparing the text for further analysis. The goal is to manually tag each sentence as either "Qualified" or "Description" based on its relevance. This tagging process will help build a labeled dataset that can be used to train machine learning models for automated classification of job description sentences.

---

### `split_into_sentences(text)` Function

1. **Load spaCy's English Model**:
   - We use spaCy's `en_core_web_sm` model to handle sentence tokenization. This model is pre-trained to understand English syntax and structure.

2. **Split Text by New Lines**:
   - The text is first split by new line characters (`\n`) to break down paragraphs or sections.

3. **Sentence Segmentation**:
   - Each paragraph is processed using spaCy's sentence tokenizer. This extracts individual sentences from the text, allowing for precise sentence splitting based on language rules.

4. **Remove Leading Hyphens**:
   - Some sentences may start with hyphens (e.g., bullet points). We strip leading hyphens to clean up the sentences for easier tagging and processing.

5. **Return Cleaned Sentences**:
   - The function returns a list of cleaned, split sentences, ready for manual tagging or further processing.

This approach ensures that the job description is split into clear, usable sentences, facilitating the next step of manual tagging for model training.


In [39]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")


In [40]:

# Function to split text into sentences, handle new lines, and remove leading hyphens
def split_into_sentences(text):
    # Split the text by new lines first
    paragraphs = text.split('\n')
    sentences = []
    
    for paragraph in paragraphs:
        # Process each paragraph with spaCy
        doc = nlp(paragraph)
        # Extract sentences and add to the list, removing leading hyphens
        for sent in doc.sents:
            cleaned_sentence = sent.text.strip()
            if cleaned_sentence.startswith('-'):
                cleaned_sentence = cleaned_sentence[1:].strip()  # Remove leading hyphen
            sentences.append(cleaned_sentence)
    
    return sentences

In [None]:

# Apply to dataset
data['sentences'] = data['description'].apply(split_into_sentences)


In [17]:
# Create a new DataFrame to hold the exploded sentences
sentences_df = pd.DataFrame(data['sentences'].explode()).reset_index(drop=True)

In [18]:
sentences_df

Unnamed: 0,sentences
0,About the job
1,SEO Director - Technical Focus
2,We're seeking an experienced SEO Director to l...
3,"In this role, you'll drive organic growth for ..."
4,Responsibilities:
...,...
40002,Program Management:
40003,Proven experience in managing multiple Data an...
40004,Agile Methodology: Strong experience with Agil...
40005,Industry Experience: Experience in the life sc...


In [None]:
sentences_df['cleaned_sentence'] = sentences_df['sentences'].apply(clean_description)

- Cleaning the sentences

In [20]:
sentences_df

Unnamed: 0,sentences,cleaned_sentence
0,About the job,job
1,SEO Director - Technical Focus,seo director technical focus
2,We're seeking an experienced SEO Director to l...,seeking experienced seo director lead agency s...
3,"In this role, you'll drive organic growth for ...","role, youll drive organic growth client innova..."
4,Responsibilities:,responsibilities:
...,...,...
40002,Program Management:,program management:
40003,Proven experience in managing multiple Data an...,proven experience managing multiple data bi pr...
40004,Agile Methodology: Strong experience with Agil...,agile methodology: strong experience agile met...
40005,Industry Experience: Experience in the life sc...,industry experience: experience life science p...


In [21]:
sentences_df.to_csv('processed_data.csv', index=False, encoding='utf-8')