# Uploading data

In [None]:
import glob
import io
import os
import re

import pandas as pd
from google.colab import files
from IPython.display import display

# Upload the CSV manually through the Colab file picker
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected the job descriptions CSV.")

# Colab renames an upload when a file with the same name already exists
# (e.g. "distinct_descriptions (1).csv"), so reading a hard-coded path can
# silently load a stale copy. Parse the bytes returned by this upload instead.
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

Saving distinct_descriptions.csv to distinct_descriptions (1).csv


## Lists with general terms

In [None]:
# Section headings and stock phrases that mark company/HR boilerplate.
# clean_job_description drops every line that contains one of these entries
# as a case-insensitive substring.
# NOTE(review): short generic entries such as "location", "careers",
# "qualifications" and "compensation" also match inside ordinary sentences,
# so whole content lines can be dropped — confirm this is intended.
boilerplate_terms = [
    "about us",
    "we are hiring",
    "company overview",
    "our mission",
    "job description",
    "key responsibilities",
    "qualifications",
    "skills required",
    "what we offer",
    "benefits and perks",
    "how to apply",
    "join our team",
    "careers",
    "why work with us",
    "our culture",
    "core values",
    "equal opportunity employer",
    "diversity and inclusion",
    "application process",
    "about the role",
    "desired experience",
    "employment type",
    "location",
    "compensation",
    "ready to apply",
    "job requirements",
    "minimum qualifications",
    "preferred qualifications",
    "team overview",
    "employee benefits",
    "work environment"
]

# Recruiter / call-to-action phrases. Combined with boilerplate_terms by
# clean_job_description, which drops every line containing any entry.
# NOTE(review): "equal opportunity employer" and "join our team" are also
# listed in boilerplate_terms (harmless, but redundant).
# NOTE(review): "we’re hiring" is spelled with a typographic apostrophe (’) —
# check that the matcher treats it the same as a straight one (').
recruiter_terms = [
    "apply now",
    "hiring process",
    "equal opportunity employer",
    "full-time position",
    "we’re hiring",
    "immediate openings",
    "competitive salary",
    "flexible schedule",
    "career growth",
    "great opportunity",
    "join our team",
    "talented individuals",
    "submit your resume",
    "send your application",
    "available positions",
    "interview process",
    "remote work options",
    "part-time opportunities",
    "start your career",
    "qualified candidates",
    "professional development",
    "career advancement",
    "benefits package",
    "seeking talent",
    "onboarding process",
    "apply today",
    "growth opportunities",
    "work with us",
    "job opportunities",
    "dedicated professionals",
    "motivated candidates",
    "career path",
    "employment opportunity",
    "team-oriented culture",
    "dynamic environment",
    "skills and qualifications",
    "immediate start"
]

# EDA for preprocessing

In [None]:
# Profile how line breaks (\n) are distributed across the raw descriptions
raw_descriptions = df['Job Description']

# Dataset size and how many rows contain at least one \n
total_rows = len(df)
rows_with_linebreaks = raw_descriptions.str.contains(r'\n').sum()

# Per-row \n count, stored on the frame so later cells can reuse it
df['Line Break Count'] = raw_descriptions.str.count(r'\n')
line_break_counts = df['Line Break Count']

# Summary statistics of the per-row counts
mode_linebreaks = line_break_counts.mode()[0]
average_linebreaks = line_break_counts.mean()
median_linebreaks = line_break_counts.median()
max_linebreaks = line_break_counts.max()
min_linebreaks = line_break_counts.min()

# Collect everything in one dict; the bare name on the last line renders it
linebreak_analysis = {
    "Total Rows": total_rows,
    "Rows with Line Breaks": rows_with_linebreaks,
    "Mode of Line Breaks Per Row": mode_linebreaks,
    "Average Line Breaks Per Row": average_linebreaks,
    "Median Line Breaks Per Row": median_linebreaks,
    "Max Line Breaks Per Row": max_linebreaks,
    "Min Line Breaks Per Row": min_linebreaks,
}

linebreak_analysis

{'Total Rows': 498,
 'Rows with Line Breaks': 497,
 'Mode of Line Breaks Per Row': 31,
 'Average Line Breaks Per Row': 46.238955823293175,
 'Median Line Breaks Per Row': 43.0,
 'Max Line Breaks Per Row': 178,
 'Min Line Breaks Per Row': 0}

# Preprocessing

## Removal of unnecessary parts of the job description

In [None]:
def clean_job_description(text, boilerplate_terms, recruiter_terms):
    """
    Drop every line of a job description that mentions a boilerplate or recruiter term.

    The description is split on "\n" and a line is discarded when any term occurs
    in it as a case-insensitive substring. Typographic single quotes are treated
    like straight apostrophes for matching only, so a term such as "we’re hiring"
    also matches "we're hiring". Lines that are kept are returned unmodified.

    Args:
        text: Raw job description. Non-string values (e.g. NaN/None for a missing
            description) are returned unchanged so a later dropna can remove them.
        boilerplate_terms: Iterable of boilerplate phrases to filter on.
        recruiter_terms: Iterable of recruiter phrases to filter on.

    Returns:
        The remaining lines joined with "\n", or `text` itself if it is not a string.
    """
    # Missing descriptions would otherwise crash on .split
    if not isinstance(text, str):
        return text

    def _fold(value):
        # Case- and quote-insensitive form used only for comparison
        return value.lower().replace("’", "'").replace("‘", "'")

    # Fold the terms once instead of once per line. Empty terms are skipped
    # because "" is a substring of every line and would delete everything.
    needles = [
        _fold(term)
        for term in list(boilerplate_terms) + list(recruiter_terms)
        if term
    ]

    kept_lines = []
    for line in text.split("\n"):
        folded_line = _fold(line)
        if not any(needle in folded_line for needle in needles):
            kept_lines.append(line)

    return "\n".join(kept_lines)

# Build the filtered column by running the line filter over every raw description
df['Cleaned Job Description'] = df['Job Description'].apply(
    clean_job_description,
    args=(boilerplate_terms, recruiter_terms),
)


In [None]:
# Show the frame, which now includes the 'Cleaned Job Description' column
df

Unnamed: 0,Search Term,Job Description,Line Break Count,Cleaned Job Description
0,Transformer Models,About Us:\nRivian and Volkswagen Group Technol...,42,Rivian and Volkswagen Group Technologies is a ...
1,Transformer Models,"Job Description\nAI Research Scientist, Founda...",68,"AI Research Scientist, Foundation Models\nPrim..."
2,Transformer Models,Valence has built the first-to-market AI nativ...,46,Valence has built the first-to-market AI nativ...
3,Transformer Models,Job Description\n\nIntel AI Lab is a research ...,44,\nIntel AI Lab is a research organization purs...
4,Transformer Models,Company Overview:\nAbout SS8 Networks: As a le...,53,Position: AI Applications Engineer\n\n\nLLM In...
...,...,...,...,...
493,Large Language Models,Why UT Southwestern Neurology Department?\n\nW...,73,Why UT Southwestern Neurology Department?\n\nW...
494,Large Language Models,#WeAreCrowdStrike and our mission is to stop b...,61,We’re seeking a Senior Information Architect w...
495,Large Language Models,"Data Scientist\nAt Applied Materials, we are b...",48,"Data Scientist\nAt Applied Materials, we are b..."
496,Large Language Models,Working Hours : Full Time (W2)\nLocations : Li...,30,Working Hours : Full Time (W2)\nExperience : 1...


In [None]:
# Remove missing or empty descriptions.
# The trailing .copy() makes the filtered result an independent frame, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['Cleaned Job Description'])
df = df[df['Cleaned Job Description'].str.strip() != ""].copy()

def clean_text(text):
    """
    Normalise a job description into lowercase ASCII text separated by single spaces.

    Steps, in order:
      1. HTML comments and tags are replaced by a space.
      2. URLs (http://, https://, www.) are replaced by a space.
      3. Apostrophes are dropped, so contractions stay one token ("don't" -> "dont").
      4. Any other symbol becomes a space, so compounds are split instead of
         fused ("first-to-market" -> "first to market", "AI/ML" -> "ai ml").
      5. Remaining characters outside a-z, A-Z, 0-9, whitespace and . , ! ? are deleted.
      6. The text is lower-cased and whitespace runs collapse to one space.

    Args:
        text: The description to clean. Non-string values yield "".

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # Only real markup is removed: "<" must be followed by a tag name (or "!--"),
    # so comparisons such as "x < 5 and y > 3" keep the text between the operators.
    text = re.sub(r'<!--.*?-->|</?[a-zA-Z][^<>]*>', ' ', text, flags=re.DOTALL)
    # Anchor on a word boundary and require "://" or "www." so ordinary words
    # that merely contain "http"/"www" are left alone.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text, flags=re.IGNORECASE)
    # Drop apostrophes (straight and typographic) without inserting a gap
    text = re.sub(r"['’]", '', text)
    # Turn every other symbol into a space so neighbouring words are not glued together.
    # NOTE(review): "+" and "#" are still lost, so "C++" and "C#" both become "c".
    text = re.sub(r'[^\w\s.,!?]', ' ', text)
    # Delete what is left outside the allowed set (non-ASCII letters, underscores)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Collapse whitespace runs and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Normalise the already-filtered text, overwriting 'Cleaned Job Description'
df['Cleaned Job Description'] = df['Cleaned Job Description'].map(clean_text)

In [None]:
# Drop descriptions too short to be useful: only rows whose cleaned text is
# longer than MIN_DESCRIPTION_CHARS are kept (a length of exactly 50 is removed).
MIN_DESCRIPTION_CHARS = 50

# .copy() detaches the filtered rows from the original frame, so the columns
# added in later cells do not trigger SettingWithCopyWarning.
df = df[df["Cleaned Job Description"].str.len() > MIN_DESCRIPTION_CHARS].copy()

df

Unnamed: 0,Search Term,Job Description,Line Break Count,Cleaned Job Description
0,Transformer Models,About Us:\nRivian and Volkswagen Group Technol...,42,rivian and volkswagen group technologies is a ...
1,Transformer Models,"Job Description\nAI Research Scientist, Founda...",68,"ai research scientist, foundation models prima..."
2,Transformer Models,Valence has built the first-to-market AI nativ...,46,valence has built the firsttomarket ai native ...
3,Transformer Models,Job Description\n\nIntel AI Lab is a research ...,44,intel ai lab is a research organization pursui...
4,Transformer Models,Company Overview:\nAbout SS8 Networks: As a le...,53,position ai applications engineer llm integrat...
...,...,...,...,...
493,Large Language Models,Why UT Southwestern Neurology Department?\n\nW...,73,why ut southwestern neurology department? we a...
494,Large Language Models,#WeAreCrowdStrike and our mission is to stop b...,61,were seeking a senior information architect wi...
495,Large Language Models,"Data Scientist\nAt Applied Materials, we are b...",48,"data scientist at applied materials, we are bu..."
496,Large Language Models,Working Hours : Full Time (W2)\nLocations : Li...,30,working hours full time w2 experience 1 years ...


In [None]:
from google.colab import files

# The frame already carries an 'Unnamed: 0' column, i.e. an index written by an
# earlier save. index=False stops this export from adding yet another unnamed
# index column. 'utf-8-sig' writes a BOM so Excel detects the encoding.
df.to_csv('cleaned_job_descriptions.csv', encoding='utf-8-sig', index=False)
files.download('cleaned_job_descriptions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Analysis of removed characters


In [None]:
# Measure how much text the cleaning steps stripped from each posting
df['Original Character Count'] = df['Job Description'].str.len()
df['Cleaned Character Count'] = df['Cleaned Job Description'].str.len()
df['Characters Removed'] = df['Original Character Count'] - df['Cleaned Character Count']

# Work from a single handle on the new column; the mode is computed once
chars_removed = df['Characters Removed']
removed_modes = chars_removed.mode()

# Summary statistics of the per-row removals
character_removal_stats = {
    "Mean Characters Removed": chars_removed.mean(),
    "Median Characters Removed": chars_removed.median(),
    "Mode Characters Removed": None if removed_modes.empty else removed_modes[0],
    "Max Characters Removed": chars_removed.max(),
    "Min Characters Removed": chars_removed.min(),
    "Total Characters Removed": chars_removed.sum(),
}

# Render the statistics as the cell output
character_removal_stats

{'Mean Characters Removed': 1362.6465863453816,
 'Median Characters Removed': 1387.0,
 'Mode Characters Removed': 1924,
 'Max Characters Removed': 4371,
 'Min Characters Removed': 11,
 'Total Characters Removed': 678598}