In [44]:
## Libraries
import json, os, re, datetime, csv
from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import stopwords

### We have two datasets:
#### 1) Linkedin jobs ads (dynamic)
#### 2) ECSF (static)

The first dataset, has to be dynamic in other words, we need to put it in a producer-topic-consumer-casssandra

The second dataset, has to be in cassandra

### To clean the linkedin job ads dataset:

#### 1) delete irrelevent columns
#### 2) remove duplicates
#### 3) remove null values
#### 4) preprocess Skills column
#### 5) preprocess Description column
#### 6) preprocess Title column

In [49]:
df = pd.read_json('jobs.json')
# number of rows / samples
row_count = df.shape[0]
print(f"Number of rows in the dataset: {row_count}")
df.head()

Number of rows in the dataset: 14769


Unnamed: 0,Title,Description,Primary Description,Detail URL,Location,Skill,Insight,Job State,Poster Id,Company Name,Company Logo,Created At,Scraped At
0,Inhouse Consultant Information Security Awaren...,"About Us\nAs a self-financed family company, w...","Deichmann · Essen, North Rhine-Westphalia, Ger...",https://www.linkedin.com/jobs/view/4300643770,"Essen, North Rhine-Westphalia, Germany","Skills: Information Security, Phishing, +8 more",,LISTED,714127132,Deichmann,https://media.licdn.com/dms/image/v2/C560BAQHX...,2025-09-18T11:14:09.000Z,2025-09-29T11:53:40.552Z
1,Senior Storage Engineer,Vattenfall IT’s Data Centre department manages...,"Vattenfall · Stockholm, Stockholm County, Swed...",https://www.linkedin.com/jobs/view/4279274865,"Stockholm, Stockholm County, Sweden","Skills: English, Data Centers, +8 more",,LISTED,6883526,Vattenfall,https://media.licdn.com/dms/image/v2/D4D0BAQGT...,2025-08-04T13:49:16.000Z,2025-09-29T12:06:52.350Z
2,DBA (M/F/D),SoSafe has the ambition to become the leading ...,"SoSafe · Madrid, Community of Madrid, Spain (O...",https://www.linkedin.com/jobs/view/4292576067,"Madrid, Community of Madrid, Spain","Skills: Amazon Web Services (AWS), Amazon Rela...",,LISTED,28825988,SoSafe,https://media.licdn.com/dms/image/v2/C4D0BAQGx...,2025-08-28T20:51:45.000Z,2025-09-29T12:07:51.615Z
3,Business system analyst,"At Exness, we are not just a leading trading b...","Exness · Limassol, Cyprus (Hybrid)",https://www.linkedin.com/jobs/view/4272079301,"Limassol, Cyprus","Skills: English, Programming Languages, +8 more",,LISTED,154611961,Exness,https://media.licdn.com/dms/image/v2/D4E0BAQEr...,2025-07-25T13:52:28.000Z,2025-09-29T12:29:25.092Z
4,Data Privacy and Protection Consultant (h/m/x),"In Experites, we seek privacy and data protect...","EXPERTIS SPAIN · Three songs, Community of Mad...",https://www.linkedin.com/jobs/view/4300652034,"Tres Cantos, Community of Madrid, Spain","Skills: Data Privacy, Data Protection Act, +1 ...",,LISTED,154570714,Experis España,https://media.licdn.com/dms/image/v2/D4D0BAQFq...,2025-09-18T08:31:10.000Z,2025-09-29T12:07:11.626Z


In [6]:
df.columns

Index(['Title', 'Description', 'Primary Description', 'Detail URL', 'Location',
       'Skill', 'Insight', 'Job State', 'Poster Id', 'Company Name',
       'Company Logo', 'Created At', 'Scraped At'],
      dtype='object')

#### Check for duplicates

In [50]:
# Duplicate check (first 5 columns)
subset_cols = df.columns[:5].tolist()
initial_rows = df.shape[0]

duplicate_count = df.duplicated(subset=subset_cols).sum()
print(f"number of detected duplicates: {duplicate_count}")


number of detected duplicates: 0


#### Drop unrelated columns (meaningless columns)

In [51]:
# Drop hiring manager-related columns, scrape info, non needed columns
columns_to_drop = [
    'Insight', 'Job State', 'Poster Id','Company Logo', 'Created At', 'Scraped At',
]
df.drop(columns=columns_to_drop, errors="ignore", inplace=True)

df.columns

Index(['Title', 'Description', 'Primary Description', 'Detail URL', 'Location',
       'Skill', 'Company Name'],
      dtype='object')

#### Cleaning Description column

In [52]:
print('-> The first row of the Description column before cleaning: \n',df["Description"][1][:500])
# 1) remove \n
df["Description"] = df["Description"].str.replace("\n", ". ", regex=True)


# 2) remove unrelated sentences
# List of unwanted phrases to remove
unwanted_phrases = [
    "job description", "job title", "role description", "about the job",
    "about the role", "about us", "about the opportunity", "requirements",
    "job requirements", "role requirements", "your role", "your job",
    "offer", "employment offer", "your profile", "responsibilities",
    "job responsibilities", "role responsibilities", "overview", "position overview", 
    "who are we?", "who we are", "who are we ?"
]
# Precompile regex patterns for efficiency
patterns = [re.compile(rf"^{phrase}[:\s]*", re.IGNORECASE) for phrase in unwanted_phrases]

# Cleaning function
def clean_description(text):
    if isinstance(text, str):
        for pattern in patterns:
            text = pattern.sub("", text).strip()  # Remove matched phrase and trim spaces
        return text
    return ""

# Apply function to the description column

df["Description"] = df["Description"].apply(clean_description)
print('---------------------------------------------------------')
print('-> The first row of the Description column after cleaning: \n',df["Description"][1][:500])

-> The first row of the Description column before cleaning: 
 Vattenfall IT’s Data Centre department manages on-premises workloads and Public Cloud services under a hybrid strategy. While the Public Cloud Platform Services team handles cloud-based workloads, the Data Centre team oversees country-based Data Centers and is transitioning from traditional hosting to Virtual Private Cloud (VPC) platforms to meet regulatory and business-specific needs.

Private Cloud Platform Services

A new department within the Data Centre organization manages Private Cloud Pl
---------------------------------------------------------
-> The first row of the Description column after cleaning: 
 Vattenfall IT’s Data Centre department manages on-premises workloads and Public Cloud services under a hybrid strategy. While the Public Cloud Platform Services team handles cloud-based workloads, the Data Centre team oversees country-based Data Centers and is transitioning from traditional hosting to Virtual Private

#### Clean Skills column

In [53]:
print('-> The first row of the Description column before cleaning: \n',df["Skill"][1][:400])

# Preprocess the Skill column
def clean_skills(skill_str):
    if isinstance(skill_str, str):
        # Remove "Skills: " if it appears at the beginning
        skill_str = re.sub(r"^Skills:\s*", "", skill_str).strip()
        # Remove "X of Y skills match your profile - you may be ..." pattern
        skill_str = re.sub(r"\d+\s+of\s+\d+\s+skills match your profile - you may be.*", "", skill_str, flags=re.IGNORECASE).strip()
        # Remove ", +X more" where X is any number
        skill_str = re.sub(r",\s\+\d+\s+more", "", skill_str).strip()
        return skill_str
    return ""

df["Skill"] = df["Skill"].apply(clean_skills)
df.head(5)

print('---------------------------------------------------------')
print('-> The first row of the Description column after cleaning: \n',df["Skill"][1][:400])


-> The first row of the Description column before cleaning: 
 Skills: English, Data Centers, +8 more
---------------------------------------------------------
-> The first row of the Description column after cleaning: 
 English, Data Centers


#### Clean Title

In [54]:
#1) download the stopwords
nltk.download("stopwords")

# 2) Load stopwords for each detected language
stop_words = set()
languages = ["english", "german", "dutch", "italian", "spanish", "french", "portuguese"]
# Add stopwords for each language to the set
for lang in languages:
    stop_words.update(set(stopwords.words(lang)))
# Add custom stopwords to the set
custom_stopwords = {"mwd", "mfd", "gender", "fh", "mf", "hf", "fmd", "wmd", "etc", "M/F"}
stop_words.update(custom_stopwords)

# 3) Function to clean stop words from text 
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"\d+", "", text)  # Remove numbers
        text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
        text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
        return text
    return ""

# Clean "Title" and "Description" columns
df["Title"] = df["Title"].apply(clean_text)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samiha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Title,Description,Primary Description,Detail URL,Location,Skill,Company Name
0,inhouse consultant information security awareness,". As a self-financed family company, we are mo...","Deichmann · Essen, North Rhine-Westphalia, Ger...",https://www.linkedin.com/jobs/view/4300643770,"Essen, North Rhine-Westphalia, Germany","Information Security, Phishing",Deichmann
1,senior storage engineer,Vattenfall IT’s Data Centre department manages...,"Vattenfall · Stockholm, Stockholm County, Swed...",https://www.linkedin.com/jobs/view/4279274865,"Stockholm, Stockholm County, Sweden","English, Data Centers",Vattenfall
2,dba,SoSafe has the ambition to become the leading ...,"SoSafe · Madrid, Community of Madrid, Spain (O...",https://www.linkedin.com/jobs/view/4292576067,"Madrid, Community of Madrid, Spain","Amazon Web Services (AWS), Amazon Relational D...",SoSafe
3,business system analyst,"At Exness, we are not just a leading trading b...","Exness · Limassol, Cyprus (Hybrid)",https://www.linkedin.com/jobs/view/4272079301,"Limassol, Cyprus","English, Programming Languages",Exness
4,data privacy protection consultant hmx,"In Experites, we seek privacy and data protect...","EXPERTIS SPAIN · Three songs, Community of Mad...",https://www.linkedin.com/jobs/view/4300652034,"Tres Cantos, Community of Madrid, Spain","Data Privacy, Data Protection Act",Experis España


#### Clean Location column and add Country column

In [56]:
# Replace specific metropolitan areas with their respective countries in the Location column
df["Location"] = df["Location"].replace({
    "Greater Madrid Metropolitan Area": "Spain",
    "Greater Paris Metropolitan Region": "France",
    "Greater Barcelona Metropolitan Area": "Spain"
})

# Create a new column 'Country' by extracting the text after the last comma
df["Country"] = df["Location"].apply(lambda x: x.split(",")[-1].strip() if "," in x else x.strip())

# Exclude rows where Location contains "EMEA" or "European Union" (as these are not countries)
df = df[~df["Location"].str.contains("EMEA|European Union", na=False)]

#### Save the df

In [57]:
df.to_json('cleaned_linkedin_jobs.json')