### DATA CLEANING

In [4556]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings(action = 'ignore')
from rapidfuzz import process, fuzz
from deep_translator import GoogleTranslator
import json
import re
from fuzzywuzzy import process
from deep_translator import GoogleTranslator

import nltk
nltk.data.path.append("/Users/elena/python 3.11/lib/nltk_data")
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import FreqDist


In [4557]:
# removing the limit on the number of columns
pd.set_option('display.max_columns', 50)

# removing the limit on the width of columns
pd.set_option('display.max_colwidth', 120)

# ignoring the warnings
pd.set_option('chained_assignment', 'warn')
warnings.simplefilter(action='ignore', category=FutureWarning)

# setting a limit on the display of decimal places
pd.options.display.float_format = '{:,.2f}'.format


### 0. Import Germany companies database (translated and classified)

In [4558]:
# companies_database = pd.read_excel("data/classified_companies.xlsx")
# companies_database.head(10)

In [4559]:
# companies_database.to_excel('companies_database.xlsx', index=False)

### 1.  Indeed

In [4560]:
# import files
indeed_765 = pd.read_csv("data/indeed_765.csv")
indeed_400 = pd.read_csv("data/indeed_400.csv")


In [4561]:
# Leave only the useful columns

indeed_765_cl = indeed_765[['company', 'extractedSalary/max',
                    'extractedSalary/min','extractedSalary/type', 'formattedLocation',
                     'title']]
indeed_400_cl = indeed_400[['Position Name','Salary', 'Job Type', 'Company','Location']]


In [4562]:
# Rename the columns
new_names1 = ['company', 'salary_max', 'salary_min', 'salary_type', 'location', 'title']
new_names2 = ['title', 'salary', 'job_type','company', 'location']
indeed_765_cl.columns = new_names1
indeed_400_cl.columns = new_names2

In [4563]:
# JOIN  two dataframes

indeed = pd.merge(indeed_765_cl, indeed_400_cl, on=['title','company', 'location'], how='outer')
indeed['source'] = 'Indeed'

#### Salary column

In [4564]:
# Work with Salary column

# Check, where 'salary' not empty and  'salary_min' and 'salary_max' empty
mask = indeed['salary'].notnull() & indeed['salary_min'].isnull() & indeed['salary_max'].isnull()

# Extract the min, max salary
indeed['min'] = indeed['salary'].str.extract(r'(\d+\.\d+) €')
indeed['max'] = indeed['salary'].str.extract(r'[–\-]\s*(\d+\.\d+) €')
indeed['type'] = indeed['salary'].str.extract(r'[–\-]\s*\d+\.\d+ € (.*)')

# replace the dot
indeed['max'] = indeed['max'].str.replace('.', '', regex=False).astype(float)
indeed['min'] = indeed['min'].str.replace('.', '', regex=False).astype(float)
indeed['type'] = indeed['type'].str.replace('pro Monat', 'monthly', regex=False)
indeed['type'] = indeed['type'].str.replace('pro Jahr', 'yearly', regex=False)

indeed[['salary_min', 'salary_max','salary_type']] = indeed[['salary_min', 'salary_max','salary_type']].fillna(0)

indeed['salary_min'] = indeed.apply(lambda row: row['min'] if row['salary_min'] == 0 else row['salary_min'], axis=1)
indeed['salary_max'] = indeed.apply(lambda row: row['max'] if row['salary_max'] == 0 else row['salary_max'], axis=1)
indeed['salary_type'] = indeed.apply(lambda row: row['type'] if row['salary_type'] == 0 else row['salary_type'], axis=1)

# Delete extra columns
indeed = indeed.drop(['salary', 'min', 'max', 'type'], axis=1)

# indeed.head()

In [4565]:
pc = indeed.loc[indeed['salary_min'].notna()]
pc.head()

Unnamed: 0,company,salary_max,salary_min,salary_type,location,title,job_type,source
14,PsyCommerce,,2000.0,,Homeoffice,(Junior) Digital Data Analyst (m/w/d) (Remote),,Indeed
20,21X,55000.0,45000.0,yearly,88131 Lindau,(Junior) Trade Operations & Trade Reporting Analyst (all genders),Vollzeit,Indeed
170,dev-partner GmbH,65000.0,50000.0,yearly,72074 Tübingen,Business Analyst - Performance Marketing(w/m/d),Vollzeit,Indeed
204,Safran,155.0,155.0,daily,22453 Hamburg,Business Analyst – V.I.E contract (F/H),,Indeed
241,All for One Group SE,-1.0,245000.0,yearly,Berlin,Chief Technology Officer/CTO (m/w/d),,Indeed


#### Duplicates

In [4566]:
# Check for duplicates
duplicates = indeed.duplicated()
duplicates.value_counts()

False    946
True      32
Name: count, dtype: int64

In [4567]:
indeed = indeed.drop_duplicates()
duplicates = indeed.duplicated()
duplicates.value_counts()

False    946
Name: count, dtype: int64

In [4568]:
indeed['salary_type'].value_counts()

salary_type
yearly     26
monthly    11
daily       1
weekly      1
Name: count, dtype: int64

#### Experience

In [4569]:
def analyze_experience(title):
    # translate the job title into lower case to simplify the search.

    title = title.lower()
    
    # Dictionary with the work experience
    if 'junior' in title or 'entry-level' in title:
        return 'Junior'
    elif 'senior' in title or 'team lead' in title or 'teamlead' in title or 'geschäftsführer' in title:
        return 'Senior'
    elif 'werkstudierende' in title or 'werkstudent' in title or 'working student' in title or 'student' in title:
        return 'No experience'
    elif  'internship' in title or 'young' in title:
        return 'No experience'
    elif 'praktikum' in title or 'studium' in title:
        return 'No experience'
    elif 'trainee' in title:
        return 'No experience'
    elif 'advanced' in title:
        return 'Middle'
    else:
        return 'Middle'

indeed['experience'] = indeed['title'].apply(analyze_experience)
indeed['experience'].value_counts()
# indeed.head()

experience
Middle           667
Senior           169
No experience     65
Junior            45
Name: count, dtype: int64

#### Job_type

In [4570]:
indeed['job_type'] = indeed['job_type'].astype(str)

def analyze_job_type(type):
   
    type = type.lower()
    
    if 'vollzeit' in type or '4-tage-woche' in type or 'full-time' in type:
        return 'Full-time'
    elif 'gleitzeit' in type or 'flexi-time'in type:
        return 'Flexi-time'
    elif 'montag bis freitag' in type:
        return 'Full-time'
    elif  'werkstudent' in type or 'part-time' in type:
        return 'Part-time'
    elif 'schicht' in type:
        return 'Part-time'
    else:
        return None

indeed['job_type'] = indeed['job_type'].apply(analyze_job_type)
if indeed['job_type'].isnull().any():  # Проверяем на NaN, а не на None
    indeed['job_type'] = indeed['job_type'].fillna(indeed['title'].apply(analyze_job_type))

indeed['job_type'].value_counts()

job_type
Full-time     33
Part-time     21
Flexi-time     8
Name: count, dtype: int64

#### Location

In [4571]:
indeed["location"] = indeed["location"].str.extract(r'(?:\d+\s)?([\w\-]+\s?[\w\-]+)$')
indeed['location'].value_counts()

location
Berlin              126
München             101
am Main              79
Hamburg              71
Düsseldorf           42
                   ... 
Kiel                  1
Schkeuditz            1
Ravensburg            1
Holzwickede           1
Schwäbisch Gmünd      1
Name: count, Length: 197, dtype: int64

#### Bundesland

In [None]:

with open('data/german_cities.json', 'r', encoding='utf-8') as f:
    german_cities = json.load(f)


city_to_bundesland = {entry['name']: entry['state'] for entry in german_cities['data']}

def find_bundesland(city):
    return city_to_bundesland.get(city,city)

indeed['bundesland'] = indeed['location'].apply(find_bundesland)


#### Industry

In [None]:

industry_keywords = {
    'Internet and IT': ['Information Technology', 'IT', 'IT service', 'Internet','Automation & Technology','Kommunikationssysteme','Technologies','Information Services','Information','Database provider','Innovative companies','CHECK24'],
    'Management Consulting': ['Business consultant', 'Consulting', 'Management','Management companies','McKinsey','Deloitte','Business consultant','consultant'],
    'E-commerce,trade and Retail': ['Sales company', 'Sales', 'Food', 'retail','Commerce','E-Commerce','Trade','Rewe','Amazon','Hypermarkets','supermarkets','shop','shops','Douglas','Mail order','Branded goods','OBI'],
    'Banking, and financial services': ['Savings banks', 'banking', 'Bank', 'financial services','Financial','PayTech','finance','Finanz','Sparkassen','Investment','credit institutions','Financing companies','Financing','banks','Leasing','Volksbanken'],
    'Insurance': ['Insurance companies', 'Insurance', 'insurance agencies', 'health insurance','health insurance offices','Reinsurance','AOK'],
    'Pharmaceutical and medical products and technology': ['Pharmacies', 'Pharmacies, public', 'medical products', 'medical technology','pharmaceutical','chemical'],
    'Computer Software': ['computer retail', 'computer', 'software', 'software service','SAP','Hardware','Hardware stores'],
    'Media and publishing': ['media', 'publishing', 'press', 'multimedia','Film', 'radio','television','publishers'],
    'Industry and mechanical engineering': ['engineering offices', 'engineering', 'machinery', 'systems and apparatus','Technik','Metalworking', 'machines', 'manufacturer'],
    'HR services and consulting': ['Staffing', 'HR consultant', 'Recruiting', 'HR','Employment agency','Employment'],
    'Automotive': ['Auto repair', 'Auto', 'Auto repair shops', 'Automotive','Mercedes-Benz AG','BMW','Ford','Volkswagen','Car','Motor vehicle','Vehicle','Honda','Yacht','Yachts'],
    'Transport and logistics': ['Freight forwarding', 'logistic', 'warehouse', 'railway','Logistics','Logistics company','Transport','traffic','Airports'],
    'Wholesale': ['Wholesale'],
    'Telecommunications': ['telecommunications', 'Telecommunications company','Telecom','Telephone network'],
    'Energy, water and environment': ['Energy', 'water', 'environment','energy supply'],
    'Architecture and planning': ['Architects', 'architectural offices', 'Architecture'],
    'Real estate': ['Real estate agents', 'Real estate', 'real estate agencies','immowelt','Rental', 'brokerage','rental service','Property'],
    'Marketing, advertising, PR and design': ['advertising agencies', 'advertising', 'marketing', 'PR','design','Promotional','Public relations','Communication agencies'],
    'Sport, health and social': ['Clubs', 'Games and sports', 'sport', 'health','dental','welfare','Doctors','healthcare','clinics','Healthcare','clinic','diabetologists','Sports facilities','Psychotherapists','Fitness center','Medical organizations','Institutes of Medicine','Hospitals'],
    'Education and science': ['education','university','schule', 'school','wissenschaftliche','wissenschaft','Universität','Teaching','research','Think Tanks','Language', 'schools','Universities','Institutes of Economics','Educational institutions','educational centers'],
    'Auditing, tax and law': ['Tax advisory professions', 'Tax advisory', 'tax', 'law','Steuern','Steuerberator'],
    'Hotels, tourism, travel agencies': ['Hotels', 'Restaurants and pubs', 'Restaurants', 'Travel agencies', 'travel','Tour operator','Guest houses','Guest house','Hotel companies'], 
    'Building': ['Building','construction'],   
}

# Function for defining industry
def classify_industry(activity):
    if isinstance(activity, str):  
       
        cleaned_activity = re.sub(r'\W+', ' ', activity).lower()

        for industry, keywords in industry_keywords.items():
            for keyword in keywords:
               
                cleaned_keyword = re.sub(r'\W+', ' ', keyword).lower()
                
                if re.search(r'\b' + re.escape(cleaned_keyword) + r'\b', cleaned_activity):
                    return industry
    return None

# apply function
indeed['industry'] = indeed['title'].apply(classify_industry)

if indeed['industry'].isnull().any():  # Проверяем на NaN, а не на None
    indeed['industry'] = indeed['industry'].fillna(indeed['company'].apply(classify_industry))


indeed['industry'].value_counts()

industry
Management Consulting                    75
Banking, and financial services          73
Internet and IT                          59
E-commerce,trade and Retail              56
Marketing, advertising, PR and design    32
Computer Software                        25
Media and publishing                     13
Transport and logistics                  12
Sport, health and social                  6
Education and science                     6
Industry and mechanical engineering       4
Automotive                                4
Energy, water and environment             4
Insurance                                 3
Real estate                               3
Auditing, tax and law                     3
HR services and consulting                1
Architecture and planning                 1
Name: count, dtype: int64

#### Job_place and description

In [4574]:
indeed['job_place']=np.nan
indeed['description']=np.nan
indeed['skills']=np.nan
indeed['language']=np.nan

In [4575]:
indeed = indeed.drop_duplicates()

In [4576]:
indeed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 918 entries, 0 to 977
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   company      918 non-null    object 
 1   salary_max   38 non-null     float64
 2   salary_min   40 non-null     float64
 3   salary_type  38 non-null     object 
 4   location     914 non-null    object 
 5   title        918 non-null    object 
 6   job_type     59 non-null     object 
 7   source       918 non-null    object 
 8   experience   918 non-null    object 
 9   bundesland   914 non-null    object 
 10  industry     369 non-null    object 
 11  job_place    0 non-null      float64
 12  description  0 non-null      float64
 13  skills       0 non-null      float64
 14  language     0 non-null      float64
dtypes: float64(6), object(9)
memory usage: 114.8+ KB


#### New columns order

In [4577]:
# change the columns order

new_order = ["title", "company", "location",
             "bundesland","industry", "job_type",
             "job_place", "experience","skills",
             "language","salary_min","salary_max", "salary_type","description","source"]
indeed = indeed[new_order]
indeed.tail(10)

Unnamed: 0,title,company,location,bundesland,industry,job_type,job_place,experience,skills,language,salary_min,salary_max,salary_type,description,source
966,Werkstudium Data Analyst (m/w/d),Mawave Marketing GmbH,München,Bayern,"Marketing, advertising, PR and design",,,No experience,,,,,,,Indeed
967,Working Student (f/m/d) - Global Enterprise Architecture Practice,SAP,Walldorf,Walldorf,Architecture and planning,,,No experience,,,,,,,Indeed
968,Working Student - Data Analyst,shipzero GmbH,Hamburg,Hamburg,,,,No experience,,,,,,,Indeed
969,Working Student - Data Science,Hasso Plattner Foundation,Potsdam,Brandenburg,,,,No experience,,,,,,,Indeed
971,Working Student - Data and Business Analyst (m/f/d) in Berlin or Konstanz,KNIME AG,Berlin-Kreuzberg,Berlin-Kreuzberg,,,,No experience,,,,,,,Indeed
972,Working Student Data Analyst (x/f/m),Puls Technologies GmbH,Berlin,Berlin,Internet and IT,,,No experience,,,,,,,Indeed
974,Working Student in Market Intelligence for PayTech (m/f/d),Giesecke+Devrient,München,Bayern,"Banking, and financial services",,,No experience,,,,,,,Indeed
975,Working Student: Test Automation for IT project (f/m/div),Infineon Technologies,München,Bayern,Internet and IT,,,No experience,,,,,,,Indeed
976,Young Professional Financial Data Analyst (d/m/w),VR Smart Finanz AG,Eschborn,Eschborn,"Banking, and financial services",,,No experience,,,,,,,Indeed
977,[BS EU Division] ID EU Pipeline and Business Analyst (m/f/d),LG Electronics Deutschland GmbH,Eschborn,Eschborn,,,,Middle,,,,,,,Indeed


#### Save to .csv

In [4578]:
# Save cleaned data
# indeed.to_csv("indeed_cl.csv", index=False, encoding="utf-8")
# indeed.to_excel('indeed.xlsx', index=False)

### 2. Stepstone

In [None]:

stepstone_1 = pd.read_csv("data/stepstone.csv")
stepstone_my = pd.read_csv("data/stepstone_my.csv")

# Combine 2 datasets
stepstone_combined = pd.concat([stepstone_1, stepstone_my], ignore_index=True)
stepstone_combined = stepstone_combined.drop_duplicates()

# Rename the columns
new_names = ['title', 'company', 'location', 'job_place', 'salary']
stepstone_combined.columns = new_names

# stepstone_5000
stepstone_5000 = pd.read_csv("data/stepstone_5000_en.csv")

stepstone_5000 = stepstone_5000[['companyName', 'location',
                    'textSnippet','title']]
# Rename the columns
new_names = ['company', 'location', 'description', 'title']
stepstone_5000.columns = new_names

# Join 2 databases
stepstone = pd.merge(stepstone_combined, stepstone_5000, on=['title','company', 'location'], how='left')
stepstone['source'] = 'Stepstone'

stepstone = stepstone.drop_duplicates()
stepstone.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1610 entries, 0 to 1633
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1610 non-null   object
 1   company      1610 non-null   object
 2   location     1610 non-null   object
 3   job_place    929 non-null    object
 4   salary       1205 non-null   object
 5   description  1292 non-null   object
 6   source       1610 non-null   object
dtypes: object(7)
memory usage: 100.6+ KB


In [4580]:
stepstone.head()

Unnamed: 0,title,company,location,job_place,salary,description,source
0,Mathematiker/Aktuar (d/m/w) Data Science/Pricing/Risikomanagement/Solvency II,Meyerthole Siems Kohlruss Ges. für aktuarielle Beratung mbH,Köln,,44.000 - 67.000 €/Jahr (geschätzt für Vollzeit),"Our focus is on varied tasks in the areas of data science, telematics, pricing, natural hazards, balance sheet valua...",Stepstone
1,Referent (m/w/d) Assetmanagement - Schwerpunkt Data Science,Avacon Netz GmbH,Salzgitter,Teilweise Home-Office,60.000 - 81.000 €/Jahr (geschätzt für Vollzeit),,Stepstone
2,"Duales Studium Wirtschaftsinformatik/Data Science, Bachelor of Science, Mercedes-Benz AG, GLC Germersheim, Studienbe...",Mercedes-Benz AG,Germersheim,Teilweise Home-Office,,"In the <strong>Data</strong> Science course of study, you will learn how information is generated from large amounts...",Stepstone
3,Duales Studium - Wirtschaftsinformatik Data Science (B. Sc. m/w/d) 2025,TTS Tooltechnic Systems AG & CO. KG,Wendlingen (bei Stuttgart),Teilweise Home-Office,,Dual study program - Business Informatics <strong>Data</strong> Science (B. Sc. During your three-year course with a...,Stepstone
4,Trainee (w/m/d) Kreditrisikomanagement – Data Science,KfW Bankengruppe,Frankfurt am Main,Teilweise Home-Office,,,Stepstone


#### Job_place 

In [4581]:
stepstone['job_place'] = stepstone['job_place'].astype(str)
stepstone['job_place'] = stepstone['job_place'].replace(['Teilweise Home-Office'], 'hybrid work')
stepstone['job_place'] = stepstone['job_place'].replace(['nan'], '')
stepstone['job_place'] = stepstone['job_place'].replace(['job_type'], '')


#### Expirience

In [4582]:
stepstone['experience'] = stepstone['title'].apply(analyze_experience)
stepstone['experience'].value_counts()

experience
Middle           1209
Senior            178
No experience     141
Junior             82
Name: count, dtype: int64

#### Job_type 

In [4583]:
stepstone['salary'] = stepstone['salary'].astype(str)   
stepstone['job_type'] = stepstone['salary'].apply(analyze_job_type)

#### Salary column

In [4584]:
# Work with Salary column

# # Check, where 'salary' not empty and  'salary_min' and 'salary_max' empty
# mask = stepstone_full['salary'].notnull() & stepstone_full['salary_min'].isnull() & stepstone_full['salary_max'].isnull()

# Extract the min, max salary
stepstone['min'] = stepstone['salary'].str.extract(r'(\d+\.\d+) €')
stepstone['max'] = stepstone['salary'].str.extract(r'[–\-]\s*(\d+\.\d+) €')
stepstone['type'] = stepstone['salary'].str.extract(r'€/(.*?)\s')

# replace the dot
stepstone['max'] = stepstone['max'].str.replace('.', '', regex=False).astype(float)
stepstone['min'] = stepstone['min'].str.replace('.', '', regex=False).astype(float)
stepstone['type'] = stepstone['type'].str.replace('Monat', 'monthly', regex=False)
stepstone['type'] = stepstone['type'].str.replace('Jahr', 'yearly', regex=False)

stepstone[['salary_min', 'salary_max','salary_type']] = indeed[['salary_min', 'salary_max','salary_type']].fillna(0)

stepstone['salary_min'] = stepstone.apply(lambda row: row['min'] if row['salary_min'] == 0 else row['salary_min'], axis=1)
stepstone['salary_max'] = stepstone.apply(lambda row: row['max'] if row['salary_max'] == 0 else row['salary_max'], axis=1)
stepstone['salary_type'] = stepstone.apply(lambda row: row['type'] if row['salary_type'] == 0 else row['salary_type'], axis=1)

# Delete extra columns
stepstone = stepstone.drop(['salary', 'min', 'max', 'type'], axis=1)


#### Bundesland

In [4585]:
stepstone['bundesland'] = stepstone['location'].apply(find_bundesland)

#### Industry

In [4586]:
 
stepstone['industry'] = stepstone['title'].apply(classify_industry)

if stepstone['industry'].isnull().any():  # Проверяем на NaN, а не на None
    stepstone['industry'] = stepstone['industry'].fillna(stepstone['company'].apply(classify_industry))

stepstone['industry'].value_counts()

industry
Management Consulting                    174
Internet and IT                          156
Banking, and financial services           91
Computer Software                         69
E-commerce,trade and Retail               44
Education and science                     27
Marketing, advertising, PR and design     21
HR services and consulting                18
Transport and logistics                   18
Automotive                                15
Industry and mechanical engineering       14
Media and publishing                      12
Energy, water and environment              5
Hotels, tourism, travel agencies           5
Sport, health and social                   3
Auditing, tax and law                      3
Real estate                                3
Insurance                                  2
Name: count, dtype: int64

#### Skills and Language

In [4587]:
stepstone['skills']=np.nan
stepstone['language']=np.nan

#### New columns order

In [4588]:
stepstone = stepstone.drop_duplicates()
stepstone.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1610 entries, 0 to 1633
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        1610 non-null   object 
 1   company      1610 non-null   object 
 2   location     1610 non-null   object 
 3   job_place    1610 non-null   object 
 4   description  1292 non-null   object 
 5   source       1610 non-null   object 
 6   experience   1610 non-null   object 
 7   job_type     1204 non-null   object 
 8   salary_min   662 non-null    float64
 9   salary_max   661 non-null    float64
 10  salary_type  662 non-null    object 
 11  bundesland   1610 non-null   object 
 12  industry     680 non-null    object 
 13  skills       0 non-null      float64
 14  language     0 non-null      float64
dtypes: float64(4), object(11)
memory usage: 201.2+ KB


In [4589]:
stepstone = stepstone[new_order]
stepstone.head()

Unnamed: 0,title,company,location,bundesland,industry,job_type,job_place,experience,skills,language,salary_min,salary_max,salary_type,description,source
0,Mathematiker/Aktuar (d/m/w) Data Science/Pricing/Risikomanagement/Solvency II,Meyerthole Siems Kohlruss Ges. für aktuarielle Beratung mbH,Köln,Nordrhein-Westfalen,,Full-time,,Middle,,,67000.0,67000.0,yearly,"Our focus is on varied tasks in the areas of data science, telematics, pricing, natural hazards, balance sheet valua...",Stepstone
1,Referent (m/w/d) Assetmanagement - Schwerpunkt Data Science,Avacon Netz GmbH,Salzgitter,Niedersachsen,,Full-time,hybrid work,Middle,,,81000.0,81000.0,yearly,,Stepstone
2,"Duales Studium Wirtschaftsinformatik/Data Science, Bachelor of Science, Mercedes-Benz AG, GLC Germersheim, Studienbe...",Mercedes-Benz AG,Germersheim,Germersheim,Automotive,,hybrid work,No experience,,,,,,"In the <strong>Data</strong> Science course of study, you will learn how information is generated from large amounts...",Stepstone
3,Duales Studium - Wirtschaftsinformatik Data Science (B. Sc. m/w/d) 2025,TTS Tooltechnic Systems AG & CO. KG,Wendlingen (bei Stuttgart),Wendlingen (bei Stuttgart),,,hybrid work,No experience,,,,,,Dual study program - Business Informatics <strong>Data</strong> Science (B. Sc. During your three-year course with a...,Stepstone
4,Trainee (w/m/d) Kreditrisikomanagement – Data Science,KfW Bankengruppe,Frankfurt am Main,Hessen,,,hybrid work,No experience,,,,,,,Stepstone


#### Save to .csv

In [4590]:
# Save the cleaned data
# stepstone.to_csv("stepstone_cl.csv", index=False, encoding="utf-8")

### 3. Xing

In [4591]:
# import files
file_path = 'data/xing_en.csv'
column_names = ['title', 'location','salary_min','salary_max','job_type','company','description']
xing = pd.read_csv(file_path, header=None, names=column_names)

xing.head()

Unnamed: 0,title,location,salary_min,salary_max,job_type,company,description
0,Financial Planning & Financial Data Analyst (m/w/d),Münster,37500.0,57500.0,Full-time,flaschenpost SE,"In order to fulfill our mission, we are looking for a Financial Planning & Financial Data Analyst (m/f/d) to join ou..."
1,Data Analyst Pricing (m/w/d),Menden,47000.0,65500.0,Full-time,OBO Bettermann GmbH,Data Analyst Pricing (m/f/d) About OBO Bettermann ...
2,Business & Data Analyst (w/m/d),Hamburg,43000.0,72000.0,Full-time,plusYOU GmbH,"Business & Data Analyst (f/m/d) in Hamburg ""Digital + Modern + Independent - True to these mottos, we continue to le..."
3,Data Analyst / Data Engineer,München,54000.0,73500.0,Full-time,Rahantech GmbH,Intro Rahantech is hiring a skilled Data Analyst / Data Engineer on behalf of a fast-growing company in Germany.
4,BI Consultant/ Data Analyst (w/m/d) mit Berufserfahrung,Neu-Isenburg,43000.0,61500.0,Full-time,RSM CONSULT GMBH,Our team in Neu-Isenburg/Frankfurt Main is looking for new colleagues as: BI Consultant/Data Analyst (f/m/d) with pr...


In [4592]:
xing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        900 non-null    object 
 1   location     900 non-null    object 
 2   salary_min   769 non-null    float64
 3   salary_max   769 non-null    float64
 4   job_type     900 non-null    object 
 5   company      900 non-null    object 
 6   description  900 non-null    object 
dtypes: float64(2), object(5)
memory usage: 49.3+ KB


#### Salary_type

In [4593]:
def classify_salary(amount):
    if pd.isna(amount): 
        return None
    
    amount = float(str(amount).replace(",", "")) 
    
    if amount <= 100: 
        return "hourly"
    elif 1_000 <= amount <= 15_000: 
        return "monthly"
    elif amount >= 20_000:
        return "yearly"
    else:
        return None


xing['salary_type'] = xing['salary_min'].apply(classify_salary)

#### Bundesland

In [4594]:
xing['bundesland'] = xing['location'].apply(find_bundesland)


#### Expirience

In [4595]:
xing['experience'] = xing['title'].apply(analyze_experience)
xing['experience'].value_counts()

experience
Middle           631
Senior           200
No experience     41
Junior            28
Name: count, dtype: int64

#### Source

In [4596]:
xing['source'] = 'Xing'

#### Industry

In [4597]:
xing['industry'] = xing['title'].apply(classify_industry)

if xing['industry'].isnull().any():  # Проверяем на NaN, а не на None
    xing['industry'] = xing['industry'].fillna(stepstone['company'].apply(classify_industry))

xing['industry'].value_counts()

industry
Management Consulting                    103
Internet and IT                           89
Banking, and financial services           50
E-commerce,trade and Retail               33
Marketing, advertising, PR and design     26
Computer Software                         22
Insurance                                 20
Media and publishing                      14
Industry and mechanical engineering       13
Transport and logistics                   12
Education and science                     12
Sport, health and social                  10
Automotive                                 9
HR services and consulting                 6
Auditing, tax and law                      3
Energy, water and environment              3
Hotels, tourism, travel agencies           2
Architecture and planning                  2
Real estate                                1
Name: count, dtype: int64

#### Skills,Language,Job_place

In [4598]:
xing['job_place']=np.nan
xing['skills']=np.nan
xing['language']=np.nan

#### New columns order

In [4599]:
xing = xing.drop_duplicates()
xing.info()

<class 'pandas.core.frame.DataFrame'>
Index: 860 entries, 0 to 899
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        860 non-null    object 
 1   location     860 non-null    object 
 2   salary_min   729 non-null    float64
 3   salary_max   729 non-null    float64
 4   job_type     860 non-null    object 
 5   company      860 non-null    object 
 6   description  860 non-null    object 
 7   salary_type  729 non-null    object 
 8   bundesland   860 non-null    object 
 9   experience   860 non-null    object 
 10  source       860 non-null    object 
 11  industry     413 non-null    object 
 12  job_place    0 non-null      float64
 13  skills       0 non-null      float64
 14  language     0 non-null      float64
dtypes: float64(5), object(10)
memory usage: 107.5+ KB


In [4600]:
new_order = ["title", "company", "location",
             "bundesland","industry", "job_type",
             "job_place", "experience","skills",
             "language","salary_min","salary_max", "salary_type","description","source"]
xing = xing[new_order]
xing.head()

Unnamed: 0,title,company,location,bundesland,industry,job_type,job_place,experience,skills,language,salary_min,salary_max,salary_type,description,source
0,Financial Planning & Financial Data Analyst (m/w/d),flaschenpost SE,Münster,Nordrhein-Westfalen,"Banking, and financial services",Full-time,,Middle,,,37500.0,57500.0,yearly,"In order to fulfill our mission, we are looking for a Financial Planning & Financial Data Analyst (m/f/d) to join ou...",Xing
1,Data Analyst Pricing (m/w/d),OBO Bettermann GmbH,Menden,Menden,,Full-time,,Middle,,,47000.0,65500.0,yearly,Data Analyst Pricing (m/f/d) About OBO Bettermann ...,Xing
2,Business & Data Analyst (w/m/d),plusYOU GmbH,Hamburg,Hamburg,Automotive,Full-time,,Middle,,,43000.0,72000.0,yearly,"Business & Data Analyst (f/m/d) in Hamburg ""Digital + Modern + Independent - True to these mottos, we continue to le...",Xing
3,Data Analyst / Data Engineer,Rahantech GmbH,München,Bayern,,Full-time,,Middle,,,54000.0,73500.0,yearly,Intro Rahantech is hiring a skilled Data Analyst / Data Engineer on behalf of a fast-growing company in Germany.,Xing
4,BI Consultant/ Data Analyst (w/m/d) mit Berufserfahrung,RSM CONSULT GMBH,Neu-Isenburg,Neu-Isenburg,Management Consulting,Full-time,,Middle,,,43000.0,61500.0,yearly,Our team in Neu-Isenburg/Frankfurt Main is looking for new colleagues as: BI Consultant/Data Analyst (f/m/d) with pr...,Xing


#### Save to.csv

In [4601]:
# Save the cleaned data
# xing.to_csv("xing_cl.csv", index=False, encoding="utf-8")

### 4. LinkedIn

In [4602]:
# import files
linkedin = pd.read_csv("data/linkedin_en.csv")

# Leave only the useful columns
linkedin = linkedin[['Title', 'Description',
                    'Location','Skills',
                     'Employment Status','Experience Level',
                     'Workplace','Company Name',
                     'Industry']]

# rename columns
new_names = ['title','description', 'location', 'skills', 'job_type', 'experience','job_place','company','industry']
linkedin.columns = new_names
linkedin


Unnamed: 0,title,description,location,skills,job_type,experience,job_place,company,industry
0,Data Scientist (m/f/d),"Location: Germany (remote) or Munich (office) Salary: Up to €95,000 annually (depending on qualifications) Entry lev...",Germany,"Data Science,Machine Learning,Python (Programming Language),SQL,Artificial Intelligence (AI),Deep Learning,English,G...",Full-time,Associate,Remote,Data-Talent GmbH,Staffing &amp; Recruiting
1,Praktikant Qualitätsmanagement - Data Analytics (w/m/x),A GOOD INTERNSHIP IS PRACTICALLY NEVER THEORETICAL.\n\nSHARE YOUR PASSION.\n\nSuccess is teamwork. Only when experts...,"Munich, Bavaria, Germany","Data Science,Analytics,Continuous Improvement,German,Information Technology,New Product Rollout,Optimization,Quality...",Full-time,Entry level,,BMW Group,Automotive
2,Senior BI-Consultant mit Schwerpunkt SAC - SAP Analytics Cloud (m/w/d),"SAC, Datasphere and/or BW/4HANA are your daily bread for you? Do you enjoy providing customers with holistic, hands-...","Munich, Bavaria, Germany","Extract, Transform, Load (ETL),Analytical Skills,Analytics,Business Intelligence (BI),Computer Science,Data Modeling...",Full-time,,Hybrid,H&Z.digital,Information Technology &amp; Services
3,Werkstudent:in Stakeholdermanagement Data Quality,"Your tasks\n\nFocus on identification, analysis and communication with all involved stakeholders of the data quality...","Hannover, Lower Saxony, Germany","Analytical Skills,Collaborative Work,Computer Science,Data Management,Data Quality,Endorsements,English,German,Multi...",Part-time,Internship,Hybrid,HDI Group,Insurance
4,Data-Analyst (m/w/d),"Software is our passion. The success of our customers is what drives us. For this reason, we go full throttle every ...","Detmold, North Rhine-Westphalia, Germany","Business Analytics,Data Analytics,SQL,Analytics,Business Administration",Full-time,Associate,Hybrid,deltra Business Software GmbH & Co. KG,Computer Software
...,...,...,...,...,...,...,...,...,...
995,Intern Consultant Data Science and AI (m/f/d)*,"company \n\nMunich Re\n\n Location \n\nMunich, Germany\n\nAre you passionate about Machine Learning and eager to dev...","Munich, Bavaria, Germany","Data Science,Machine Learning,Statistics,Artificial Intelligence (AI),Communication,Computer Science,Data Visualizat...",Internship,Internship,On-site,Munich Re,Insurance
996,Data Scientist* in der Produktion mit Promotionsabsicht,"In the “Production Quality” department of the Fraunhofer Institute for Production Technology IPT, we support compani...","Aachen, North Rhine-Westphalia, Germany","Data Analytics,Data Science,Machine Learning,Python (Programming Language),Statistics,Algorithms,Deep Learning,Natur...",Full-time,Entry level,Hybrid,Fraunhofer Institute for Production Technology IPT,Research
997,Research Associate/Post-Doctoral Research Fellow – Urban Analytics City Futures Research Centre,"This job is based in Australia\n\nEmployment Type: 12-month fixed term full timeRemuneration: Level A: $88,290 - $11...","Berlin, Berlin, Germany","Data Science,Machine Learning,Statistics,Analytical Skills,Artificial Intelligence (AI),Computer Science,Physics,Pre...",Full-time,Mid-Senior level,Hybrid,UNSW,Higher Education
998,Research Fellow (National Perinatal Epidemiology and Statistics Unit),This job is based in Australia\n\nResearch Fellow (National Perinatal Epidemiology and Statistics Unit)\n\nEmploymen...,"Berlin, Berlin, Germany","Data Analytics,Data Science,Statistics,Analytical Skills,Biostatistics,Epidemiology,Linked Data,Research Skills,SAS ...",Full-time,Mid-Senior level,On-site,UNSW,Higher Education


In [4603]:
linkedin['industry'].value_counts()

industry
Information Technology &amp; Services        218
Management Consulting                        203
Computer Software                            173
Staffing &amp; Recruiting                     40
Financial Services                            40
Automotive                                    36
Retail                                        28
Insurance                                     20
Utilities                                     16
Pharmaceuticals                               15
Marketing &amp; Advertising                   13
Research                                      13
Consumer Goods                                12
Wholesale                                     11
Transportation/Trucking/Railroad              11
Higher Education                              10
Sports                                         9
Renewables &amp; Environment                   9
Hospital &amp; Health Care                     9
Chemicals                                      8
Semiconduct

#### Salary

In [4604]:

def extract_salary(text):
    # Регулярное выражение для диапазона зарплаты (например: €50,000 - €60,000)
    pattern_range = r'€\s?(\d{1,3}(?:[.,]\d{3})*)\s?-\s?€\s?(\d{1,3}(?:[.,]\d{3})*)'
    
    # Регулярное выражение для зарплаты с "Bis zu" (например: Bis zu 95.000 €)
    pattern_bis_zu = r'Bis zu\s?€?\s?(\d{1,3}(?:[.,]\d{3})*)'
    
    # Поиск диапазона зарплаты
    match_range = re.search(pattern_range, text)
    if match_range:
        min_salary = match_range.group(1).replace(".", "").replace(",", "")
        max_salary = match_range.group(2).replace(".", "").replace(",", "")
        return int(min_salary), int(max_salary)
    
    # Поиск зарплаты с "Bis zu"
    match_bis_zu = re.search(pattern_bis_zu, text)
    if match_bis_zu:
        min_salary = max_salary = match_bis_zu.group(1).replace(".", "").replace(",", "")
        return int(min_salary), int(max_salary)
    
    return None, None



linkedin[['salary_min', 'salary_max']] = linkedin['description'].apply(lambda x: pd.Series(extract_salary(x)))



In [4605]:
linkedin['salary_max'].value_counts()

salary_max
60,000.00     2
93,400.00     2
90,000.00     1
113,200.00    1
Name: count, dtype: int64

In [4606]:

def classify_salary(amount):
    if pd.isna(amount): 
        return None
    
    amount = float(str(amount).replace(",", "")) 
    
    if amount <= 100: 
        return "hourly"
    elif 1_000 <= amount <= 15_000: 
        return "monthly"
    elif amount >= 20_000:
        return "yearly"
    else:
        return None


linkedin['salary_type'] = linkedin['salary_min'].apply(classify_salary)

#### Bundesland

In [4607]:
def split_location(location):
    parts = location.split(", ")
    
    if len(parts) == 3: 
        city, bundesland = parts[0], parts[1]
    elif len(parts) == 2:
        city, bundesland = parts[0], parts[1]
    elif len(parts) == 1: 
        city, bundesland = parts[0], parts[0]
    else:
        city, bundesland = None, None  
    
    return pd.Series([city, bundesland])


linkedin[['location', 'bundesland']] = linkedin['location'].apply(split_location)

#### Language

In [4608]:
programming_languages = {"Python", "SQL", "Java", "C++", "R", "JavaScript", "Go", "Ruby", "Swift"}
natural_languages = {"English", "German", "French", "Spanish", "Japanese", "Mandarin", "Italian"}

def extract_languages(text):
    words = [word.strip() for word in text.split(",")] 
    found_languages = [word for word in words if word in natural_languages]
    return ", ".join(found_languages) if found_languages else None 


linkedin['language'] = linkedin['skills'].apply(extract_languages)

#### Source

In [4609]:
linkedin['source'] = 'Linkedin'

#### Expirience

In [4610]:
def classify_experience(level, title):
    mapping = {
        "Mid-Senior level": "Senior",
        "Associate": "Middle",
        "Entry level": "Junior",
        "Internship": "No experience",
        "Director": "Senior"
    }
    

    if level in mapping:
        return mapping[level]

    title = title.lower()
    
    if 'junior' in title or 'entry-level' in title:
        return 'Junior'
    elif 'senior' in title or 'team lead' in title or 'teamlead' in title or 'geschäftsführer' in title:
        return 'Senior'
    elif 'werkstudierende' in title or 'werkstudent' in title or 'working student' in title or 'student' in title:
        return 'No experience'
    elif 'internship' in title or 'young' in title:
        return 'No experience'
    elif 'praktikum' in title or 'studium' in title:
        return 'No experience'
    elif 'trainee' in title:
        return 'No experience'
    elif 'advanced' in title:
        return 'Middle'
    else:
        return 'Middle'

linkedin["experience"] = linkedin.apply(lambda x: classify_experience(x["experience"], x["title"]), axis=1)

In [4611]:
linkedin['experience'].value_counts()

experience
Senior           426
Middle           290
Junior           169
No experience    115
Name: count, dtype: int64

#### New columns order

In [4612]:
new_order = ["title", "company", "location",
             "bundesland","industry", "job_type",
             "job_place", "experience","skills",
             "language","salary_min","salary_max", "salary_type","description","source"]
linkedin = linkedin[new_order]
linkedin.head()

Unnamed: 0,title,company,location,bundesland,industry,job_type,job_place,experience,skills,language,salary_min,salary_max,salary_type,description,source
0,Data Scientist (m/f/d),Data-Talent GmbH,Germany,Germany,Staffing &amp; Recruiting,Full-time,Remote,Middle,"Data Science,Machine Learning,Python (Programming Language),SQL,Artificial Intelligence (AI),Deep Learning,English,G...",English,,,,"Location: Germany (remote) or Munich (office) Salary: Up to €95,000 annually (depending on qualifications) Entry lev...",Linkedin
1,Praktikant Qualitätsmanagement - Data Analytics (w/m/x),BMW Group,Munich,Bavaria,Automotive,Full-time,,Junior,"Data Science,Analytics,Continuous Improvement,German,Information Technology,New Product Rollout,Optimization,Quality...",German,,,,A GOOD INTERNSHIP IS PRACTICALLY NEVER THEORETICAL.\n\nSHARE YOUR PASSION.\n\nSuccess is teamwork. Only when experts...,Linkedin
2,Senior BI-Consultant mit Schwerpunkt SAC - SAP Analytics Cloud (m/w/d),H&Z.digital,Munich,Bavaria,Information Technology &amp; Services,Full-time,Hybrid,Senior,"Extract, Transform, Load (ETL),Analytical Skills,Analytics,Business Intelligence (BI),Computer Science,Data Modeling...",,,,,"SAC, Datasphere and/or BW/4HANA are your daily bread for you? Do you enjoy providing customers with holistic, hands-...",Linkedin
3,Werkstudent:in Stakeholdermanagement Data Quality,HDI Group,Hannover,Lower Saxony,Insurance,Part-time,Hybrid,No experience,"Analytical Skills,Collaborative Work,Computer Science,Data Management,Data Quality,Endorsements,English,German,Multi...","English, German, Spanish",,,,"Your tasks\n\nFocus on identification, analysis and communication with all involved stakeholders of the data quality...",Linkedin
4,Data-Analyst (m/w/d),deltra Business Software GmbH & Co. KG,Detmold,North Rhine-Westphalia,Computer Software,Full-time,Hybrid,Middle,"Business Analytics,Data Analytics,SQL,Analytics,Business Administration",,,,,"Software is our passion. The success of our customers is what drives us. For this reason, we go full throttle every ...",Linkedin


#### Save to .csv

In [4613]:
# Save the cleaned data
# linkedin.to_csv("linkedin_cl.csv", index=False, encoding="utf-8")

### Concatenation

In [4614]:
vacancies = pd.concat([indeed, stepstone, xing, linkedin], ignore_index=True)


#### Duplicates

In [4615]:
# Check for duplicates
duplicates = vacancies.duplicated()
duplicates.value_counts()

False    4054
True      334
Name: count, dtype: int64

In [4616]:
# Drop duplicates
vacancies = vacancies.drop_duplicates()
duplicates = vacancies.duplicated()
duplicates.value_counts()

False    4054
Name: count, dtype: int64

In [4617]:
vacancies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4054 entries, 0 to 4386
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        4054 non-null   object 
 1   company      4054 non-null   object 
 2   location     4050 non-null   object 
 3   bundesland   4050 non-null   object 
 4   industry     2125 non-null   object 
 5   job_type     2789 non-null   object 
 6   job_place    2105 non-null   object 
 7   experience   4054 non-null   object 
 8   skills       666 non-null    object 
 9   language     141 non-null    object 
 10  salary_min   1435 non-null   float64
 11  salary_max   1432 non-null   float64
 12  salary_type  1433 non-null   object 
 13  description  2818 non-null   object 
 14  source       4054 non-null   object 
dtypes: float64(2), object(13)
memory usage: 506.8+ KB


In [4618]:
vacancies.head()

Unnamed: 0,title,company,location,bundesland,industry,job_type,job_place,experience,skills,language,salary_min,salary_max,salary_type,description,source
0,(Advanced) Data Developer (m/w/d) OBI Smart Technologies GmbH,OBI,Deutschland,Deutschland,Internet and IT,,,Middle,,,,,,,Indeed
1,(JUNIOR) BUSINESS ANALYST (W/M/D) / DATA ANALYST (W/M/D) IM CONTROLLING,ZECH MANAGEMENT GMBH,Bremen,Bremen,Management Consulting,,,Junior,,,,,,,Indeed
2,(Junior) Account Manager / Customer Development Manager Stammkundenvertrieb (m/w/d),All for One Group SE,Oldenburg,Niedersachsen,,,,Junior,,,,,,,Indeed
3,(Junior) Analyst (m/w/d) Data Analytics,Syncwork,Berlin,Berlin,,,,Junior,,,,,,,Indeed
4,(Junior) Analyst Modelled data - Technology Topics (m/f/d),Statista GmbH,Hamburg,Hamburg,,,,Junior,,,,,,,Indeed


In [4619]:
vacancies['experience'].value_counts()

experience
Middle           2642
Senior            822
No experience     323
Junior            267
Name: count, dtype: int64

#### Location

In [4620]:
vacancies.loc[vacancies['location'] == 'am Mein', 'location'] = 'Frankfurt am Mein'
vacancies['location'] = vacancies['location'].str.split(',').str[0].str.strip()
vacancies['location'] = vacancies['location'].str.extract(r'([A-ZÄÖÜ][a-zäöüß]+)')

In [4621]:
vacancies['location'].value_counts()

location
Berlin          544
Hamburg         337
München         290
Frankfurt       231
Düsseldorf      185
               ... 
Spelle            1
Großenlüder       1
Hallbergmoos      1
Ostfildern        1
Wessling          1
Name: count, Length: 508, dtype: int64

#### Bundesland

In [4622]:
vacancies['bundesland'] = vacancies['location'].apply(find_bundesland)

In [4623]:
vacancies['bundesland'].value_counts()

bundesland
Nordrhein-Westfalen    560
Berlin                 544
Bayern                 383
Hamburg                337
Baden-Württemberg      250
                      ... 
Boppard                  1
Hauptverwaltung          1
Hattersheim              1
Wismar                   1
Wessling                 1
Name: count, Length: 451, dtype: int64

#### Job_type

In [4624]:
vacancies.loc[vacancies['job_type'] == 'Self-employed', 'job_type'] = 'Flexi-time'
vacancies.loc[vacancies['job_type'] == 'Temp', 'job_type'] = 'Temporary'
vacancies.loc[vacancies['job_type'] == 'Contract', 'job_type'] = 'Temporary'
vacancies.loc[vacancies['job_type'] == 'Other', 'job_type'] = 'Temporary'

In [4625]:
vacancies['job_type'].value_counts()

job_type
Full-time     2616
Part-time       76
Student         43
Internship      22
Flexi-time      16
Temporary       16
Name: count, dtype: int64

#### Job_place

In [4626]:
vacancies.loc[vacancies['job_place'] == 'Hybrid', 'job_place'] = 'Hybrid work'
vacancies.loc[vacancies['job_place'] == 'hybrid work','job_place'] = 'Hybrid work'
vacancies.loc[vacancies['job_place'] == 'Temporary','job_place'] = 'Remote'
vacancies.loc[vacancies['job_place'] == 'Flexi-time','job_place'] = 'Remote'

In [4627]:
vacancies['job_place'].value_counts()

job_place
Hybrid work    1192
                682
On-site         209
Remote           22
Name: count, dtype: int64

#### Experience

In [4628]:
vacancies['experience'].value_counts()

experience
Middle           2642
Senior            822
No experience     323
Junior            267
Name: count, dtype: int64

#### Language

In [4629]:
vacancies['language'].value_counts()

language
English                     43
German                      27
English, German             25
Spanish                     21
German, Spanish             16
English, German, Spanish     3
Italian                      2
English, French, German      2
English, French, Spanish     1
French, German, Spanish      1
Name: count, dtype: int64

In [4630]:
# separate the languages and create binary columns
languages_split = vacancies['language'].str.get_dummies(sep=', ')
languages_split.columns = languages_split.columns.str.lower()

# Adding these columns back to the dataframe
vacancies = pd.concat([vacancies, languages_split], axis=1)

# go through all the language columns
for lang in languages_split.columns:
    vacancies[lang] = vacancies.apply(
        lambda row: 1 if row[lang] == 1 or re.search(r'\b' + re.escape(lang) + r'\b', str(row['description']), re.IGNORECASE) else 0, 
        axis=1
    )
vacancies = vacancies.drop('language', axis=1)
vacancies[languages_split.columns].apply(pd.Series.value_counts)

Unnamed: 0,english,french,german,italian,spanish
0,3604,4041,3593,4049,4003
1,450,13,461,5,51


In [4631]:
vacancies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4054 entries, 0 to 4386
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        4054 non-null   object 
 1   company      4054 non-null   object 
 2   location     4011 non-null   object 
 3   bundesland   4011 non-null   object 
 4   industry     2125 non-null   object 
 5   job_type     2789 non-null   object 
 6   job_place    2105 non-null   object 
 7   experience   4054 non-null   object 
 8   skills       666 non-null    object 
 9   salary_min   1435 non-null   float64
 10  salary_max   1432 non-null   float64
 11  salary_type  1433 non-null   object 
 12  description  2818 non-null   object 
 13  source       4054 non-null   object 
 14  english      4054 non-null   int64  
 15  french       4054 non-null   int64  
 16  german       4054 non-null   int64  
 17  italian      4054 non-null   int64  
 18  spanish      4054 non-null   int64  
dtypes: float64(

#### Skills

In [4632]:

skills_map = {
    'ETL': ['Extract', 'ETL', 'Extract, Transform, Load'],
    'Data Science': ['Data Science', 'Machine Learning', 'Deep Learning','Data mining'],
    'Data engineering':['Data engineering'],
    'Analytical skills': ['Analytical skills', 'Analytical', 'Analytics','Business analytics','Data Analytics','Data Analysis','Data Analysis'],
    'Statistics': ['Statistics'],
    'Artificial intelligence': ['Artificial intelligence'],
    'Soft skills':['Communication', 'Clients','Conflict'],
    'Python': ['Python'],
    'SQL': ['SQL','Relational'],
    'ABAP': ['ABAP'],
    'Alteryx': ['Alteryx'],
    'Dashboards (Tableau, PowerBI)': ['Tableau','Dashboards','PowerBI','Power BI']
}

# Replace NaN with empty strings
vacancies['skills'] = vacancies['skills'].fillna('')
vacancies['description'] = vacancies['description'].fillna('')

# Converting all data to a string type
vacancies['skills'] = vacancies['skills'].astype(str)
vacancies['description'] = vacancies['description'].astype(str)

# A function for checking the presence of skill in the text
def skill_in_text(skill_list, text):
    return any(skill.lower() in text.lower() for skill in skill_list)

# Creating new columns for each skill
for skill, variants in skills_map.items():
   
    col_name = skill
   
    vacancies[col_name] = vacancies.apply(
        lambda row: 1 if skill_in_text(variants, str(row['skills']) + ' ' + str(row['description'])) else 0,
        axis=1
    )


# check that we have exactly the right speakers for skills.
skill_columns = [col for col in vacancies.columns if col in skills_map.keys()]

# apply updates only to these columns.
vacancies[skill_columns] = vacancies[skill_columns].apply(pd.to_numeric, errors='coerce')


# counts the number of 1's in each skill column
skill_counts = vacancies[skill_columns].sum()

vacancies = vacancies.drop(columns=['skills', 'description'])


skill_counts

ETL                              181
Data Science                     597
Data engineering                  75
Analytical skills                965
Statistics                       354
Artificial intelligence          221
Soft skills                      505
Python                           409
SQL                              403
ABAP                              21
Alteryx                           21
Dashboards (Tableau, PowerBI)    339
dtype: int64

#### Salary

In [4633]:
# Step 1: Replacing the value in salary_max if it is missing
vacancies['salary_max'] = vacancies.apply(lambda row: row['salary_min'] if pd.isna(row['salary_max']) else row['salary_max'], axis=1)

# Step 2: Calculating the average salary
vacancies['avg_salary'] = (vacancies['salary_min'] + vacancies['salary_max']) / 2

# Step 3: Bringing the salary to the annual amount
def convert_to_annual_salary(row):
    if row['salary_type'] == 'monthly':
        return row['avg_salary'] * 12  
    elif row['salary_type'] == 'hourly':
        return row['avg_salary'] * 2080  
    elif row['salary_type'] == 'daily':
        return row['avg_salary'] * 260  
    else:
        return row['avg_salary'] 

vacancies['avg_salary'] = vacancies.apply(convert_to_annual_salary, axis=1)

vacancies = vacancies.drop(columns=['salary_min', 'salary_max','salary_type'])

vacancies['avg_salary'].value_counts()

avg_salary
70,000.00     46
65,000.00     37
60,000.00     33
75,000.00     31
59,000.00     29
              ..
25,200.00      1
38,000.00      1
41,000.00      1
102,000.00     1
98,400.00      1
Name: count, Length: 250, dtype: int64

#### Industry

In [4634]:
vacancies['industry'].value_counts()

industry
Management Consulting                    480
Internet and IT                          298
Computer Software                        220
Banking, and financial services          212
Information Technology &amp; Services    141
                                        ... 
Think Tanks                                1
Information Services                       1
Computer Games                             1
Wine &amp; Spirits                         1
Cosmetics                                  1
Name: count, Length: 74, dtype: int64

In [None]:
from rapidfuzz import process, fuzz

companies = pd.read_excel("data/classified_companies.xlsx")

company_dict = companies.groupby("location")["company"].apply(list).to_dict()

def find_best_match(company_name, location):
    if location in company_dict:
        choices = company_dict[location]  
        match = process.extractOne(company_name, choices, scorer=fuzz.WRatio)
        return match[0] if match and match[1] > 80 else None
    return None

vacancies['matched_company'] = vacancies.apply(lambda row: find_best_match(row['company'], row['location']), axis=1)
companies_unique = companies[['company', 'activity']].drop_duplicates(subset=['company'])

vacancies = vacancies.merge(companies_unique, 
                            left_on='matched_company', 
                            right_on='company', 
                            how='left') \
                     .drop(columns=['matched_company', 'company_y']) \
                     .rename(columns={'company_x': 'company'})


In [None]:

industry_keywords = {
    'Internet and IT': ['Information Technology', 'IT', 'IT service', 'Internet','Automation & Technology','Kommunikationssysteme','Technologies','Information Services','Information','Database provider','Innovative companies','CHECK24'],
    'Management Consulting': ['Business consultant', 'Consulting', 'Management','Management companies','McKinsey','Deloitte','Business consultant','consultant'],
    'E-commerce,trade and Retail': ['Sales company', 'Sales', 'Food', 'retail','Commerce','E-Commerce','Trade','Rewe','Amazon','Hypermarkets','supermarkets','shop','shops','Douglas','Mail order','Branded goods','OBI'],
    'Banking, and financial services': ['Savings banks', 'banking', 'Bank', 'financial services','Financial','PayTech','finance','Finanz','Sparkassen','Investment','credit institutions','Financing companies','Financing','banks','Leasing','Volksbanken'],
    'Insurance': ['Insurance companies', 'Insurance', 'insurance agencies', 'health insurance','health insurance offices','Reinsurance','AOK'],
    'Pharmaceutical and medical products and technology': ['Pharmacies', 'Pharmacies, public', 'medical products', 'medical technology','pharmaceutical','chemical'],
    'Computer Software': ['computer retail', 'computer', 'software', 'software service','SAP','Hardware','Hardware stores'],
    'Media and publishing': ['media', 'publishing', 'press', 'multimedia','Film', 'radio','television','publishers'],
    'Industry and mechanical engineering': ['engineering offices', 'engineering', 'machinery', 'systems and apparatus','Technik','Metalworking', 'machines', 'manufacturer'],
    'HR services and consulting': ['Staffing', 'HR consultant', 'Recruiting', 'HR','Employment agency','Employment'],
    'Automotive': ['Auto repair', 'Auto', 'Auto repair shops', 'Automotive','Mercedes-Benz AG','BMW','Ford','Volkswagen','Car','Motor vehicle','Vehicle','Honda','Yacht','Yachts'],
    'Transport and logistics': ['Freight forwarding', 'logistic', 'warehouse', 'railway','Logistics','Logistics company','Transport','traffic','Airports'],
    'Wholesale': ['Wholesale'],
    'Telecommunications': ['telecommunications', 'Telecommunications company','Telecom','Telephone network'],
    'Energy, water and environment': ['Energy', 'water', 'environment','energy supply'],
    'Architecture and planning': ['Architects', 'architectural offices', 'Architecture'],
    'Real estate': ['Real estate agents', 'Real estate', 'real estate agencies','immowelt','Rental', 'brokerage','rental service','Property'],
    'Marketing, advertising, PR and design': ['advertising agencies', 'advertising', 'marketing', 'PR','design','Promotional','Public relations','Communication agencies'],
    'Sport, health and social': ['Clubs', 'Games and sports', 'sport', 'health','dental','welfare','Doctors','healthcare','clinics','Healthcare','clinic','diabetologists','Sports facilities','Psychotherapists','Fitness center','Medical organizations','Institutes of Medicine','Hospitals'],
    'Education and science': ['education','university','schule', 'school','wissenschaftliche','wissenschaft','Universität','Teaching','research','Think Tanks','Language', 'schools','Universities','Institutes of Economics','Educational institutions','educational centers'],
    'Auditing, tax and law': ['Tax advisory professions', 'Tax advisory', 'tax', 'law','Steuern','Steuerberator'],
    'Hotels, tourism, travel agencies': ['Hotels', 'Restaurants and pubs', 'Restaurants', 'Travel agencies', 'travel','Tour operator','Guest houses','Guest house','Hotel companies'], 
    'Building': ['Building','construction'],   
}

# Function for Industry classification
def classify_industry(activity):
    if pd.notna(activity) and activity.strip():
        cleaned_activity = re.sub(r'\W+', ' ', activity).lower()

        for industry, keywords in industry_keywords.items():
            for keyword in keywords:
                cleaned_keyword = re.sub(r'\W+', ' ', keyword).lower()
                
                if re.search(r'\b' + re.escape(cleaned_keyword) + r'\b', cleaned_activity):
                    return industry
        
        return activity
    
    return None  


# vacancies['industry'] = vacancies['activity'].apply(classify_industry)
if vacancies['industry'].isnull().any(): 
    vacancies['industry'] = vacancies['activity'].apply(classify_industry)

vacancies['industry'].value_counts()

industry
E-commerce,trade and Retail                             449
Banking, and financial services                         211
Industry and mechanical engineering                     187
Management Consulting                                   167
Internet and IT                                         157
                                                       ... 
Locksmiths                                                1
Transistors and semiconductor manufacturers               1
Advice centers                                            1
Heating, air conditioning and ventilation installers      1
Air freight offices and companies                         1
Name: count, Length: 149, dtype: int64

#### Change columns places

In [4637]:
vacancies = vacancies[['title', 'company', 'location',
                       'bundesland', 'industry','job_type','job_place',
                       'experience','avg_salary','english','french','german','italian',	'spanish',
                       'ETL','Data Science','Data engineering', 'Analytical skills','Statistics',
                       'Artificial intelligence',	'Soft skills','Python',	'SQL', 
                       'ABAP','Alteryx','Dashboards (Tableau, PowerBI)',
                       'activity','source']]
vacancies

Unnamed: 0,title,company,location,bundesland,industry,job_type,job_place,experience,avg_salary,english,french,german,italian,spanish,ETL,Data Science,Data engineering,Analytical skills,Statistics,Artificial intelligence,Soft skills,Python,SQL,ABAP,Alteryx,"Dashboards (Tableau, PowerBI)",activity,source
0,(Advanced) Data Developer (m/w/d) OBI Smart Technologies GmbH,OBI,Deutschland,Deutschland,,,,Middle,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,Indeed
1,(JUNIOR) BUSINESS ANALYST (W/M/D) / DATA ANALYST (W/M/D) IM CONTROLLING,ZECH MANAGEMENT GMBH,Bremen,Bremen,,,,Junior,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,Indeed
2,(Junior) Account Manager / Customer Development Manager Stammkundenvertrieb (m/w/d),All for One Group SE,Oldenburg,Niedersachsen,,,,Junior,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,Indeed
3,(Junior) Analyst (m/w/d) Data Analytics,Syncwork,Berlin,Berlin,,,,Junior,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,Indeed
4,(Junior) Analyst Modelled data - Technology Topics (m/f/d),Statista GmbH,Hamburg,Hamburg,Internet and IT,,,Junior,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Information Services,Indeed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4049,Pflichtpraktikum im eBusiness - Business Intelligence & Digital Analyst,Liebherr Group,Ulm,Baden-Württemberg,,Full-time,Hybrid work,Senior,,1,0,1,0,0,1,1,0,1,1,0,0,1,1,0,0,1,,Linkedin
4050,"Senior Consultant Valuation, Modeling & Analytics (w/m/d)",PwC Deutschland,Hannover,Niedersachsen,Education and science,Full-time,Hybrid work,Senior,,1,0,1,0,0,0,1,0,1,1,0,1,1,0,0,0,0,Language schools,Linkedin
4051,Intern Consultant Data Science and AI (m/f/d)*,Munich Re,Munich,Munich,,Internship,On-site,No experience,,0,0,0,0,0,0,1,1,1,1,1,1,1,0,0,1,1,,Linkedin
4052,Research Associate/Post-Doctoral Research Fellow – Urban Analytics City Futures Research Centre,UNSW,Berlin,Berlin,,Full-time,Hybrid work,Senior,,0,0,0,0,0,0,1,0,1,1,1,0,1,0,0,0,0,,Linkedin


In [4638]:
vacancies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4054 entries, 0 to 4053
Data columns (total 28 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   title                          4054 non-null   object 
 1   company                        4054 non-null   object 
 2   location                       4011 non-null   object 
 3   bundesland                     4011 non-null   object 
 4   industry                       2677 non-null   object 
 5   job_type                       2789 non-null   object 
 6   job_place                      2105 non-null   object 
 7   experience                     4054 non-null   object 
 8   avg_salary                     1435 non-null   float64
 9   english                        4054 non-null   int64  
 10  french                         4054 non-null   int64  
 11  german                         4054 non-null   int64  
 12  italian                        4054 non-null   i

#### Save to .csv

In [4639]:
# vacancies.to_csv('vacancies.csv', index=False)
# vacancies.to_excel('vacancies.xlsx', index=False)