<a href="https://colab.research.google.com/github/philadelphia24/Thesis-Job-Recommender-System-/blob/main/Synthetic_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Synthetic data

## The clean dataset

In [None]:
#Install all necessary packages
!pip install faker
import pandas as pd
from faker import Faker
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faker
  Downloading Faker-18.6.2-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-18.6.2


In [None]:
# Initialize Faker and set random seed
faker = Faker("nl_BE") 
random.seed(42)

In [None]:
# Define job titles, languages, and education levels
job_title = ['Truck Driver', 'Janitor', 'Cleaner', 'Administrative Clerk', 'Cashier', 'Data Analyst', 'Finance Manager', 'Professor']
languages = ['Dutch', 'French', 'German', 'English', 'Spanish']
education_levels = ['High School', 'Bachelor', 'Master',  'PhD']
industry= ['Transport', 'Logistics', 'Cleaning', 'Hotels and restaurants', 'Construction', 'Manufacturing', 'Financial intermediation', 'Pharmaceutical', 'Health and social services', 'Wholesale and retail', 'Technology', 'Research', 'Administration', 'Education', 'Energy', 'Agriculture']
experience_years = ['0-1', '2-4', '5-10', '10+']
#limitation: "Cleaning" is not really an industry, but renders our recommendation easier.

In [None]:
# Create job seekers DataFrame
nb_jobseekers = 9000
jobseekers_data = {'Name': [faker.name() for i in range(nb_jobseekers)], 
                   'Language': [random.choice(languages) for i in range(nb_jobseekers)],
                   'Education Level': [random.choice(education_levels) for i in range(nb_jobseekers)],
                   'Industry': [random.choice(industry) for i in range(nb_jobseekers)],
                   'Experience': [random.choice(experience_years) for i in range(nb_jobseekers)],
                   }
jobseekers_df = pd.DataFrame(jobseekers_data)

In [None]:
#Jobseeker definitions
def conditions_1(row):
    if row['Industry'] == 'Cleaning':
        return 'High school'
    elif row['Industry'] == 'Education':
        return random.choice(['Bachelor', 'Master', 'PhD'])
    else:
        return row['Education Level']

# Apply the conditions function to the Education Level column in the vacancies DataFrame
jobseekers_df['Education Level'] = jobseekers_df.apply(conditions_1, axis=1)

In [None]:
# create vacancies DataFrame 
nb_vacancies = 500

vacancies_data = {'Job Title': [random.choice(job_title) for i in range(nb_vacancies)], 
                  'Language': [random.choice(languages) for i in range(nb_vacancies)],
                  'Education Level': ['']*nb_vacancies, # initialize with empty strings
                  'Industry': [random.choice(industry) for i in range(nb_vacancies)],
                  'Experience': [random.choice(experience_years) for i in range(nb_vacancies)],
                  }
vacancies_df = pd.DataFrame(vacancies_data)

In [None]:
# Modify the conditions function to assign the education level based on job title
def conditions_2(row):
    if row['Job Title'] == 'Professor':
        return 'PhD'
    elif row['Job Title'] == 'Finance Manager':
        return random.choice(['Bachelor', 'Master'])
    elif row['Job Title'] == 'Data Analyst':
        return random.choice(['Master','Bachelor'])
    elif row['Job Title'] == 'Cashier':
        return 'High School'
    elif row['Job Title'] == 'Administrative Clerk':
        return random.choice(['High school', 'Bachelor'])
    elif row['Job Title'] == 'Cleaner':
        return 'High School'
    elif row['Job Title'] == 'Janitor':
        return 'High School'
    elif row['Job Title'] == 'Truck Driver':
        return 'High School'
    else:
        return row['Education Level']
    
# Apply the conditions function to the Education Level column in the vacancies DataFrame
vacancies_df['Education Level'] = vacancies_df.apply(conditions_2, axis=1)

In [None]:
def conditions_3(row):
    if row['Job Title'] == 'Professor':
        return 'Research'
    elif row['Job Title'] == 'Finance Manager':
        return random.choice([i for i in industry]) #see if this works
    elif row['Job Title'] == 'Data Analyst':
        return random.choice([i for i in industry if i != 'Cleaning'])
    elif row['Job Title'] == 'Cashier':
        return 'Wholesale and retail'
    elif row['Job Title'] == 'Administrative Clerk':
        return 'Administration'
    elif row['Job Title'] == 'Cleaner':
        return 'Cleaning'
    elif row['Job Title'] == 'Janitor':
        return 'Cleaning'
    elif row['Job Title'] == 'Truck Driver':
        return 'Transport'
    else:
        return row['Industry']

# Apply the conditions function to the Industry Level column in the vacancies DataFrame
vacancies_df['Industry'] = vacancies_df.apply(conditions_3, axis=1)

In [None]:
def conditions_4(row):
    if row['Job Title'] == 'Finance Manager':
        return random.choice(['5-10', '10+'])
    else:
        return row['Experience']

# Apply the conditions function to the Experience Level column in the vacancies DataFrame
vacancies_df['Experience'] = vacancies_df.apply(conditions_4, axis=1)

In [None]:
# Add random vacancy and jobseeker IDs to the DataFrame
# generate unique jobseeker IDs
jobseeker_ids = random.sample(range(10000, 25000), len(jobseekers_df))
jobseekers_df.insert(0, 'JobseekerID', jobseeker_ids)

# generate unique vacancy IDs
vacancy_ids = random.sample(range(25001, 50000), len(vacancies_df))
vacancies_df.insert(0, 'VacancyID', vacancy_ids)

In [None]:
# Create a list to hold the matches
matches = []

# Iterate over jobseekers and vacancies and calculate match rating
for i, jobseeker in jobseekers_df.iterrows():
    num_ratings = 0  # Initialize number of ratings for this particular jobseeker to 0
    
    # Only consider jobseekers with matching industry and Language!!This enables them to be "eligible" for the job of NXTPeople and therefore, to assign some ratings after first week
    matching_vacancies = vacancies_df[(vacancies_df['Industry'] == jobseeker['Industry']) & (vacancies_df['Language'] == jobseeker['Language'])]
    
    for j, vacancy in matching_vacancies.iterrows():
        if num_ratings >= 12:  # Skip this vacancy if the maximum limit has been reached. It is not possible in reality that a jobseeker has rates more than 5 jobs.
            break
        if jobseeker['Experience'] == vacancy['Experience'] and jobseeker['Education Level'] == vacancy['Education Level']:
              rating = 5
        elif jobseeker['Experience'] == vacancy['Experience'] or jobseeker['Education Level'] == vacancy['Education Level']:
            rating = random.randint(4, 5) 
        else:
            rating = random.randint(1, 4)     
        
        matches.append({'JobseekerID': jobseekers_df.loc[i, 'JobseekerID'], 'JobseekerName': jobseekers_df.loc[i, 'Name'], 'VacancyID':  vacancies_df.loc[j, 'VacancyID'],'JobTitle':  vacancies_df.loc[j, 'Job Title'], 'Rating': rating})
        num_ratings += 1  # Increment the number of ratings for this jobseeker

matches_df = pd.DataFrame(matches)

In [None]:
#display vacancy and jobseeker IDs Dataframe
display(vacancies_df)

Unnamed: 0,VacancyID,Job Title,Language,Education Level,Industry,Experience
0,49442,Cashier,French,High School,Wholesale and retail,5-10
1,31482,Data Analyst,Dutch,Bachelor,Administration,0-1
2,39069,Janitor,French,High School,Cleaning,10+
3,31129,Finance Manager,Dutch,Master,Cleaning,5-10
4,44258,Data Analyst,German,Master,Wholesale and retail,0-1
...,...,...,...,...,...,...
495,48744,Data Analyst,Spanish,Bachelor,Financial intermediation,5-10
496,49498,Finance Manager,German,Bachelor,Manufacturing,5-10
497,30794,Professor,German,PhD,Research,0-1
498,34852,Janitor,English,High School,Cleaning,5-10


In [None]:
display(jobseekers_df)

Unnamed: 0,JobseekerID,Name,Language,Education Level,Industry,Experience
0,20473,Franky Verschueren,Dutch,PhD,Technology,2-4
1,22762,Frank Mertens,Dutch,PhD,Hotels and restaurants,0-1
2,10466,Jeanine Thys,German,PhD,Administration,10+
3,10978,Veerle Martens,French,High School,Construction,10+
4,15303,Hugo Van Gestel,French,High School,Manufacturing,0-1
...,...,...,...,...,...,...
8995,18299,Maria Verheyden,Spanish,High school,Cleaning,0-1
8996,17686,Carmen Dewulf,English,High School,Health and social services,2-4
8997,18212,Bruno Mariën,Dutch,PhD,Logistics,5-10
8998,23689,Koen De Smet,Dutch,High School,Research,10+


In [None]:
display(matches_df)

Unnamed: 0,JobseekerID,JobseekerName,VacancyID,JobTitle,Rating
0,20473,Franky Verschueren,30356,Finance Manager,3
1,22762,Frank Mertens,43396,Finance Manager,4
2,10466,Jeanine Thys,34808,Administrative Clerk,3
3,10466,Jeanine Thys,36153,Administrative Clerk,3
4,10466,Jeanine Thys,38808,Administrative Clerk,3
...,...,...,...,...,...
45312,21446,Kathy Van Daele,40802,Janitor,3
45313,21446,Kathy Van Daele,40283,Cleaner,5
45314,21446,Kathy Van Daele,42948,Janitor,2
45315,21446,Kathy Van Daele,27350,Cleaner,2


In [None]:
#Exporting the data to csv

# Save the DataFrames as CSV files
matches_df.to_csv('matches.csv', index=False) 
jobseekers_df.to_csv('jobseekers.csv', index=False)
vacancies_df.to_csv('vacancies.csv', index=False)

# Authenticate and mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Save the DataFrames as CSV files to the 'My Drive' folder in Google Drive
matches_df.to_csv('/content/drive/My Drive/matches.csv', index=False)
jobseekers_df.to_csv('/content/drive/My Drive/jobseekers.csv', index=False)
vacancies_df.to_csv('/content/drive/My Drive/vacancies.csv', index=False)

Mounted at /content/drive


##Noise: Option 1

*Uniform distribution of Ratings*

Let us first add noise to the dataset. Hence, no NA values will be produces but NA can be used to check how many values were replaced here.

In [None]:
from google.colab import files
uploaded = files.upload()

import numpy as np
random.seed(42)

import io
matches_df = pd.read_csv(io.BytesIO(uploaded['matches.csv']))

Saving matches.csv to matches (6).csv


In [None]:
#To check the original distribution of ratings
print(matches_df['Rating'].value_counts(ascending=False).sort_index())

1     6852
2     6670
3     6666
4    14824
5    10305
Name: Rating, dtype: int64


In [None]:
data_with_noise_df = matches_df

In [None]:
percentage_noise= 0.80
noise = round(percentage_noise * len(data_with_noise_df))

In [None]:
data_with_noise_df.loc[data_with_noise_df.sample(noise).index, 'Rating'] = random.randint(1, 5) 
#data_with_noise_df.loc[...] selects the rows with the corresponding index values and the "Rating" column.
#data_with_noise_df.sample(noise) selects a random sample of rows from the DataFrame with size noise.
#data_with_noise_df.sample(noise).index gets the index values of the selected rows.

In [None]:
data_with_noise_df

Unnamed: 0,JobseekerID,VacancyID,Rating
0,20473,30356,4
1,22762,43396,4
2,10466,34808,4
3,10466,36153,4
4,10466,38808,3
...,...,...,...
45312,21446,40802,4
45313,21446,40283,4
45314,21446,42948,4
45315,21446,27350,4


In [None]:
print(data_with_noise_df['Rating'].value_counts(ascending=False).sort_index())

1     1309
2     1372
3     1298
4    39316
5     2022
Name: Rating, dtype: int64


In [None]:
#Exporting the data to csv

# Save the DataFrames as CSV files
data_with_noise_df.to_csv('data_with_noise_80%.csv', index=False) 

# Authenticate and mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Save the DataFrames as CSV files to the 'My Drive' folder in Google Drive
data_with_noise_df.to_csv('/content/drive/My Drive/data_with_nois80%.csv', index=False)

Mounted at /content/drive


## Noise: Option 2

*Lognormal distribution of Ratings*

We came up with a second option, because the noise of option 1 outputted the same MAE values as the clean data in our Collaborative Filtering.

In [4]:
from google.colab import files
uploaded = files.upload()

import io
matches_df = pd.read_csv(io.BytesIO(uploaded['matches.csv']))

Saving matches.csv to matches (2).csv


NameError: ignored

In [None]:
data_with_noise_df = matches_df

In [None]:
print(matches_df['Rating'].value_counts())

4    14824
5    10305
1     6852
2     6670
3     6666
Name: Rating, dtype: int64


In [None]:
percentage_noise= 0.10

In [None]:
import numpy as np
random.seed(42)

# Set the mean and standard deviation of the lognormal distribution
mu = 0  # Mean of the logarithmic values (log(1))
sigma = 1  # Standard deviation of the logarithmic values

# Generate lognormally distributed noise
noise = np.random.lognormal(mu, sigma, size=round(percentage_noise * len(data_with_noise_df)))
# numpy.random.lognormal() function generates random values from a lognormal distribution with mean mu and standard deviation sigma.
# size parameter specifies the number of noise values to generate
# sample() method of the dataframe is used to select a random subset of rows with size equal to the number of generated noise values. 

# Select a random subset of rows and assign them the generated noise
data_with_noise_df.loc[data_with_noise_df.sample(noise.size).index, 'Rating'] += noise
# The += operator is used to add the generated noise to the "Rating" column of the selected rows.

# Round the final values of the "Rating" column to 0 decimal places
data_with_noise_df['Rating'] = round(data_with_noise_df['Rating'], 0)

# Clip the values of the "Rating" column to a maximum of 5
data_with_noise_df['Rating'] = data_with_noise_df['Rating'].clip(1, 5)

In [None]:
data_with_noise_df

Unnamed: 0,JobseekerID,VacancyID,Rating
0,20473,30356,5.0
1,22762,43396,1.0
2,10466,34808,4.0
3,10466,36153,3.0
4,10466,38808,5.0
...,...,...,...
45312,21446,40802,5.0
45313,21446,40283,5.0
45314,21446,42948,2.0
45315,21446,27350,3.0


In [None]:
if len(data_with_noise_df) == len(matches_df):
  print(f"Only {percentage_noise} of rows were replaced.")
else:
  print("More than {percentage_noise} of rows were replaced.")

Only 0.8 of rows were replaced.


In [None]:
print(data_with_noise_df['Rating'].value_counts())

5.0    22558
4.0     9442
3.0     5660
2.0     4968
1.0     2689
Name: Rating, dtype: int64


In [None]:
#Exporting the data to csv

# Save the DataFrames as CSV files
data_with_noise_df.to_csv('matches_ln80.csv', index=False) 

# Authenticate and mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Save the DataFrames as CSV files to the 'My Drive' folder in Google Drive
data_with_noise_df.to_csv('/content/drive/My Drive/matches_ln80.csv', index=False)

Mounted at /content/drive
