In [1]:
#importing packages required to create a synthetic dataset
import pandas as pd
import numpy as np

In [2]:
# parameters
n_users = 10000
n_jobs = 9827

In [3]:
# Generate random user profiles
users = pd.DataFrame({
    'user_id': np.arange(1, n_users+1),
    'age': np.random.randint(18, 60, n_users),
    'education_level': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD']),
    'years_experience': np.random.randint(0, 20, n_users),
    'skills': np.random.choice(['Python', 'SQL', 'Data Analysis','Machine Learning', 'Project Management', 'R'], n_users)
})

In [4]:
# Generate random job listings
jobs = pd.DataFrame({
    'job_id': np.arange(1, n_jobs + 1),
    'job_title': np.random.choice(['Data Scientist', 'Data Analyst', 'Machine Learning Engineer'], n_jobs),
    'required_experience': np.random.randint(0, 15, n_jobs),
    'required_skills': np.random.choice(['Python', 'SQL', 'Data Analysis','R', 'Machine Learning', 'Project Management'], n_jobs),
    'locaton': np.random.choice(['Nairobi', 'Mombasa', 'Kisumu'], n_jobs)
})

In [5]:
# Generate random user-job interactions
interactions = pd.DataFrame({
    'user_id': np.random.choice(users['user_id'], size = 5000),
    'job_id': np.random.choice(jobs['job_id'], size=5000),
    'interaction_score': np.random.randint(1, 5, size=5000) #interaction score e.g 1-5
})

In [6]:
users.head()

Unnamed: 0,user_id,age,education_level,years_experience,skills
0,1,21,PhD,17,Machine Learning
1,2,42,PhD,17,Project Management
2,3,25,PhD,18,Project Management
3,4,44,PhD,13,Machine Learning
4,5,51,PhD,17,R


In [7]:
users.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
age,10000.0,38.5552,12.169707,18.0,28.0,39.0,49.0,59.0
years_experience,10000.0,9.3893,5.760405,0.0,4.0,9.0,14.0,19.0


In [8]:
jobs.head()

Unnamed: 0,job_id,job_title,required_experience,required_skills,locaton
0,1,Machine Learning Engineer,3,Python,Nairobi
1,2,Data Scientist,2,Python,Nairobi
2,3,Data Analyst,2,SQL,Nairobi
3,4,Data Analyst,10,Machine Learning,Mombasa
4,5,Machine Learning Engineer,9,Data Analysis,Nairobi


In [9]:
jobs.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
job_id,9827.0,4914.0,2836.954882,1.0,2457.5,4914.0,7370.5,9827.0
required_experience,9827.0,6.998881,4.330756,0.0,3.0,7.0,11.0,14.0


In [10]:
interactions.head()

Unnamed: 0,user_id,job_id,interaction_score
0,4106,5786,4
1,2959,529,3
2,5639,4188,2
3,7141,418,2
4,9113,1579,2


In [11]:
interactions.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,5000.0,5040.6168,2906.621665,6.0,2532.75,5023.0,7602.25,9999.0
job_id,5000.0,4933.7964,2823.593374,1.0,2479.5,5007.0,7331.25,9827.0
interaction_score,5000.0,2.5052,1.112394,1.0,2.0,3.0,3.0,4.0


In [12]:
# Add job preferences to users
users['preferred_job'] = np.random.choice([
    'Data Scientist', 'Data Analyst', 'Machine Learning Engineer', 'Software Engineer'], size=n_users)

users['preferred_location'] = np.random.choice([
    'Nairobi', 'Mombasa', 'Kisumu', 'Remote'], size=n_users)

users['salary_expectation'] = np.random.randint(50000, 200000, size=n_users)

users.head()

Unnamed: 0,user_id,age,education_level,years_experience,skills,preferred_job,preferred_location,salary_expectation
0,1,21,PhD,17,Machine Learning,Machine Learning Engineer,Mombasa,177870
1,2,42,PhD,17,Project Management,Software Engineer,Mombasa,186425
2,3,25,PhD,18,Project Management,Machine Learning Engineer,Kisumu,134315
3,4,44,PhD,13,Machine Learning,Data Analyst,Nairobi,178646
4,5,51,PhD,17,R,Data Scientist,Kisumu,127532


In [13]:
# Simulate Past Job Search Behaviours
users['search_keywords'] = np.random.choice([
    'Python jobs', 'Data Science jobs', 'Remote work', 'Machine Learning Jobs'],size = n_users)

users['search_frequency'] = np.random.randint(1, 10, size=n_users) #number of searches per week
users['last_search_time'] = pd.to_datetime('2024-10-10') - pd.to_timedelta(np.random.randint(1, 365, size=n_users), unit='d')

users.head()

Unnamed: 0,user_id,age,education_level,years_experience,skills,preferred_job,preferred_location,salary_expectation,search_keywords,search_frequency,last_search_time
0,1,21,PhD,17,Machine Learning,Machine Learning Engineer,Mombasa,177870,Python jobs,3,2024-05-17
1,2,42,PhD,17,Project Management,Software Engineer,Mombasa,186425,Remote work,1,2024-09-28
2,3,25,PhD,18,Project Management,Machine Learning Engineer,Kisumu,134315,Data Science jobs,1,2023-11-22
3,4,44,PhD,13,Machine Learning,Data Analyst,Nairobi,178646,Python jobs,3,2024-03-14
4,5,51,PhD,17,R,Data Scientist,Kisumu,127532,Machine Learning Jobs,5,2024-03-05


In [14]:
# Add job popularity feature (number of applications)
jobs['applications'] = np.random.randint(10, 500, size=n_jobs)

jobs.head()

Unnamed: 0,job_id,job_title,required_experience,required_skills,locaton,applications
0,1,Machine Learning Engineer,3,Python,Nairobi,257
1,2,Data Scientist,2,Python,Nairobi,17
2,3,Data Analyst,2,SQL,Nairobi,98
3,4,Data Analyst,10,Machine Learning,Mombasa,322
4,5,Machine Learning Engineer,9,Data Analysis,Nairobi,396


In [15]:
# Incorporate Skills Matching Score for each user-job interaction
def calculate_skill_match(user_skills, job_skills):
    user_skills_set = set(user_skills.split(','))
    job_skills_set = set(job_skills.split(','))
    return len(user_skills_set.intersection(job_skills_set)) / len(job_skills_set)


In [16]:
def calculate_skill_match(user_skills, job_skills):
    #Ensure they are strings
    if not isinstance(user_skills, str) or not isinstance(job_skills, str):
        return 0 #if skills data is invalid
    
    #convert the skills strings  to sets
    user_skills_set=set(user_skills.split(','))
    job_skills_set=set(job_skills.split(','))
    
    #if job requires no skills, return 0 match
    if len(job_skills_set) == 0:
        return 0
    
    # calculate the intersection and return match score
    return len(user_skills_set.intersection(job_skills_set)) / len(job_skills_set)

In [17]:
# Merge datasets
data = pd.merge(interactions, users, on='user_id')
df = pd.merge(data, jobs, on='job_id')

In [18]:
# Apply this logic to the interactions data
df['skills_match_score'] = df.apply(
    lambda row: calculate_skill_match(row['skills'], row['required_skills']), axis=1)

In [19]:
df.head()

Unnamed: 0,user_id,job_id,interaction_score,age,education_level,years_experience,skills,preferred_job,preferred_location,salary_expectation,search_keywords,search_frequency,last_search_time,job_title,required_experience,required_skills,locaton,applications,skills_match_score
0,4106,5786,4,46,PhD,1,SQL,Software Engineer,Mombasa,194692,Remote work,6,2023-11-25,Machine Learning Engineer,14,R,Nairobi,14,0.0
1,2959,529,3,42,PhD,13,Machine Learning,Data Analyst,Kisumu,168625,Python jobs,7,2024-03-02,Machine Learning Engineer,2,Data Analysis,Nairobi,250,0.0
2,5639,4188,2,40,PhD,9,Data Analysis,Machine Learning Engineer,Mombasa,178473,Data Science jobs,5,2023-10-26,Data Analyst,9,SQL,Mombasa,462,0.0
3,7141,418,2,59,PhD,10,Machine Learning,Data Analyst,Nairobi,104139,Machine Learning Jobs,4,2024-03-15,Data Scientist,11,Machine Learning,Kisumu,311,1.0
4,9113,1579,2,57,PhD,5,Project Management,Software Engineer,Mombasa,127522,Remote work,3,2024-02-25,Machine Learning Engineer,11,Project Management,Nairobi,25,1.0


In [20]:
# Adding job Duartion in the Interaction Data
# Simulating time spent viewing the job in seconds
df['time_spent'] = np.random.randint(10, 300, size=len(df))
df.head()

Unnamed: 0,user_id,job_id,interaction_score,age,education_level,years_experience,skills,preferred_job,preferred_location,salary_expectation,search_keywords,search_frequency,last_search_time,job_title,required_experience,required_skills,locaton,applications,skills_match_score,time_spent
0,4106,5786,4,46,PhD,1,SQL,Software Engineer,Mombasa,194692,Remote work,6,2023-11-25,Machine Learning Engineer,14,R,Nairobi,14,0.0,130
1,2959,529,3,42,PhD,13,Machine Learning,Data Analyst,Kisumu,168625,Python jobs,7,2024-03-02,Machine Learning Engineer,2,Data Analysis,Nairobi,250,0.0,218
2,5639,4188,2,40,PhD,9,Data Analysis,Machine Learning Engineer,Mombasa,178473,Data Science jobs,5,2023-10-26,Data Analyst,9,SQL,Mombasa,462,0.0,217
3,7141,418,2,59,PhD,10,Machine Learning,Data Analyst,Nairobi,104139,Machine Learning Jobs,4,2024-03-15,Data Scientist,11,Machine Learning,Kisumu,311,1.0,74
4,9113,1579,2,57,PhD,5,Project Management,Software Engineer,Mombasa,127522,Remote work,3,2024-02-25,Machine Learning Engineer,11,Project Management,Nairobi,25,1.0,146


In [21]:
# Simulate job clicks, saves, and applications
df['job_clicks'] = np.random.randint(0,10, size=len(df)) #how many times the user clicked on the job
df['job_saves'] = np.random.randint(0,5, size=len(df))
df['applied'] = np.random.choice([0, 1], size=len(df)) # if user applied 1 and 0 otherwise

In [22]:
# Simulate user job history
users['past_jobs'] = np.random.choice(['Junior Data Analyst', 'Senior Developer', 'Consultant', 'Intern', 'Data Engineer',
                                      'Data Scientist', 'Junior Data Engineer', 'Machine Learning Engineer',
                                       'Research Assistant'], size=n_users)
users['tenure_years'] = np.random.randint(1, 10, size=n_users) # number of years in the past job

users.head()

Unnamed: 0,user_id,age,education_level,years_experience,skills,preferred_job,preferred_location,salary_expectation,search_keywords,search_frequency,last_search_time,past_jobs,tenure_years
0,1,21,PhD,17,Machine Learning,Machine Learning Engineer,Mombasa,177870,Python jobs,3,2024-05-17,Data Scientist,8
1,2,42,PhD,17,Project Management,Software Engineer,Mombasa,186425,Remote work,1,2024-09-28,Senior Developer,3
2,3,25,PhD,18,Project Management,Machine Learning Engineer,Kisumu,134315,Data Science jobs,1,2023-11-22,Consultant,3
3,4,44,PhD,13,Machine Learning,Data Analyst,Nairobi,178646,Python jobs,3,2024-03-14,Consultant,7
4,5,51,PhD,17,R,Data Scientist,Kisumu,127532,Machine Learning Jobs,5,2024-03-05,Intern,3


In [23]:
# job posting dates and deadlines
jobs['posted_date'] = pd.to_datetime('2024-09-01') - pd.to_timedelta(np.random.randint(1, 30, size=n_jobs), unit='d')
jobs['application_deadline'] = jobs['posted_date'] + pd.to_timedelta(np.random.randint(7, 30, size=n_jobs), unit='d')

jobs.head()

Unnamed: 0,job_id,job_title,required_experience,required_skills,locaton,applications,posted_date,application_deadline
0,1,Machine Learning Engineer,3,Python,Nairobi,257,2024-08-13,2024-08-31
1,2,Data Scientist,2,Python,Nairobi,17,2024-08-10,2024-08-18
2,3,Data Analyst,2,SQL,Nairobi,98,2024-08-25,2024-09-07
3,4,Data Analyst,10,Machine Learning,Mombasa,322,2024-08-09,2024-09-02
4,5,Machine Learning Engineer,9,Data Analysis,Nairobi,396,2024-08-15,2024-08-22


In [24]:
# Merge additional features to the main dataset
df.columns

Index(['user_id', 'job_id', 'interaction_score', 'age', 'education_level',
       'years_experience', 'skills', 'preferred_job', 'preferred_location',
       'salary_expectation', 'search_keywords', 'search_frequency',
       'last_search_time', 'job_title', 'required_experience',
       'required_skills', 'locaton', 'applications', 'skills_match_score',
       'time_spent', 'job_clicks', 'job_saves', 'applied'],
      dtype='object')

In [25]:
users.columns

Index(['user_id', 'age', 'education_level', 'years_experience', 'skills',
       'preferred_job', 'preferred_location', 'salary_expectation',
       'search_keywords', 'search_frequency', 'last_search_time', 'past_jobs',
       'tenure_years'],
      dtype='object')

In [26]:
jobs.columns

Index(['job_id', 'job_title', 'required_experience', 'required_skills',
       'locaton', 'applications', 'posted_date', 'application_deadline'],
      dtype='object')

In [27]:
df = pd.merge(df, jobs[['job_id', 'posted_date', 'application_deadline']], on='job_id')

In [28]:
df.head()

Unnamed: 0,user_id,job_id,interaction_score,age,education_level,years_experience,skills,preferred_job,preferred_location,salary_expectation,...,required_skills,locaton,applications,skills_match_score,time_spent,job_clicks,job_saves,applied,posted_date,application_deadline
0,4106,5786,4,46,PhD,1,SQL,Software Engineer,Mombasa,194692,...,R,Nairobi,14,0.0,130,0,1,1,2024-08-09,2024-09-01
1,2959,529,3,42,PhD,13,Machine Learning,Data Analyst,Kisumu,168625,...,Data Analysis,Nairobi,250,0.0,218,6,1,0,2024-08-21,2024-08-31
2,5639,4188,2,40,PhD,9,Data Analysis,Machine Learning Engineer,Mombasa,178473,...,SQL,Mombasa,462,0.0,217,4,0,0,2024-08-27,2024-09-19
3,7141,418,2,59,PhD,10,Machine Learning,Data Analyst,Nairobi,104139,...,Machine Learning,Kisumu,311,1.0,74,8,3,0,2024-08-15,2024-08-22
4,9113,1579,2,57,PhD,5,Project Management,Software Engineer,Mombasa,127522,...,Project Management,Nairobi,25,1.0,146,6,0,0,2024-08-18,2024-09-11


In [29]:
df.duplicated().sum()

0

In [30]:
df.isna().sum()

user_id                 0
job_id                  0
interaction_score       0
age                     0
education_level         0
years_experience        0
skills                  0
preferred_job           0
preferred_location      0
salary_expectation      0
search_keywords         0
search_frequency        0
last_search_time        0
job_title               0
required_experience     0
required_skills         0
locaton                 0
applications            0
skills_match_score      0
time_spent              0
job_clicks              0
job_saves               0
applied                 0
posted_date             0
application_deadline    0
dtype: int64

In [34]:
# Features to use for training
feature_columns = [
    'skills_match_score', 'time_spent', 'job_clicks', 'job_saves',
    'applications', 'salary_expectation','search_frequency'
]

In [35]:
target_column ='applied'

In [36]:
x = df[feature_columns]
y= df[target_column]

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

<h2> Choosing a Model </h2>
    We can start by training a simple logistic regression model to predict whether a user would apply for job or not (binary classification).

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Initialize the model
model = LogisticRegression(max_iter=1000)

In [41]:
# Train the model
model.fit(X_train, y_train)

<h2> Evaluate the Model </h2>

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [43]:
# Make predictions
y_pred = model.predict(X_test)

In [44]:
# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score=(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [45]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.495
Precision: 0.4953271028037383
Recall: (1501    0
2586    1
2653    1
1055    1
705     0
       ..
4711    1
2313    1
3214    0
2732    0
1926    1
Name: applied, Length: 1000, dtype: int32, array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1

<h2> Model Improvement </h2>

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
# Initialize Random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [48]:
# Train the Random Forest Model
rf_model.fit(X_train, y_train)

In [49]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)

In [50]:
# Evaluate Random Forest performance
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score=(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)

In [51]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.486
Precision: 0.4851694915254237
Recall: (1501    0
2586    1
2653    1
1055    1
705     0
       ..
4711    1
2313    1
3214    0
2732    0
1926    1
Name: applied, Length: 1000, dtype: int32, array([0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0

<h2> Generate Job Predictions</h2>
Once the model is trained, we can use it to recommend jobs predicting the probability that a user will apply fr a partcular job.

In [52]:
# predict the prob that a user will apply
prob = model.predict_proba(X_test)[:, 1] # probability of class 1 is applied and 0 otherwise

In [53]:
# Rank jobs by probability
recommendations = X_test.copy()
recommendations['apply_prob'] = prob

In [54]:
# Sort recommendations by probability
recommendations = recommendations.sort_values(by='apply_prob', ascending=False)

In [55]:
# Display top recommendations
recommendations.head(10)

Unnamed: 0,skills_match_score,time_spent,job_clicks,job_saves,applications,salary_expectation,search_frequency,apply_prob
2462,1.0,15,6,4,479,71748,9,0.563534
3049,1.0,133,1,4,467,91675,4,0.559614
2223,1.0,78,0,4,309,102914,1,0.555682
3295,1.0,154,4,4,467,67310,4,0.554077
605,1.0,54,2,4,264,50581,3,0.553293
2594,1.0,144,0,4,462,199802,3,0.55219
4231,0.0,20,0,4,492,52494,9,0.550726
175,1.0,28,7,3,334,106447,2,0.5487
4707,1.0,139,0,3,403,138996,5,0.547332
2575,1.0,185,1,3,488,111069,8,0.547006
