# Perform the ranking tasks

- randomly select 20 candidate jobs and rank them.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
def show_result(y_true, y_prob):
    y_prediction = [0 if i<=0.5 else 1 for i in y_prob]
    report = classification_report(y_true,y_prediction,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    col_1.append("overall")
    col_2.append(precision_score(y_true, y_prediction))
    col_3.append(recall_score(y_true, y_prediction))
    col_4.append(f1_score(y_true, y_prediction))
    col_5.append(roc_auc_score(y_true, y_prob))
    result = pd.DataFrame()
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    print(result)

In [3]:
user_set = pd.read_csv("user_set_cleaned.csv")
job_set = pd.read_csv("job_set_cleaned.csv")
work_history = pd.read_csv("work_history_cleaned.csv")
dataset = pd.read_csv("dataset_cleaned.csv")

In [5]:
X_train = np.load("X_train.npy")
Y_train = np.load("Y_train.npy")
X_test = np.load("X_test.npy")
Y_test = np.load("Y_test.npy")

# 1. Build datasets

In [6]:
# about 1 min
job_set = job_set.fillna(" ")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=5, max_features=100, stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

In [9]:
word_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

In [10]:
test_user = user_set[user_set.Split=="Test"].UserID.values
test_data = dataset[dataset.UserID.isin(test_user)]

In [11]:
ranking_data = pd.DataFrame(columns = ["UserID","JobID","label", "City", "State"])
job_id = job_set.JobID.unique().tolist()
groups = test_data.groupby("UserID")
user_ids = []
job_ids = []
labels = []
City = []
State = []
for idx, group in tqdm(groups):
    size = 99
    exist_job = group.JobID.unique().tolist()
    candidate_job = [i for i in job_id if i not in exist_job ]
    sample_job = random.sample(range(0,len(candidate_job)),size)
    user_ids.extend([idx] * (size+1))
    job_ids.append(exist_job[0])
    job_ids.extend([candidate_job[i] for i in sample_job])
    labels.append(1)
    labels.extend([0] * (size))
    City.append(group.City.values[0])
    State.append(group.State.values[0])
    jobs = job_set[job_set.JobID.isin([candidate_job[i] for i in sample_job])]
    
    City.extend([0 if i!=group.City.values[0] else a for i in jobs.City.values.tolist()])
    State.extend([0 if i!=group.State.values[0] else a for i in jobs.State.values.tolist()])
    
ranking_data.UserID = user_ids
ranking_data.JobID = job_ids
ranking_data.label = labels
ranking_data.City = City
ranking_data.State = State
# ranking_data.to_csv("ranking_data.csv",index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:05<00:00, 51.32it/s]


# 2. Define the evaluation function

In [12]:
def test_hit_rate(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1,158))
        user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                                "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index.values
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
        X = np.concatenate((X, feature), axis=0)
        result = model.predict_proba(X[1:])
#         result = model.predict(X[1:])
        a = -np.sort(-result[:,1])
        idx = np.argwhere(a==result[0,1])[0][0]
        if idx <= N-1:
            hit += 1
    return hit/len(test_user)

# 3. Test models
- Random Forest

In [14]:
from sklearn.impute import SimpleImputer

# Create the imputer with a chosen strategy (mean, median, most_frequent, or constant)
imputer = SimpleImputer(strategy='mean')

# Impute the missing values in the training and test sets
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Create and fit the RandomForestClassifier model
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_imputed, Y_train)

# Make predictions
y_pred = rf.predict_proba(X_test_imputed)[:, 1]

# Function to show results (assuming show_result is defined)
show_result(Y_test, y_pred)


——————Test——————
          class precision    recall f1-score   support
0             0    0.6353    0.6414   0.6383       527
1             1    0.6379    0.6319   0.6349       527
2      accuracy                       0.6366      1054
3     macro avg    0.6366    0.6366   0.6366      1054
4  weighted avg    0.6366    0.6366   0.6366      1054
5       overall  0.637931  0.631879  0.63489  0.712698


In [16]:
from sklearn.pipeline import Pipeline

# Create the pipeline with imputer and RandomForestClassifier
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('classifier', RandomForestClassifier(random_state=0))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, Y_train)

# Test hit rate using the pipeline
test_hit_rate(pipeline, 1), test_hit_rate(pipeline, 5), test_hit_rate(pipeline, 10), test_hit_rate(pipeline, 20)


100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:09<00:00, 27.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:09<00:00, 27.51it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:09<00:00, 27.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:09<00:00, 27.51it/s]


(0.03461538461538462,
 0.1346153846153846,
 0.2230769230769231,
 0.38076923076923075)

- Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np
from tqdm import tqdm

def test_hit_rate_linearRegr(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1,158))
        user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                                "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()), axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City","State"]].values, np.array(f)), axis=1)
        X = np.concatenate((X, feature), axis=0)
        result = model.predict(X[1:])
        a = -np.sort(-result)
        idx = np.argwhere(a == result[0])[0][0]
        if idx <= N-1:
            hit += 1
    return hit / len(groups)

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', LinearRegression())
])

pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict(X_test)
show_result(Y_test, y_pred)

# Test hit rates
hit_rate_1 = test_hit_rate_linearRegr(pipeline, 1)
hit_rate_5 = test_hit_rate_linearRegr(pipeline, 5)
hit_rate_10 = test_hit_rate_linearRegr(pipeline, 10)
hit_rate_20 = test_hit_rate_linearRegr(pipeline, 20)

print(f"Hit rate for top 1: {hit_rate_1}")
print(f"Hit rate for top 5: {hit_rate_5}")
print(f"Hit rate for top 10: {hit_rate_10}")
print(f"Hit rate for top 20: {hit_rate_20}")


——————Test——————
          class precision    recall  f1-score  support
0             0    0.5251    0.5351    0.5301      527
1             1    0.5261    0.5161    0.5211      527
2      accuracy                        0.5256     1054
3     macro avg    0.5256    0.5256    0.5256     1054
4  weighted avg    0.5256    0.5256    0.5256     1054
5       overall  0.526112  0.516129  0.521073  0.52463


100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 60.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 59.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 61.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 63.64it/s]

Hit rate for top 1: 0.015384615384615385
Hit rate for top 5: 0.10384615384615385
Hit rate for top 10: 0.15384615384615385
Hit rate for top 20: 0.28076923076923077





  0%|                                                                                          | 0/260 [00:00<?, ?it/s]


NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

- Logistic Regression

In [21]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # You can change 'mean' to 'median' or another strategy if needed
    ('classifier', LogisticRegression())
])
pipeline.fit(X_train, Y_train)
y_pred = pipeline.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])
def test_hit_rate_linearRegr(model, N):
    hit = 0
    groups = ranking_data.groupby("UserID")
    for u_id, group in tqdm(groups):
        X = np.zeros((1, 158))
        user = user_set[user_set.UserID == u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                                  "ManagedOthers", "ManagedHowMany"]]
        u_idx = user.index.values[0]
        user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()), axis=1)
        job_id_list = group.JobID.values
        jobs = job_set[job_set.JobID.isin(job_id_list)]
        j_idx = jobs.index
        f = []
        for i in j_idx:
            feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
            f.append(feature)
        feature = np.concatenate((group[["City", "State"]].values, np.array(f)), axis=1)
        X = np.concatenate((X, feature), axis=0)
        result = model.predict(X[1:])
        a = -np.sort(-result)
        idx = np.argwhere(a == result[0])[0][0]
        if idx <= N-1:
            hit += 1
    return hit / len(groups)

# Test hit rates
hit_rate_1 = test_hit_rate_linearRegr(pipeline, 1)
hit_rate_5 = test_hit_rate_linearRegr(pipeline, 5)
hit_rate_10 = test_hit_rate_linearRegr(pipeline, 10)
hit_rate_20 = test_hit_rate_linearRegr(pipeline, 20)

print(f"Hit rate for top 1: {hit_rate_1}")
print(f"Hit rate for top 5: {hit_rate_5}")
print(f"Hit rate for top 10: {hit_rate_10}")
print(f"Hit rate for top 20: {hit_rate_20}")


——————Test——————
          class precision   recall f1-score   support
0             0    0.5333   0.5009   0.5166       527
1             1    0.5295   0.5617   0.5451       527
2      accuracy                      0.5313      1054
3     macro avg    0.5314   0.5313   0.5309      1054
4  weighted avg    0.5314   0.5313   0.5309      1054
5       overall  0.529517  0.56167  0.54512  0.527964


100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 60.32it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 62.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 63.39it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 63.74it/s]

Hit rate for top 1: 0.6461538461538462
Hit rate for top 5: 0.6461538461538462
Hit rate for top 10: 0.6461538461538462
Hit rate for top 20: 0.6461538461538462





- Decision Tree

In [22]:
dt = DecisionTreeClassifier(max_leaf_nodes=1500,random_state=0)
dt.fit(X_train,Y_train)
y_pred = dt.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5024    0.5882    0.5420       527
1             1    0.5034    0.4175    0.4564       527
2      accuracy                        0.5028      1054
3     macro avg    0.5029    0.5028    0.4992      1054
4  weighted avg    0.5029    0.5028    0.4992      1054
5       overall  0.503432  0.417457  0.456432  0.525833


In [23]:
test_hit_rate(dt,1), test_hit_rate(dt,5), test_hit_rate(dt,10), test_hit_rate(dt,20)

100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 62.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 62.26it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 64.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:04<00:00, 64.32it/s]


(0.03076923076923077,
 0.07692307692307693,
 0.23076923076923078,
 0.5423076923076923)

- Naive Bayes

In [25]:


# Initialize SimpleImputer with a strategy (e.g., 'mean', 'median', 'most_frequent')
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data and transform both training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Initialize and fit Gaussian Naive Bayes classifier
nb = GaussianNB()
nb.fit(X_train_imputed, Y_train)

# Predict probabilities for test data
y_pred = nb.predict_proba(X_test_imputed)

# Show results
show_result(Y_test, y_pred[:, 1])


——————Test——————
          class precision    recall  f1-score   support
0             0    0.5174    0.5655    0.5403       527
1             1    0.5209    0.4725    0.4955       527
2      accuracy                        0.5190      1054
3     macro avg    0.5191    0.5190    0.5179      1054
4  weighted avg    0.5191    0.5190    0.5179      1054
5       overall  0.520921  0.472486  0.495522  0.525539


  4%|███▍                                                                             | 11/260 [00:00<00:04, 58.75it/s]


ValueError: Input X contains NaN.
GaussianNB does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

- AdaBoost

In [29]:


# Initialize SimpleImputer with a strategy (e.g., 'mean', 'median', 'most_frequent')
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data and transform both training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Initialize and fit AdaBoostClassifier
ada = AdaBoostClassifier(random_state=0)
ada.fit(X_train_imputed, Y_train)

# Predict probabilities for test data
y_pred = ada.predict_proba(X_test_imputed)

# Show results
show_result(Y_test, y_pred[:, 1])


——————Test——————
          class precision    recall  f1-score  support
0             0    0.5081    0.5370    0.5221      527
1             1    0.5091    0.4801    0.4941      527
2      accuracy                        0.5085     1054
3     macro avg    0.5086    0.5085    0.5081     1054
4  weighted avg    0.5086    0.5085    0.5081     1054
5       overall  0.509054  0.480076  0.494141  0.50184


  4%|███▍                                                                             | 11/260 [00:00<00:07, 34.85it/s]


ValueError: Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

- Gradient Boosting

In [None]:
gbdt = GradientBoostingClassifier(max_depth=10, random_state=0, verbose=1)
gbdt.fit(X_train,Y_train)
y_pred = gbdt.predict_proba(X_test)
show_result(Y_test, y_pred[:,1])

In [None]:
test_hit_rate(gbdt,1),  test_hit_rate(gbdt,5), test_hit_rate(gbdt,10), test_hit_rate(gbdt,20)