# Baseline Models
- TF-IDF vectors to represent the texts.
- Use several machine learning models.

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [3]:
user_set = pd.read_csv("user_set.csv")
job_set = pd.read_csv("job_set_cleaned.csv")
work_history = pd.read_csv("work_history.csv")
dataset = pd.read_csv("dataset.csv")

# 1. TF-IDF vectors for text representation

In [4]:
# about 1 min
job_set = job_set.fillna(" ")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=5, max_features=100, stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

# 2. Filter out users with more than 10 applications

In [5]:
temp = sorted(dict(dataset.UserID.value_counts()).items(), key=lambda x: x[1], reverse=True)
exclude_user_id = [i[0] for i in temp if i [1]>=10]
len(exclude_user_id)

6765

In [6]:
dataset = dataset[~dataset.UserID.isin(exclude_user_id)]

- select data in ```work_history,user_set```

In [7]:
user_id = dataset.UserID.unique()
work_history = work_history[work_history.UserID.isin(user_id)]
user_set = user_set[user_set.UserID.isin(user_id)]
user_set.reset_index(drop=True, inplace=True)

- drop duplicates in ```work_history```

In [8]:
work_history = work_history.drop(columns=["Sequence"]).drop_duplicates()

In [29]:
word_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

# 3. Deal with the user set and the job set

In [12]:
user_set = user_set.drop(columns=["Country","ZipCode","Major","GraduationDate","WindowID"])

In ```user_set``` 
- label encoding for ```DegreeType```
- one-hot encoding for ```State```
- binary labels for Currently ```Employed/ManagedOthers```

In [13]:
# user_set = pd.get_dummies(user_set, columns=["State"])
user_set.replace({"CurrentlyEmployed":{"Yes":1,"No":0}}, inplace=True)
user_set.replace({"ManagedOthers":{"Yes":1,"No":0}}, inplace=True)
user_set.replace({"DegreeType":{"None":0,"High School":1, "Vocational":2, "Associate's":3, "Bachelor's":4, "Master's":5, "PhD":6}}, 
                 inplace=True)

In [14]:
user_set

Unnamed: 0,UserID,Split,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,13,Test,Philadelphia,PA,4.0,6,5.0,1,0,0
1,64,Train,Columbus,OH,5.0,3,22.0,1,0,0
2,101,Train,Brick,NJ,1.0,1,2.0,0,1,4
3,133,Train,Wilmington,DE,4.0,6,9.0,1,1,6
4,182,Train,Lenexa,KS,1.0,3,5.0,1,1,10
...,...,...,...,...,...,...,...,...,...,...
18741,1471625,Train,Indianapolis,IN,4.0,4,4.0,1,1,10
18742,1471661,Train,Shartlesville,PA,4.0,1,3.0,0,0,0
18743,1471838,Train,Peoria,AZ,5.0,3,8.0,1,0,0
18744,1471948,Train,Glendale,AZ,1.0,4,6.0,0,0,0


- add binary labels into the dataset, indicating that whether the user and job are in the same city/state.

In [15]:
city = []
state = []
groups = dataset.groupby("UserID")
for idx, group in tqdm(groups):
    user_city = user_set[user_set.UserID==idx]["City"].values
    user_state = user_set[user_set.UserID==idx]["State"].values
    job_id_list = group.JobID.values
    job_city = job_set[job_set.JobID.isin(job_id_list)]["City"].values
    job_state = job_set[job_set.JobID.isin(job_id_list)]["State"].values
    city.extend([0 if i!=user_city else 1 for i in job_city])
    state.extend([0 if i!=user_state else 1 for i in job_state])
dataset["City"] = city
dataset["State"] = state

100%|███████████████████████████████████████████████████████████████████████████| 18746/18746 [01:49<00:00, 171.92it/s]


In [16]:
user_set.to_csv("user_set_cleaned.csv", index=False)
dataset.to_csv("dataset_cleaned.csv", index=False)
work_history.to_csv("work_history_cleaned.csv", index=False)

# 4. Build the training set and testing set

In [17]:
train_user = user_set[user_set.Split=="Train"].UserID.values
test_user = user_set[user_set.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [18]:
groups = train_data.groupby("UserID")
X_train = np.zeros((1,158))
Y_train = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_train = np.concatenate((X_train, feature), axis=0)
    Y_train.extend(group.label.values.tolist())

100%|████████████████████████████████████████████████████████████████████████████| 18486/18486 [11:39<00:00, 26.44it/s]


In [19]:
X_train.shape, len(Y_train)

((70669, 158), 70668)

In [20]:
groups = test_data.groupby("UserID")
X_test = np.zeros((1,158))
Y_test = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_test = np.concatenate((X_test, feature), axis=0)
    Y_test.extend(group.label.values.tolist())

100%|████████████████████████████████████████████████████████████████████████████████| 260/260 [00:02<00:00, 87.06it/s]


In [21]:
X_test.shape, len(Y_test)

((1055, 158), 1054)

In [22]:
np.save("X_train.npy",X_train[1:,])
np.save("Y_train.npy",np.array(Y_train))
np.save("X_test.npy",X_test[1:,])
np.save("Y_test.npy",np.array(Y_test))

# 5. Construct models

In [23]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [24]:
def show_result(y_true, y_prediction):
    report = classification_report(y_true,y_prediction,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    col_1.append("overall")
    col_2.append(precision_score(y_true, y_prediction))
    col_3.append(recall_score(y_true, y_prediction))
    col_4.append(f1_score(y_true, y_prediction))
    col_5.append(roc_auc_score(y_true, y_prediction))
    result = pd.DataFrame()
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    print(result)

In [25]:
X_train = np.load("X_train.npy")
X_test = np.load("X_test.npy")
Y_train = np.load("Y_train.npy")
Y_texs = np.load("Y_test.npy")

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Create the imputer and fill missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Fit the model
lr = LinearRegression()
lr.fit(X_train_imputed, Y_train)

# Predict and transform the predictions
y_pred = lr.predict(X_test_imputed)
y_pred = [0 if i < 0.5 else 1 for i in y_pred]
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5251    0.5351    0.5301       527
1             1    0.5261    0.5161    0.5211       527
2      accuracy                        0.5256      1054
3     macro avg    0.5256    0.5256    0.5256      1054
4  weighted avg    0.5256    0.5256    0.5256      1054
5       overall  0.526112  0.516129  0.521073  0.525617


In [33]:
from sklearn.linear_model import LogisticRegression


# Create the imputer with a chosen strategy (mean, median, most_frequent, or constant)
imputer = SimpleImputer(strategy='mean')

# Impute the missing values in the training and test sets
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Create and fit the Logistic Regression model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_imputed, Y_train)

# Make predictions
y_pred = lr.predict(X_test_imputed)

# Function to show results (assuming show_result is defined)
show_result(Y_test, y_pred)


——————Test——————
          class precision    recall  f1-score   support
0             0    0.5261    0.5351    0.5306       527
1             1    0.5270    0.5180    0.5225       527
2      accuracy                        0.5266      1054
3     macro avg    0.5266    0.5266    0.5265      1054
4  weighted avg    0.5266    0.5266    0.5265      1054
5       overall  0.527027  0.518027  0.522488  0.526565


In [35]:
from sklearn.naive_bayes import GaussianNB

# Create the imputer with a chosen strategy (mean, median, most_frequent, or constant)
imputer = SimpleImputer(strategy='mean')

# Impute the missing values in the training and test sets
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Create and fit the Gaussian Naive Bayes model
nb = GaussianNB()
nb.fit(X_train_imputed, Y_train)

# Make predictions
y_pred = nb.predict(X_test_imputed)

# Function to show results (assuming show_result is defined)
show_result(Y_test, y_pred)


——————Test——————
          class precision    recall  f1-score   support
0             0    0.5174    0.5655    0.5403       527
1             1    0.5209    0.4725    0.4955       527
2      accuracy                        0.5190      1054
3     macro avg    0.5191    0.5190    0.5179      1054
4  weighted avg    0.5191    0.5190    0.5179      1054
5       overall  0.520921  0.472486  0.495522  0.518975


In [36]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
y_pred = dt.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5361    0.7476    0.6244       527
1             1    0.5831    0.3529    0.4397       527
2      accuracy                        0.5503      1054
3     macro avg    0.5596    0.5503    0.5321      1054
4  weighted avg    0.5596    0.5503    0.5321      1054
5       overall  0.583072  0.352941  0.439716  0.550285


In [38]:
from sklearn.ensemble import RandomForestClassifier

# Create the imputer with a chosen strategy (mean, median, most_frequent, or constant)
imputer = SimpleImputer(strategy='mean')

# Impute the missing values in the training and test sets
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Create and fit the RandomForestClassifier model
rf = RandomForestClassifier()
rf.fit(X_train_imputed, Y_train)

# Make predictions
y_pred = rf.predict(X_test_imputed)

# Function to show results (assuming show_result is defined)
show_result(Y_test, y_pred)


——————Test——————
          class precision    recall  f1-score   support
0             0    0.6241    0.6395    0.6317       527
1             1    0.6304    0.6148    0.6225       527
2      accuracy                        0.6271      1054
3     macro avg    0.6272    0.6271    0.6271      1054
4  weighted avg    0.6272    0.6271    0.6271      1054
5       overall   0.63035  0.614801  0.622478  0.627135
