In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
apps = pd.read_csv('filtered_apps.csv')
users = pd.read_csv('filtered_users.csv')
jobs = pd.read_csv('filtered_jobs.csv')

In [3]:
# First, generate dictionaries for mapping old id to new id for users and movies
unique_JobID = apps['JobID'].unique()
unique_UserID = apps['UserID'].unique()
j = 0
user_old2new_id_dict = dict()
user_new2old_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    user_new2old_id_dict[j] = u
    j += 1
j = 0
job_old2new_id_dict = dict()
job_new2old_id_dict = dict()
for i in unique_JobID:
    job_old2new_id_dict[i] = j
    job_new2old_id_dict[j] = i
    j += 1

# Then, use the generated dictionaries to reindex UserID and JobID in the data_df
user_list = apps['UserID'].values
job_list = apps['JobID'].values
for j in range(len(apps)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    job_list[j] = job_old2new_id_dict[job_list[j]]
apps['UserID'] = user_list
apps['JobID'] = job_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(apps)) <= 0.7
train_df = apps[train_index]
test_df = apps[~train_index]

In [4]:
train_df['Applied?'] = 1
test_df['Applied?'] = 1

from scipy.sparse import coo_matrix

# generate train_mat and test_mat
num_users = len(apps['UserID'].unique())
num_jobs = len(apps['JobID'].unique())

train_mat = coo_matrix((train_df['Applied?'].values, (train_df['UserID'].values, train_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()
test_mat = coo_matrix((test_df['Applied?'].values, (test_df['UserID'].values, test_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Applied?'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Applied?'] = 1


In [5]:
class MF:
    def __init__(self, train_mat, test_mat, latent=5, lr=0.01, reg=0.01):
        self.train_mat = train_mat  # the training rating matrix of size (#user, #movie)
        self.test_mat = test_mat  # the training rating matrix of size (#user, #movie)

        self.latent = latent  # the latent dimension
        self.lr = lr  # learning rate
        self.reg = reg  # regularization weight, i.e., the lambda in the objective function

        self.num_users, self.num_jobs = train_mat.shape

        self.sample_user, self.sample_movie = self.train_mat.nonzero()  # get the user-movie paris having ratings in train_mat
        self.num_sample = len(self.sample_user)  # the number of user-movie pairs having ratings in train_mat

        self.train_indicator_mat = 1.0 * (train_mat > 0)  # binary matrix to indicate whether s user-movie pair has rating or not in train_mat
        self.test_indicator_mat = 1.0 * (test_mat > 0)  # binary matrix to indicate whether s user-movie pair has rating or not in test_mat

        self.P = np.random.random((self.num_users, self.latent))  # latent factors for users, size (#user, self.latent), randomly initialized
        self.Q = np.random.random((self.num_jobs, self.latent))  # latent factors for users, size (#movie, self.latent), randomly initialized

    def train(self, epoch=20, verbose=True):
        """
        Goal: Write your code to train your matrix factorization model for epoch iterations in this function
        Input: epoch -- the number of training epoch
        Output: epoch_loss_list -- a list recording the training loss for each epoch
                epoch_test_RMSE_list -- a list recording the testing RMSE after each training epoch
        """
        epoch_loss_list = []
        epoch_test_RMSE_list = []
        for ep in range(epoch):
            """
            Write your code here to implement the training process for one epoch,
            and at the end of each epoch, print out the epoch number, the training loss after this epoch,
            and the test RMSE after this epoch
            """
            random_indices = np.random.permutation(self.num_sample)
            total_loss = 0
            for idx in random_indices:
              u, i = self.sample_user[idx], self.sample_movie[idx]
              pred_val = np.dot(self.P[u], self.Q[i])
              error = pred_val - self.train_mat[u, i]
              total_loss += error ** 2
              self.P[u] -= self.lr * ((error * self.Q[i]) + self.reg * self.P[u])
              self.Q[i] -= self.lr * ((error * self.P[u]) + self.reg * self.Q[i])

            total_loss /= self.num_sample
            epoch_loss_list.append(total_loss)

            test_RMSE = self.calculate_RMSE(self.predict(), self.test_mat)
            epoch_test_RMSE_list.append(test_RMSE)

            if verbose:
              print(f"Epoch {ep + 1} -- Training Loss: {total_loss}, Test RMSE: {test_RMSE}")

            """
            End of your code for this function
            """
        return epoch_loss_list, epoch_test_RMSE_list

    def calculate_RMSE(self, prediction_mat, true_mat):
      num_ratings = np.sum(true_mat > 0)
      squared_error = np.sum(((prediction_mat - true_mat) ** 2) * (true_mat > 0))
      mean_squared_error = squared_error / num_ratings
      RMSE = np.sqrt(mean_squared_error)
      return RMSE


    def predict(self):
        prediction_mat = np.matmul(self.P, self.Q.T)
        return prediction_mat

In [6]:
mf = MF(train_mat, test_mat, latent=5, lr=0.01, reg=0.001)
epoch_loss_list, epoch_test_RMSE_list = mf.train(epoch=30)

Epoch 1 -- Training Loss: 0.2368065877013845, Test RMSE: 0.4363881723608754
Epoch 2 -- Training Loss: 0.14509954902772534, Test RMSE: 0.37045798129863905
Epoch 3 -- Training Loss: 0.1020023332028524, Test RMSE: 0.32864199417184137
Epoch 4 -- Training Loss: 0.07757872566782537, Test RMSE: 0.29937359858582535
Epoch 5 -- Training Loss: 0.06203793692077939, Test RMSE: 0.27747018944325524
Epoch 6 -- Training Loss: 0.05135822749270367, Test RMSE: 0.2603234058058171
Epoch 7 -- Training Loss: 0.04361140428588258, Test RMSE: 0.24644061472267187
Epoch 8 -- Training Loss: 0.03776309785072731, Test RMSE: 0.23491736024655618
Epoch 9 -- Training Loss: 0.03320953491197013, Test RMSE: 0.22516098978022264
Epoch 10 -- Training Loss: 0.029576074177349202, Test RMSE: 0.21676809055179433
Epoch 11 -- Training Loss: 0.026618395210485982, Test RMSE: 0.20945239827852108
Epoch 12 -- Training Loss: 0.024167737683082773, Test RMSE: 0.2030056446033119
Epoch 13 -- Training Loss: 0.022108546258808936, Test RMSE: 0.1

**ANALYSIS**

Now, for a particular user, I will try and find out which jobs should be recommended to him the most.

In [7]:
pred_mat = mf.predict()

In [8]:
pred_mat

array([[0.91190708, 0.94022928, 1.02391408, ..., 1.07616094, 0.99246189,
        0.56909408],
       [1.08510789, 0.93350162, 0.998513  , ..., 0.88087253, 0.95750721,
        0.550224  ],
       [1.05714097, 1.08200405, 0.98520639, ..., 0.86682857, 1.1230422 ,
        0.47874434],
       ...,
       [0.92127895, 0.96079189, 1.02870382, ..., 1.05033127, 0.99896855,
        0.53475794],
       [1.09559892, 1.02601212, 1.09866855, ..., 0.87736585, 1.00891893,
        0.47984134],
       [0.78437666, 0.81893455, 1.05168893, ..., 1.17174899, 0.84927306,
        0.5753096 ]])

In [9]:
def find_best_recommendation_for_user(user_real_id, numberofJob):
    user_id = user_old2new_id_dict[user_real_id]
    arr = pred_mat[user_id]
    indexed_arr = list(enumerate(arr))

    # Sort the array of tuples based on probabilities in descending order
    sorted_arr = sorted(indexed_arr, key=lambda x: x[1], reverse=True)

    # Extract the indices of the top 10 probabilities
    top_10_indices = [job_new2old_id_dict[index] for index, _ in sorted_arr[:numberofJob]]

    best_jobs = jobs[jobs['JobID'].isin(top_10_indices)]
    # print("Top 10 jobs for the user are: \n")
    
    # for _, job in best_jobs.iterrows():
    #     print(job['Title'])

    return best_jobs

In [11]:
find_best_recommendation_for_user(963422,10)

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate,DescCleaned,ReqCleaned
1159,899638,1,Administrative Assistant,"<hr>\r<p align=""center""> <strong>Administrativ...","<div align=""center""><hr></div>\r<p align=""cent...",Addison,IL,US,60101.0,2012-03-30 12:51:50.583,2012-04-29 23:59:59,administrative assistant our traffic departme...,"administrative assistant strong written, orga..."
2137,981303,2,Warehouse Clerk,<p><span>About Our Client</span></p>\r<p></p>\...,Our client is currently seeking to fill a 1st ...,Glendale,CA,US,91201.0,2012-04-11 14:30:51.657,2012-05-10 23:59:59,"about our client located in glendale, ca our c...",our client is currently seeking to fill a 1st ...
2628,496156,3,Call Center Representative / Customer Service ...,<p><span><strong>What is ING? <br>\r<br>\r</st...,<p><span><strong> </strong></span><span><stron...,Jacksonville,FL,US,,2012-04-27 11:42:41.027,2012-05-26 23:59:59,what is ing? ing insurance americas is a vital...,who is our ideal candidate? our ideal candida...
3163,1056098,3,Receptionist/Admin. -,"Great, part time position . Work- Monday, Tues...","Excel,",Sacramento,CA,US,95821.0,2012-04-26 16:13:31.613,2012-05-25 23:59:59,"great, part time position . work- monday, tues...","excel,"
3578,403391,4,Accounts Receivable / Indiana Medicaid Specialist,"<p style=""text-align: justify""><span>We are cu...","<p style=""text-align: justify""><span> </span><...",Meridian Hills,IN,US,46260.0,2012-04-25 15:39:55.717,2012-05-24 23:59:59,we are currently seeking an experienced and dy...,skilled nursing facility business office expe...
4624,328237,5,FRONT OFFICE,"<p align=""center""><strong><span style=""text-de...","To Apply <br />\rE-Mail resumes to: <a href=""m...",New Albany,IN,US,47150.0,2012-05-25 10:57:22.97,2012-06-24 23:59:00,"front office are you a team player? if so, our...",to apply e-mail resumes to jmichaelwilliamsdmd...
5953,788921,6,WAREHOUSE POSITION F,"WAREHOUSE position, full-time. Must be able to...",Please refer to the Job Description to view th...,Manteca,CA,US,,2012-05-14 21:15:47.623,2012-06-13 23:59:59,"warehouse position, full-time. must be able to...",please refer to the job description to view th...
6237,5311,7,Customer Service Representative - Up to $16 pe...,"<p style=""text-align: center"" align=""center""><...","<p>Please visit <a title=""硺餡뿿��ꀬҌӅᶠЀ"" href=""ht...",Buford,GA,US,30518.0,2012-06-17 02:03:18.147,2012-07-16 23:59:59,customer service representative up to $16 per ...,please visit www.uline.com/jobs to learn more ...
6343,194860,7,Planning Analyst,<span>\r<p><span><br>\rEstablished on the prin...,"<p style=""text-align: justify""><b><span>Planni...",Chicago,IL,US,,2012-06-14 10:41:50.05,2012-07-13 23:59:00,established on the principle that our people ...,planning analyst requirements bachelor s degre...
6455,327242,7,Executive Administrative Assistant,"<p style=""text-align: center;"" align=""center"">...",<ul>\r <li>4-5 years of experience in a top...,Pennsauken,NJ,US,,2012-06-01 09:47:18.273,2012-06-30 23:59:00,executive administrative assistant experienced...,"4-5 years of experience in a top-level, high-..."


In [12]:
apps[apps['UserID'] == 963422]

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID


In [13]:
apps = pd.read_csv('filtered_apps.csv')
users = pd.read_csv('filtered_users.csv')
jobs = pd.read_csv('filtered_jobs.csv')
user_history = pd.read_csv('work_history.csv')

In [14]:
train_user = users[users.Split=="Train"]
test_user = users[users.Split=="Test"]

In [15]:
test_user

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
1,769,1,Test,Roselle,IL,US,60172,Bachelor's,Radio-Television,2011-05-01 00:00:00,5,5.0,Yes,No,0
3,2652,1,Test,Glendale,CA,US,91204,High School,Not Applicable,1987-01-01 00:00:00,3,21.0,No,Yes,3
10,9280,1,Test,Pompano Beach,FL,US,33060,Associate's,Education,2008-05-01 00:00:00,4,7.0,Yes,Yes,15
29,32653,1,Test,Dallas,TX,US,75243,High School,Not Applicable,1995-05-01 00:00:00,2,5.0,No,No,0
36,39346,1,Test,Houston,TX,US,77034,High School,High School Diploma,2004-01-01 00:00:00,4,8.0,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6202,1426954,7,Test,Rancho Cucamonga,CA,US,91730,,,2009-01-01 00:00:00,7,24.0,Yes,No,0
6204,1428443,7,Test,Antioch,CA,US,94531,Vocational,Medical/Clinical Assistant,2010-02-01 00:00:00,5,14.0,Yes,No,0
6205,1430102,7,Test,Washington,DC,US,20019,Associate's,Business Administration,2010-01-01 00:00:00,7,22.0,No,No,0
6206,1441962,7,Test,New York,NY,US,10001,Master's,Counselor Education,2005-04-01 00:00:00,5,7.0,No,Yes,10


In [54]:
import tqdm

hit = []

for _, user in tqdm.tqdm(test_user.iterrows(), total=len(test_user)):
    id = user['UserID']

    predictions = find_best_recommendation_for_user(id, 20)

    found_applied = False
    for job in predictions['JobID']:
        applied_to = apps[(apps['JobID'] == job) & (apps['UserID'] == id)]
        if (len(applied_to) > 0):
            found_applied = True
            break
    
    if found_applied:
        hit.append(1)
    else:
        hit.append(0)

100%|██████████| 1387/1387 [00:07<00:00, 195.09it/s]


In [55]:
np.mean(hit)

0.006488824801730353