In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [24]:
apps = pd.read_csv('apps.csv')
users = pd.read_csv('users.csv')
jobs = pd.read_csv('jobs.csv')

In [3]:
# First, generate dictionaries for mapping old id to new id for users and movies
unique_JobID = apps['JobID'].unique()
unique_UserID = apps['UserID'].unique()
j = 0
user_old2new_id_dict = dict()
user_new2old_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    user_new2old_id_dict[j] = u
    j += 1
j = 0
job_old2new_id_dict = dict()
job_new2old_id_dict = dict()
for i in unique_JobID:
    job_old2new_id_dict[i] = j
    job_new2old_id_dict[j] = i
    j += 1

# Then, use the generated dictionaries to reindex UserID and JobID in the data_df
user_list = apps['UserID'].values
job_list = apps['JobID'].values
for j in range(len(apps)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    job_list[j] = job_old2new_id_dict[job_list[j]]
apps['UserID'] = user_list
apps['JobID'] = job_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(apps)) <= 0.7
train_df = apps[train_index]
test_df = apps[~train_index]

In [4]:
train_df['Applied?'] = 1
test_df['Applied?'] = 1

from scipy.sparse import coo_matrix

# generate train_mat and test_mat
num_users = len(apps['UserID'].unique())
num_jobs = len(apps['JobID'].unique())

train_mat = coo_matrix((train_df['Applied?'].values, (train_df['UserID'].values, train_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()
test_mat = coo_matrix((test_df['Applied?'].values, (test_df['UserID'].values, test_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Applied?'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Applied?'] = 1


In [5]:
class MF:
    def __init__(self, train_mat, test_mat, latent=5, lr=0.01, reg=0.01):
        self.train_mat = train_mat  # the training rating matrix of size (#user, #movie)
        self.test_mat = test_mat  # the training rating matrix of size (#user, #movie)

        self.latent = latent  # the latent dimension
        self.lr = lr  # learning rate
        self.reg = reg  # regularization weight, i.e., the lambda in the objective function

        self.num_users, self.num_jobs = train_mat.shape

        self.sample_user, self.sample_movie = self.train_mat.nonzero()  # get the user-movie paris having ratings in train_mat
        self.num_sample = len(self.sample_user)  # the number of user-movie pairs having ratings in train_mat

        self.train_indicator_mat = 1.0 * (train_mat > 0)  # binary matrix to indicate whether s user-movie pair has rating or not in train_mat
        self.test_indicator_mat = 1.0 * (test_mat > 0)  # binary matrix to indicate whether s user-movie pair has rating or not in test_mat

        self.P = np.random.random((self.num_users, self.latent))  # latent factors for users, size (#user, self.latent), randomly initialized
        self.Q = np.random.random((self.num_jobs, self.latent))  # latent factors for users, size (#movie, self.latent), randomly initialized

    def train(self, epoch=20, verbose=True):
        """
        Goal: Write your code to train your matrix factorization model for epoch iterations in this function
        Input: epoch -- the number of training epoch
        Output: epoch_loss_list -- a list recording the training loss for each epoch
                epoch_test_RMSE_list -- a list recording the testing RMSE after each training epoch
        """
        epoch_loss_list = []
        epoch_test_RMSE_list = []
        for ep in range(epoch):
            """
            Write your code here to implement the training process for one epoch,
            and at the end of each epoch, print out the epoch number, the training loss after this epoch,
            and the test RMSE after this epoch
            """
            random_indices = np.random.permutation(self.num_sample)
            total_loss = 0
            for idx in random_indices:
              u, i = self.sample_user[idx], self.sample_movie[idx]
              pred_val = np.dot(self.P[u], self.Q[i])
              error = pred_val - self.train_mat[u, i]
              total_loss += error ** 2
              self.P[u] -= self.lr * ((error * self.Q[i]) + self.reg * self.P[u])
              self.Q[i] -= self.lr * ((error * self.P[u]) + self.reg * self.Q[i])

            total_loss /= self.num_sample
            epoch_loss_list.append(total_loss)

            test_RMSE = self.calculate_RMSE(self.predict(), self.test_mat)
            epoch_test_RMSE_list.append(test_RMSE)

            if verbose:
              print(f"Epoch {ep + 1} -- Training Loss: {total_loss}, Test RMSE: {test_RMSE}")

            """
            End of your code for this function
            """
        return epoch_loss_list, epoch_test_RMSE_list

    def calculate_RMSE(self, prediction_mat, true_mat):
      num_ratings = np.sum(true_mat > 0)
      squared_error = np.sum(((prediction_mat - true_mat) ** 2) * (true_mat > 0))
      mean_squared_error = squared_error / num_ratings
      RMSE = np.sqrt(mean_squared_error)
      return RMSE


    def predict(self):
        prediction_mat = np.matmul(self.P, self.Q.T)
        return prediction_mat

In [6]:
mf = MF(train_mat, test_mat, latent=5, lr=0.01, reg=0.001)
epoch_loss_list, epoch_test_RMSE_list = mf.train(epoch=30)

Epoch 1 -- Training Loss: 0.23639215154879395, Test RMSE: 0.429027993444168
Epoch 2 -- Training Loss: 0.14438695224259404, Test RMSE: 0.3632399272597542
Epoch 3 -- Training Loss: 0.10158350053747651, Test RMSE: 0.32180124774530583
Epoch 4 -- Training Loss: 0.0773873671576295, Test RMSE: 0.2928646669080056
Epoch 5 -- Training Loss: 0.06198057747546223, Test RMSE: 0.2712711254943288
Epoch 6 -- Training Loss: 0.051371279368963435, Test RMSE: 0.25437365944437407
Epoch 7 -- Training Loss: 0.04365089135253524, Test RMSE: 0.24069006844198854
Epoch 8 -- Training Loss: 0.03780335759008195, Test RMSE: 0.22933890378574653
Epoch 9 -- Training Loss: 0.03323910400472229, Test RMSE: 0.21973429340009334
Epoch 10 -- Training Loss: 0.0295885654675396, Test RMSE: 0.21147847060853936
Epoch 11 -- Training Loss: 0.026610870322590484, Test RMSE: 0.20429248977747375
Epoch 12 -- Training Loss: 0.024140061647593424, Test RMSE: 0.1979694969688579
Epoch 13 -- Training Loss: 0.022061672350447252, Test RMSE: 0.1923

**ANALYSIS**

Now, for a particular user, I will try and find out which jobs should be recommended to him the most.

In [7]:
pred_mat = mf.predict()

In [8]:
pred_mat

array([[0.83139616, 1.16377256, 0.98245068, ..., 0.93491823, 1.06735357,
        0.59623074],
       [0.73712132, 1.17268451, 0.99184864, ..., 0.79149004, 0.91911124,
        0.45304032],
       [0.93791125, 1.02148229, 1.03478344, ..., 0.88668511, 1.12000929,
        0.81596457],
       ...,
       [1.01357159, 0.82421389, 0.94131562, ..., 0.93642453, 1.18115029,
        1.03317214],
       [0.87179319, 0.86723909, 0.9051489 , ..., 0.87294544, 1.13968451,
        0.77212501],
       [0.89403046, 1.06152702, 0.97834624, ..., 0.67356837, 0.83193576,
        0.89702833]])

In [14]:
def find_best_recommendation_for_user(user_real_id):
    user_id = user_old2new_id_dict[user_real_id]
    arr = pred_mat[user_id]
    indexed_arr = list(enumerate(arr))

    # Sort the array of tuples based on probabilities in descending order
    sorted_arr = sorted(indexed_arr, key=lambda x: x[1], reverse=True)

    # Extract the indices of the top 10 probabilities
    top_10_indices = [job_new2old_id_dict[index] for index, _ in sorted_arr[:10]]

    best_jobs = jobs[jobs['JobID'].isin(top_10_indices)]
    print("Top 10 jobs for the user are: \n")
    
    for _, job in best_jobs.iterrows():
        print(job['Title'])

    return best_jobs

In [22]:
find_best_recommendation_for_user(963422)

Top 10 jobs for the user are: 

Shipping and Receiving Supervisor
Receptionist/Patient Check Out
Receptionist


Unnamed: 0,JobID,WindowID,Title,City,State,Country,Zip5,StartDate,EndDate,DescCleaned,ReqCleaned
61213,590247,6,Shipping and Receiving Supervisor,Atlanta,GA,US,,2012-06-08 02:02:18.11,2012-07-07 23:59:00,shipping and receiving supervisor- night shift...,experience/education requirements 5 years of s...
65262,631626,6,Receptionist/Patient Check Out,Rochester,NY,US,,2012-05-20 00:25:30.393,2012-06-19 23:59:00,receptionist/patient check out - busy obgyn pr...,please refer to the job description to view th...
114600,1105210,6,Receptionist,Cleveland,OH,US,44144.0,2012-05-30 12:37:20.993,2012-06-29 23:59:00,receptionist purpose greet customers and visit...,"job requirements - education background, train..."


In [23]:
apps[apps['UserID'] == 963422]

Unnamed: 0,UserID,JobID,label


In [9]:
apps = pd.read_csv('apps.csv')
jobs = pd.read_csv('jobs.csv')
users = pd.read_csv('users.csv')
user_history = pd.read_csv('work_history.csv')

In [10]:
train_user = users[users.Split=="Train"]
test_user = users[users.Split=="Test"]

In [None]:
import tqdm

hit = []

for _, user in tqdm(test_user.iterrows(), total=len(test_user)):
    predictions = getPredictions(input_data, 20)

    found_applied = False
    for job in predictions:
        applied_to = apps[(apps['JobID'] == job) & (apps['UserID'] == id)]
        if (len(applied_to) > 0):
            found_applied = True
            break
    
    if found_applied:
        hit.append(1)
    else:
        hit.append(0)