In [49]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [50]:
apps = pd.read_csv('filtered_apps.csv')
users = pd.read_csv('filtered_users.csv')
jobs = pd.read_csv('filtered_jobs.csv')

In [51]:
# First, generate dictionaries for mapping old id to new id for users and movies
unique_JobID = apps['JobID'].unique()
unique_UserID = apps['UserID'].unique()
j = 0
user_old2new_id_dict = dict()
user_new2old_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    user_new2old_id_dict[j] = u
    j += 1
j = 0
job_old2new_id_dict = dict()
job_new2old_id_dict = dict()
for i in unique_JobID:
    job_old2new_id_dict[i] = j
    job_new2old_id_dict[j] = i
    j += 1


# Then, use the generated dictionaries to reindex UserID and JobID in the data_df
user_list = apps['UserID'].values
job_list = apps['JobID'].values
for j in range(len(apps)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    job_list[j] = job_old2new_id_dict[job_list[j]]
apps['UserID'] = user_list
apps['JobID'] = job_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(apps)) <= 0.7
train_df = apps[train_index]
test_df = apps[~train_index]

In [52]:
train_df['Applied?'] = 1
test_df['Applied?'] = 1

from scipy.sparse import coo_matrix

# generate train_mat and test_mat
num_users = len(apps['UserID'].unique())
num_jobs = len(apps['JobID'].unique())

train_mat = coo_matrix((train_df['Applied?'].values, (train_df['UserID'].values, train_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()
test_mat = coo_matrix((test_df['Applied?'].values, (test_df['UserID'].values, test_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Applied?'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Applied?'] = 1


In [53]:
class MF:
    def __init__(self, train_mat, test_mat, latent=5, lr=0.01, reg=0.01):
        self.train_mat = train_mat  # the training rating matrix of size (#user, #movie)
        self.test_mat = test_mat  # the training rating matrix of size (#user, #movie)

        self.latent = latent  # the latent dimension
        self.lr = lr  # learning rate
        self.reg = reg  # regularization weight, i.e., the lambda in the objective function

        self.num_users, self.num_jobs = train_mat.shape

        self.sample_user, self.sample_movie = self.train_mat.nonzero()  # get the user-movie paris having ratings in train_mat
        self.num_sample = len(self.sample_user)  # the number of user-movie pairs having ratings in train_mat

        self.train_indicator_mat = 1.0 * (train_mat > 0)  # binary matrix to indicate whether s user-movie pair has rating or not in train_mat
        self.test_indicator_mat = 1.0 * (test_mat > 0)  # binary matrix to indicate whether s user-movie pair has rating or not in test_mat

        self.P = np.random.random((self.num_users, self.latent))  # latent factors for users, size (#user, self.latent), randomly initialized
        self.Q = np.random.random((self.num_jobs, self.latent))  # latent factors for users, size (#movie, self.latent), randomly initialized

    def train(self, epoch=20, verbose=True):
        """
        Goal: Write your code to train your matrix factorization model for epoch iterations in this function
        Input: epoch -- the number of training epoch
        Output: epoch_loss_list -- a list recording the training loss for each epoch
                epoch_test_RMSE_list -- a list recording the testing RMSE after each training epoch
        """
        epoch_loss_list = []
        epoch_test_RMSE_list = []
        for ep in range(epoch):
            """
            Write your code here to implement the training process for one epoch,
            and at the end of each epoch, print out the epoch number, the training loss after this epoch,
            and the test RMSE after this epoch
            """
            random_indices = np.random.permutation(self.num_sample)
            total_loss = 0
            for idx in random_indices:
              u, i = self.sample_user[idx], self.sample_movie[idx]
              pred_val = np.dot(self.P[u], self.Q[i])
              error = pred_val - self.train_mat[u, i]
              total_loss += error ** 2
              self.P[u] -= self.lr * ((error * self.Q[i]) + self.reg * self.P[u])
              self.Q[i] -= self.lr * ((error * self.P[u]) + self.reg * self.Q[i])

            total_loss /= self.num_sample
            epoch_loss_list.append(total_loss)

            test_RMSE = self.calculate_RMSE(self.predict(), self.test_mat)
            epoch_test_RMSE_list.append(test_RMSE)

            if verbose:
              print(f"Epoch {ep + 1} -- Training Loss: {total_loss}, Test RMSE: {test_RMSE}")

            """
            End of your code for this function
            """
        return epoch_loss_list, epoch_test_RMSE_list

    def calculate_RMSE(self, prediction_mat, true_mat):
      num_ratings = np.sum(true_mat > 0)
      squared_error = np.sum(((prediction_mat - true_mat) ** 2) * (true_mat > 0))
      mean_squared_error = squared_error / num_ratings
      RMSE = np.sqrt(mean_squared_error)
      return RMSE


    def predict(self):
        prediction_mat = np.matmul(self.P, self.Q.T)
        return prediction_mat

In [55]:
mf = MF(train_mat, test_mat, latent=5, lr=0.01, reg=0.001)
epoch_loss_list, epoch_test_RMSE_list = mf.train(epoch=30)

Epoch 1 -- Training Loss: 0.23662562398973788, Test RMSE: 0.4335657701206561
Epoch 2 -- Training Loss: 0.14329545165047228, Test RMSE: 0.3670825961854032
Epoch 3 -- Training Loss: 0.1004611596728579, Test RMSE: 0.3251845093274829
Epoch 4 -- Training Loss: 0.0763572394361383, Test RMSE: 0.2959021980559313
Epoch 5 -- Training Loss: 0.06104943011296383, Test RMSE: 0.2740328609181345
Epoch 6 -- Training Loss: 0.050530936417249005, Test RMSE: 0.25691724977189206
Epoch 7 -- Training Loss: 0.04289622466483379, Test RMSE: 0.24306595875488884
Epoch 8 -- Training Loss: 0.03713164150311581, Test RMSE: 0.2315813967357987
Epoch 9 -- Training Loss: 0.032641177539423374, Test RMSE: 0.22187015194477505
Epoch 10 -- Training Loss: 0.02905817406061118, Test RMSE: 0.21352687249012747
Epoch 11 -- Training Loss: 0.026140749492292985, Test RMSE: 0.2062618794431641
Epoch 12 -- Training Loss: 0.023724405242030257, Test RMSE: 0.19986247818904482
Epoch 13 -- Training Loss: 0.02169369882338611, Test RMSE: 0.19417

**ANALYSIS**

Now, for a particular user, I will try and find out which jobs should be recommended to him the most.

In [56]:
pred_mat = mf.predict()

In [57]:
pred_mat

array([[1.00534711, 1.04179893, 1.10880591, ..., 1.43660337, 0.83723732,
        1.275175  ],
       [1.19520614, 0.81565988, 0.77423111, ..., 1.24661053, 1.39388445,
        1.20707195],
       [1.16666736, 1.07543694, 1.00729028, ..., 1.40799141, 1.05904334,
        1.42275828],
       ...,
       [1.24300291, 0.99170949, 1.15150167, ..., 1.51532976, 1.07380632,
        1.08500007],
       [1.46015168, 1.24346454, 1.38321907, ..., 1.84409969, 1.25558939,
        1.44928774],
       [1.51998836, 1.23179488, 1.383776  , ..., 1.81766253, 1.14899964,
        1.32001119]])

In [58]:
def find_best_recommendation_for_user(user_id):
    arr = pred_mat[user_id]
    indexed_arr = list(enumerate(arr))

    # Sort the array of tuples based on probabilities in descending order
    sorted_arr = sorted(indexed_arr, key=lambda x: x[1], reverse=True)

    # Extract the indices of the top 10 probabilities
    top_10_indices = [job_new2old_id_dict[index] for index, _ in sorted_arr[:10]]

    best_jobs = jobs[jobs['JobID'].isin(top_10_indices)]
    print("Top 10 jobs for the user are: \n")
    
    for _, job in best_jobs.iterrows():
        print(job['Title'])

In [60]:
find_best_recommendation_for_user(543)

Top 10 jobs for the user are: 

Material Handler – PR1953
Medical Assistant for Pediatric Cardiology
Administrative Assistant
CUSTOMER SERVICE REPS
Warehouse / Driver
Accounting Payroll
Administrative Assistant
Contract Administrative Assistant
Assembly Technician
Accounting/Financial Recruiter
