<a href="https://colab.research.google.com/github/philadelphia24/Thesis-Job-Recommender-System-/blob/main/Collaborative%20Filtering%20(Item-based).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Filtering

## Clean data

In [1]:
# STEP 1: import libraries and data

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
!pip install auto-surprise
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNWithZScore, BaselineOnly, CoClustering # We will not be using all of these, but you can select the ones of interest.
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting auto-surprise
  Downloading auto_surprise-0.1.8-py3-none-any.whl (19 kB)
Collecting scikit-surprise (from auto-surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095491 sha256=ba0fdb28f58e0a75a9b979f0e554366c83b3256e68c1a45aac41cefaec701a67
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, auto-surprise
Successfully installed aut

### KNNBasic

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches.csv to matches (2).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBasic

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [20],
      'random_state': [42],
    },
}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.0542449008032515
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20, 'random_state': 42}}
Best RMSE score: 1.3515530808149872
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20, 'random_state': 42}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.349841,1.367151,1.350785,1.344116,1.358519,1.364637,1.348571,1.342758,1.365353,1.323799,...,1.054245,0.012805,1,0.109771,0.035787,0.094017,0.019201,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.2599
MAE: 0.2599342005836737
RMSE: 0.3831
RMSE: 0.3830891890030330


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0243
MAE: 1.0242763628249911
RMSE: 1.3214
RMSE: 1.3214465120604506


### KNNBaseline

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches.csv to matches (3).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBaseline

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
    'k': [5],
    'sim_options': {
       'name': ['pearson_baseline'],
       'user_based': [False] },
    'bsl_options': {
        'method': ['sgd'],
        'n_epochs': [30],
        'reg_u': [0],
        'reg_i': [0] },
    }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])

Performing Grid Search...
Best MAE score: 1.053110958081563
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}, 'bsl_options': {'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, 'reg_i': 0}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options,param_bsl_options
0,1.052086,1.050783,1.050927,1.047416,1.070028,1.050204,1.04859,1.060264,1.066529,1.034283,...,0.009718,1,0.17067,0.035357,0.061289,0.012173,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'user_based': False}","{'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, ..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1325
MAE: 0.1324511898078785
RMSE: 0.2056
RMSE: 0.2056389142239863


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0264
MAE: 1.0264323184159108
RMSE: 1.3161
RMSE: 1.3160941654525538


### KNNWithMeans

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches.csv to matches (6).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNWithMeans

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [80],
    },
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.0569952813193932
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}
Best RMSE score: 1.346049527242155
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.344481,1.362962,1.343696,1.338834,1.352948,1.355764,1.340703,1.338614,1.360598,1.321897,...,1.056995,0.011684,1,0.059031,0.011633,0.054183,0.010497,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1868
MAE: 0.1868234155158906
RMSE: 0.2775
RMSE: 0.2775499719291059


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0275
MAE: 1.0274793496305346
RMSE: 1.3161
RMSE: 1.3160913198638526


## 10% Noise

### KNNBasic

In [2]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln10.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln10.csv to matches_ln10.csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBasic

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [20],
    },
}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.0770360819135143
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}
Best RMSE score: 1.3835797143852122
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.381442,1.395983,1.379154,1.375541,1.381457,1.40572,1.383962,1.374063,1.395426,1.363049,...,1.077036,0.012045,1,0.097225,0.016516,0.077314,0.017084,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.2506
MAE: 0.2506226646287761
RMSE: 0.3741
RMSE: 0.3740859616544884


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0508
MAE: 1.0507984903365810
RMSE: 1.3554
RMSE: 1.3553705347869198


### KNNBaseline

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln10.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln10.csv to matches_ln10.csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBaseline

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
    'k': [5],
    'sim_options': {
       'name': ['pearson_baseline'],
       'user_based': [False] },
    'bsl_options': {
        'method': ['sgd'],
        'n_epochs': [30],
        'reg_u': [0],
        'reg_i': [0] },
    }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])

Performing Grid Search...
Best MAE score: 1.0785659694343777
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}, 'bsl_options': {'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, 'reg_i': 0}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options,param_bsl_options
0,1.077298,1.079595,1.082132,1.062693,1.086814,1.083857,1.073848,1.087594,1.093333,1.058496,...,0.010431,1,0.175294,0.037672,0.063941,0.0165,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'user_based': False}","{'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, ..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1230
MAE: 0.1230401802672115
RMSE: 0.1941
RMSE: 0.1940718905207510


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0526
MAE: 1.0525881097308012
RMSE: 1.3504
RMSE: 1.3503620069117444


### KNNWithMeans

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln10.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln10.csv to matches_ln10 (1).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNWithMeans

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [80],
    },
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.0783497710745682
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}
Best RMSE score: 1.3778355157194002
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.376702,1.394401,1.371556,1.367765,1.376903,1.397725,1.376258,1.370074,1.388727,1.358244,...,1.07835,0.010883,1,0.06226,0.014025,0.050085,0.009328,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1784
MAE: 0.1783566846223700
RMSE: 0.2685
RMSE: 0.2685173842953701


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0528
MAE: 1.0527849571814960
RMSE: 1.3500
RMSE: 1.3499794581880609


## 20% Noise

### KNNBasic

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln20.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln20.csv to matches_ln20 (1).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBasic

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [20],
    },
}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.1061126224923046
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}
Best RMSE score: 1.4328356390147725
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.428445,1.437347,1.424203,1.427759,1.440445,1.451332,1.412314,1.439968,1.442016,1.424528,...,1.106113,0.010476,1,0.11278,0.022609,0.096645,0.019966,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.2282
MAE: 0.2281785289357445
RMSE: 0.3489
RMSE: 0.3488939951915405


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0828
MAE: 1.0827665866431821
RMSE: 1.4114
RMSE: 1.4114372045568067


### KNNBaseline

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln20.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln20.csv to matches_ln20.csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBaseline

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
    'k': [5],
    'sim_options': {
       'name': ['pearson_baseline'],
       'user_based': [False] },
    'bsl_options': {
        'method': ['sgd'],
        'n_epochs': [30],
        'reg_u': [0],
        'reg_i': [0] },
    }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])

Performing Grid Search...
Best MAE score: 1.1102348562315083
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}, 'bsl_options': {'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, 'reg_i': 0}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options,param_bsl_options
0,1.100124,1.108592,1.103204,1.112057,1.118221,1.12149,1.092044,1.125273,1.1198,1.101544,...,0.010355,1,0.153688,0.022734,0.063816,0.017103,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'user_based': False}","{'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, ..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1047
MAE: 0.1046523731743364
RMSE: 0.1711
RMSE: 0.1710727560470575


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0953
MAE: 1.0952592938403571
RMSE: 1.4185
RMSE: 1.4185048191889686


### KNNWithMeans

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln20.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln20.csv to matches_ln20 (1).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNWithMeans

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [80],
    },
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.1074772388703475
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}
Best RMSE score: 1.4260674098427908
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.418903,1.42783,1.419886,1.424923,1.436824,1.443579,1.402744,1.434151,1.434316,1.417519,...,1.107477,0.010853,1,0.070044,0.014987,0.05957,0.011633,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1598
MAE: 0.1598032747499177
RMSE: 0.2461
RMSE: 0.2461463002754193


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0846
MAE: 1.0845886269882499
RMSE: 1.4054
RMSE: 1.4054485564045363


## 30% Noise

### KNNBasic

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln30.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln30.csv to matches_ln30 (1).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBasic

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [20],
    },
}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.1071730042761598
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}
Best RMSE score: 1.4306045079252958
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.440905,1.44679,1.440027,1.415225,1.425116,1.442878,1.415402,1.42462,1.43385,1.421234,...,1.107173,0.007796,1,0.088376,0.014863,0.075459,0.012566,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.2285
MAE: 0.2285178698146269
RMSE: 0.3506
RMSE: 0.3505658833354339


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0818
MAE: 1.0818415308818907
RMSE: 1.4073
RMSE: 1.4072646788229297


### KNNBaseline

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln30.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln30.csv to matches_ln30.csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBaseline

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
    'k': [5],
    'sim_options': {
       'name': ['pearson_baseline'],
       'user_based': [False] },
    'bsl_options': {
        'method': ['sgd'],
        'n_epochs': [30],
        'reg_u': [0],
        'reg_i': [0] },
    }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])

Performing Grid Search...
Best MAE score: 1.1058220765860098
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}, 'bsl_options': {'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, 'reg_i': 0}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options,param_bsl_options
0,1.109728,1.107221,1.10196,1.104244,1.103994,1.11295,1.088159,1.111445,1.110748,1.107773,...,0.006786,1,0.166863,0.045183,0.055348,0.012026,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'user_based': False}","{'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, ..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1051
MAE: 0.1050777608871810
RMSE: 0.1712
RMSE: 0.1711690122738889


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0820
MAE: 1.0819997030018536
RMSE: 1.4060
RMSE: 1.4060125399433079


### KNNWithMeans

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln30.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln30.csv to matches_ln30 (1).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNWithMeans

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [80],
    },
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.1071592552455491
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}
Best RMSE score: 1.4240941035171168
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.433534,1.438233,1.431659,1.412151,1.424247,1.432866,1.406027,1.418393,1.423984,1.419847,...,1.107159,0.00618,1,0.056799,0.011061,0.046937,0.004443,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1601
MAE: 0.1600994809644660
RMSE: 0.2477
RMSE: 0.2476575919748311


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0820
MAE: 1.0819792416002525
RMSE: 1.4010
RMSE: 1.4009527962898818


## 40% Noise

### KNNBasic

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln40.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln40.csv to matches_ln40 (2).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBasic

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [20],
    },
}

gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.1032302792207636
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}
Best RMSE score: 1.437800810054269
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 20}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.42453,1.465962,1.443139,1.43963,1.465304,1.442821,1.431196,1.436956,1.428372,1.400096,...,1.10323,0.015821,1,0.089906,0.020373,0.075309,0.013658,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.2199
MAE: 0.2199049433882882
RMSE: 0.3357
RMSE: 0.3357009628182187


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0716
MAE: 1.0716237682779566
RMSE: 1.4004
RMSE: 1.4004256721365544


### KNNBaseline

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln40.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln40.csv to matches_ln40.csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNBaseline

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
    'k': [5],
    'sim_options': {
       'name': ['pearson_baseline'],
       'user_based': [False] },
    'bsl_options': {
        'method': ['sgd'],
        'n_epochs': [30],
        'reg_u': [0],
        'reg_i': [0] },
    }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])

Performing Grid Search...
Best MAE score: 1.1121262211103211
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}, 'bsl_options': {'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, 'reg_i': 0}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_mae,split1_test_mae,split2_test_mae,split3_test_mae,split4_test_mae,split5_test_mae,split6_test_mae,split7_test_mae,split8_test_mae,split9_test_mae,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options,param_bsl_options
0,1.102864,1.134188,1.117717,1.119093,1.130918,1.101842,1.112498,1.11554,1.106519,1.080083,...,0.014769,1,0.214477,0.037648,0.069262,0.011645,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'user_based': False}","{'method': 'sgd', 'n_epochs': 30, 'reg_u': 0, ..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.0955
MAE: 0.0954709788549656
RMSE: 0.1570
RMSE: 0.1569754865478251


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0739
MAE: 1.0739245713589387
RMSE: 1.4044
RMSE: 1.4044203517301623


### KNNWithMeans

In [None]:
#Import our data
from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['matches_ln40.csv']))

#Let's have reproducible experiments
my_seed = 42
random.seed(my_seed)
np.random.seed(my_seed)


Saving matches_ln40.csv to matches_ln40 (1).csv


In [None]:
#Instantiate reader and data
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['JobseekerID', 'VacancyID', 'Rating']], reader)
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# train= 80% of the data, test = 20% of the data
threshold = int(0.8 * len(raw_ratings))
train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings = raw_ratings[threshold:]

# Let's check the split
print("Number of rows in train_raw_ratings:", len(train_raw_ratings))
print("Number of rows in test_raw_ratings:", len(test_raw_ratings))

data.raw_ratings = train_raw_ratings  # data is now the set train_raw_ratings

Number of rows in train_raw_ratings: 36253
Number of rows in test_raw_ratings: 9064


In [None]:
# STEP 3: Perform grid search over the hyperparameters, fit the algorithm to the training set. 
# After knowing the best parameters you may want to plot k against the MAE. In this case, you may skip the code until then.

# Define the algortihm to be used for GridSearch
algo = KNNWithMeans

# Define the GridSearchCV object and fit the trainset to it
print("Performing Grid Search...")
param_grid = {
      'k': [5],
      'sim_options': {
      'name': ['pearson_baseline'],
      'min_support': [0],
      'user_based': [False],
      'shrinkage': [80],
    },
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs = -1)
gs.fit(data) #Recall: this is the train_raw_ratings!

print("Best MAE score:", gs.best_score["mae"])
print("Best MAE parameters:", gs.best_params["mae"])
print("Best RMSE score:", gs.best_score["rmse"])
print("Best RMSE parameters:", gs.best_params["rmse"])

Performing Grid Search...
Best MAE score: 1.1058124754894245
Best MAE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}
Best RMSE score: 1.4316305869913957
Best RMSE parameters: {'k': 5, 'sim_options': {'name': 'pearson_baseline', 'min_support': 0, 'user_based': False, 'shrinkage': 80}}


In [None]:
# get the results. This code outputs the tables
results = pd.DataFrame(gs.cv_results)
display(results)

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,1.419802,1.460555,1.433305,1.434304,1.460586,1.433571,1.425266,1.428806,1.42264,1.39747,...,1.105812,0.015611,1,0.050999,0.001797,0.048981,0.006019,"{'k': 5, 'sim_options': {'name': 'pearson_base...",5,"{'name': 'pearson_baseline', 'min_support': 0,..."


In [None]:
algo = gs.best_estimator["mae"]

In [None]:
# Step 4: Fit the algorithm on the training set with the best hyperparameters of MAE, so retrain on the whole set train_raw_ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on train_raw_ratings
predictions = algo.test(trainset.build_testset())
print("Biased accuracy on the train set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Biased accuracy on the train set,
MAE:  0.1527
MAE: 0.1527047444834473
RMSE: 0.2350
RMSE: 0.2350158426847993


In [None]:
# Compute unbiased accuracy on test_raw_ratings
testset = data.construct_testset(test_raw_ratings)  # testset is now the set test_raw_rating
predictions = algo.test(testset)
print("Unbiased accuracy on the test set,")
print(f"MAE: {accuracy.mae(predictions):.16f}") 
print(f"RMSE: {accuracy.rmse(predictions):.16f}")

Unbiased accuracy on the test set,
MAE:  1.0735
MAE: 1.0735051065134356
RMSE: 1.3940
RMSE: 1.3939555370703964
