### Load Library

In [None]:
# !pip install kneed plotly

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans 
from kneed import KneeLocator
import plotly.graph_objects as go

In [None]:
import sys
import warnings
import pickle 
warnings.filterwarnings('ignore')
sys.path.append("..")
from modules import helper_functions as hf
from modules import similarity_functions as sf

### Load Data

In [None]:
# Read only the columns this notebook uses from each CSV file.
job_columns = ['title', 'department', 'description_combined']
resume_columns = ['Category', 'Resume_c']
df_jobs = pd.read_csv("../data/df_job_final.csv", usecols=job_columns)
df_resume = pd.read_csv("../data/data_resume_cc.csv", usecols=resume_columns)

In [None]:
# Threshold forwarded to hf.get_map_category for both tables
# (presumably a minimum category size — confirm in helper_functions).
THRESHOLD = 50
# Job postings: row count before and after mapping on the 'department' column.
print(f'length of job data before filtering: {len(df_jobs)}')
df_jobs = hf.get_map_category(df_jobs, 'department', THRESHOLD )
print(f'length of job data after filtering: {len(df_jobs)}')

# Resumes: the same treatment on the 'Category' column.
print(f'length of resume data before filtering: {len(df_resume)}')
df_resume = hf.get_map_category(df_resume, 'Category', THRESHOLD)
print(f'length of resume data after filtering: {len(df_resume)}')

In [None]:
# fig = go.Figure()
# resume_category = df_resume['Category'].value_counts()
# keys = resume_category.keys()
# list_y = [count for count in resume_category]

# fig.add_trace(go.Bar(x = keys,
#                     y = list_y,
#                     text= list_y))
# fig.update_layout(
#     title_text='Resume Data'
# )
# fig.show()

In [None]:
# fig = go.Figure()
# job_department = df_jobs['department'].value_counts()
# keys = job_department.keys()
# list_y = [count for count in job_department]

# fig.add_trace(go.Bar(x = keys,
#                     y = list_y,
#                     text= list_y))
# fig.update_layout(
#     title_text='Job Data'
# )
# fig.show()

### Data Persistence

In [None]:
# Fit a TF-IDF vectorizer on every job description and keep a dense copy of
# the document-term matrix whose columns are labelled with the vocabulary terms.
tfidf_jobs = TfidfVectorizer()
tfidf_job_matrix = tfidf_jobs.fit_transform(df_jobs['description_combined'])
df_tfidf_jobs = pd.DataFrame(
    tfidf_job_matrix.toarray(),
    columns=tfidf_jobs.get_feature_names_out(),
)

# Hand the fitted vectorizer to the project helper that saves it to disk.
path = './pretrained/tfidf_myjob.pkl'
hf.save_tfidf(path, tfidf_jobs)

In [None]:
# Reload the vectorizer from the path it was saved to in the previous cell.
vec = hf.load_tfidf('./pretrained/tfidf_myjob.pkl')

In [None]:
# load from pickle
# vec = hf.load_tfidf('./pretrained/tfidf_job.pkl')
# svm_clf = hf.load_tfidf('./pretrained/tfidf_clf.pkl')
# kmean_model = hf.load_tfidf('./pretrained/tfidf_cluster.pkl')

### Train Validation Test

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Numeric code for every department label, stored next to the original column.
enc = OrdinalEncoder()
df_jobs['encoded_department'] = enc.fit_transform(df_jobs[['department']])

# Set aside the rows that have no department and keep only the labelled ones.
_department_missing = df_jobs['department'].isna()
df_jobs_nan = df_jobs[_department_missing]
df_jobs = df_jobs[~_department_missing]

# Three train/validation/test partitions of the labelled jobs; the variable
# suffixes give the percentage of data in each part.
_texts = df_jobs['description_combined']
_labels = df_jobs['department']
# 60 / 20 / 20
(x_train_60, x_val_20, x_test_20,
 y_train_60, y_val_20, y_test_20) = hf.train_val_test_split(_texts, _labels, 0.6, 0.2, 0.2)
# 70 / 15 / 15
(x_train_70, x_val_15, x_test_15,
 y_train_70, y_val_15, y_test_15) = hf.train_val_test_split(_texts, _labels, 0.7, 0.15, 0.15)
# 80 / 10 / 10
(x_train_80, x_val_10, x_test_10,
 y_train_80, y_val_10, y_test_10) = hf.train_val_test_split(_texts, _labels, 0.8, 0.1, 0.1)

### Model Tuning and Choosing the Best Classification Model

####  Naive Bayes

In [None]:
# Naive Bayes results, keyed by training percentage ('60', '70', '80').
nb_clfs = {}
# Hyper-parameter grid passed to hf.tunning for MultinomialNB.
crit = [
    {
        "alpha": [0.001, 0.01, 0.1, 1],
        "fit_prior": [True, False],
    }
]

In [None]:
# Tune MultinomialNB on the 60% training split (cv=10), then record how the
# tuned estimator performs on the matching test and validation parts.
tunned_nb_estimator = hf.tunning(
    model=MultinomialNB(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_60,
    y=y_train_60,
)

nb_clfs['60'] = hf.get_classification_model_performance(
    tunned_nb_estimator, vec,
    x_train_60, x_test_20, x_val_20,
    y_train_60, y_test_20, y_val_20,
)

In [None]:
# Tune MultinomialNB on the 70% training split (cv=10), then record how the
# tuned estimator performs on the matching test and validation parts.
tunned_nb_estimator = hf.tunning(model=MultinomialNB(),
                                     vectorizer=vec,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_70,
                                     y=y_train_70)

# Store under '70' only. The previous chained assignment also rebound
# nb_clfs['60'], replacing the 60% result with this one.
nb_clfs['70'] = hf.get_classification_model_performance(tunned_nb_estimator, vec,
                                                        x_train_70, x_test_15, x_val_15,
                                                        y_train_70, y_test_15, y_val_15)

In [None]:
# Tune MultinomialNB on the 80% training split (cv=10), then record how the
# tuned estimator performs on the matching test and validation parts.
tunned_nb_estimator = hf.tunning(
    model=MultinomialNB(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_80,
    y=y_train_80,
)

nb_clfs['80'] = hf.get_classification_model_performance(
    tunned_nb_estimator, vec,
    x_train_80, x_test_10, x_val_10,
    y_train_80, y_test_10, y_val_10,
)

#### Linear SVM

In [None]:
# SVM results, keyed by training percentage ('60', '70', '80').
svm_clfs = {}
# Hyper-parameter grid passed to hf.tunning for SVC. The sigmoid kernel is
# omitted because every sigmoid configuration failed during fitting.
crit = [
    {
        "C": [0.01, 0.1, 1],
        "kernel": ['linear', 'poly', 'rbf'],
    }
]

In [None]:
# Tune SVC on the 60% training split (cv=10), then record how the tuned
# estimator performs on the matching test and validation parts.
tunned_svm_estimator = hf.tunning(
    model=SVC(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_60,
    y=y_train_60,
)

svm_clfs['60'] = hf.get_classification_model_performance(
    tunned_svm_estimator, vec,
    x_train_60, x_test_20, x_val_20,
    y_train_60, y_test_20, y_val_20,
)

In [None]:
# Tune SVC on the 70% training split (cv=10), then record how the tuned
# estimator performs on the matching test and validation parts.
tunned_svm_estimator = hf.tunning(
    model=SVC(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_70,
    y=y_train_70,
)

svm_clfs['70'] = hf.get_classification_model_performance(
    tunned_svm_estimator, vec,
    x_train_70, x_test_15, x_val_15,
    y_train_70, y_test_15, y_val_15,
)

In [None]:
# Tune SVC on the 80% training split (cv=10). The search previously ran on
# x_train_60 / y_train_60 although the result is stored and evaluated as the
# 80/10/10 model below.
tunned_svm_estimator = hf.tunning(model= SVC(),
                                     vectorizer=vec,
                                     crit=crit,
                                     cv=10,
                                     x=x_train_80,
                                     y=y_train_80)

# Record how the tuned estimator performs on the 80/10/10 partition.
svm_clfs['80'] = hf.get_classification_model_performance(tunned_svm_estimator, vec,
                                                        x_train_80, x_test_10, x_val_10,
                                                        y_train_80, y_test_10, y_val_10)

#### KNN

In [None]:
# KNN results, keyed by training percentage ('60', '70', '80').
knn_clfs = {}
# Hyper-parameter grid passed to hf.tunning for KNeighborsClassifier:
# k from 4 to 14 inclusive, with both weighting schemes.
crit = [
    {
        "n_neighbors": range(4, 15),
        "weights": ['uniform', 'distance'],
    }
]

In [None]:
# Tune KNeighborsClassifier on the 60% training split (cv=10), then record how
# the tuned estimator performs on the matching test and validation parts.
tunned_knn_estimator = hf.tunning(
    model=KNeighborsClassifier(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_60,
    y=y_train_60,
)

knn_clfs['60'] = hf.get_classification_model_performance(
    tunned_knn_estimator, vec,
    x_train_60, x_test_20, x_val_20,
    y_train_60, y_test_20, y_val_20,
)

In [None]:
# Tune KNeighborsClassifier on the 70% training split (cv=10), then record how
# the tuned estimator performs on the matching test and validation parts.
tunned_knn_estimator = hf.tunning(
    model=KNeighborsClassifier(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_70,
    y=y_train_70,
)

knn_clfs['70'] = hf.get_classification_model_performance(
    tunned_knn_estimator, vec,
    x_train_70, x_test_15, x_val_15,
    y_train_70, y_test_15, y_val_15,
)

In [None]:
# Tune KNeighborsClassifier on the 80% training split (cv=10), then record how
# the tuned estimator performs on the matching test and validation parts.
tunned_knn_estimator = hf.tunning(
    model=KNeighborsClassifier(),
    vectorizer=vec,
    crit=crit,
    cv=10,
    x=x_train_80,
    y=y_train_80,
)

knn_clfs['80'] = hf.get_classification_model_performance(
    tunned_knn_estimator, vec,
    x_train_80, x_test_10, x_val_10,
    y_train_80, y_test_10, y_val_10,
)

In [None]:
# Show the collected Naive Bayes results as the cell output.
nb_clfs

In [None]:
# Show the collected SVM results as the cell output.
svm_clfs

In [None]:
# Show the collected KNN results as the cell output.
knn_clfs

### Classification Model Summary

In [None]:
import plotly.graph_objects as go
# Colour ramp for the bar charts: red rises in steps of 30, blue falls in
# steps of 25, green is fixed at 120. Each channel has nine entries.
SET_RED = np.arange(0, 255, 30)
SET_BLUE = np.arange(235, 10, -25)
SET_GREEN = [120] * SET_RED.size
evaluation = [
    'accuracy',
    'cv',
    'macro_precision',
    'macro_recall',
    'macro_f1_score',
    'time_cost',
]

In [None]:
# Display names in the same order the results are appended below:
# Naive Bayes, SVM, KNN — each in the insertion order of its results dict.
model_name = ['nb_clf_60', 'nb_clf_70', 'nb_clf_80',
            'svm_clf_60', 'svm_clf_70', 'svm_clf_80',
            'knn_clf_60', 'knn_clf_70', 'knn_clf_80']

# One list per reported field; position i belongs to model_name[i].
dict_evaluation = {
    'estimator': [],
    'accuracy': [],
    'cv10': [],
    'precision': [],
    'recall': [],
    'f1_score': [],
    'time_cost': [],
}

# Every family reads the keys of its own result dict. The SVM and KNN loops
# previously took their keys from nb_clfs, which fails with a KeyError when a
# training size is missing there and silently depends on identical key sets.
for clfs in (nb_clfs, svm_clfs, knn_clfs):
    for train_size in clfs:
        for key, value in clfs[train_size].items():
            dict_evaluation[key].append(value)

In [None]:
# !pip install nbformat

In [None]:
import plotly.io as pio
pio.renderers.default = "notebook"
fig = go.Figure()
keys = ['accuracy', 'precision', 'recall', 'f1_score', 'cv10']

# One grouped-bar trace per palette entry; index i selects the model name,
# its scores in dict_evaluation and its RGB colour.
for i, red in enumerate(SET_RED):
    list_y = [dict_evaluation[key][i] for key in keys]
    fig.add_trace(go.Bar(x=keys,
                         y=list_y,
                         name=model_name[i],
                         marker_color=f'rgb({red}, {SET_GREEN[i]}, {SET_BLUE[i]})',
                         text=list_y))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    title_text='Different Models Accuracy-Precision-Recall-F1',
    title_x=0.5,
    xaxis_tickfont_size=16,
    yaxis=dict(
        # The axis title font is set through title.font; the older
        # `titlefont` attribute is deprecated and absent from recent Plotly.
        title=dict(text='Score', font=dict(size=16)),
        tickfont_size=16,
    ),
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.1,
        xanchor="left",
        x=-0.04,
    ),
    barmode='group',
    bargap=0.15,  # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
fig.show()

In [None]:
fig = go.Figure()
keys = ['time_cost']

# One bar per palette entry; index i selects the model name, its recorded
# time cost in dict_evaluation and its RGB colour.
for i, red in enumerate(SET_RED):
    list_y = [dict_evaluation[key][i] for key in keys]
    fig.add_trace(go.Bar(x=keys,
                         y=list_y,
                         name=model_name[i],
                         marker_color=f'rgb({red}, {SET_GREEN[i]}, {SET_BLUE[i]})',
                         text=list_y))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    title_text='Time Cost for Training',
    title_x=0.5,
    xaxis_tickfont_size=16,
    yaxis=dict(
        # The axis title font is set through title.font; the older
        # `titlefont` attribute is deprecated and absent from recent Plotly.
        # NOTE(review): the unit label assumes hf reports time_cost in minutes — confirm.
        title=dict(text='minutes', font=dict(size=16)),
        tickfont_size=16,
    ),
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.1,
        xanchor="left",
        x=-0.04,
    ),
    barmode='group',
    bargap=0.15,  # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
fig.show()

## Content-based RS

### Classification Models(Naive Bayes, SVM, KNN)

#### Naive Bayes

In [None]:
# train a classification model based on job description
from sklearn.base import clone

# The pipeline gets its own unfitted copy of the vectorizer. Pipeline.fit fits
# its steps in place, so handing it the shared `vec` would refit that object
# on this training split and change it for every other user of `vec`
# (assumes `vec` is a scikit-learn estimator, as saved earlier in the notebook).
nb_clf = Pipeline([
    ('tf', clone(vec)),
    ('clf', nb_clfs['60']['estimator']),
])

nb_clf.fit(x_train_60, y_train_60)
pred = nb_clf.predict(x_test_20)
# Share of resumes whose predicted department equals their labelled category.
pred_resume = nb_clf.predict(df_resume['Resume_c'])
accuracy = np.mean(pred_resume == df_resume['Category'])
accuracy

In [None]:
# Sorted distinct department labels, then the report produced by
# hf.get_classification_report for the Naive Bayes predictions on the 20% test part.
unique_labels = np.unique(df_jobs['department'])
cf_report = hf.get_classification_report(unique_labels, y_test_20, pred)

In [None]:
# Use the first resume in df_resume as the example applicant.
applicant = df_resume.iloc[0]
# Department predicted for that resume by the Naive Bayes pipeline.
pred_applicant = nb_clf.predict([applicant['Resume_c']])[0]
# Cell output: whether the prediction equals the resume's labelled category.
pred_applicant == applicant['Category']

In [None]:
# Row positions of the jobs the helper selects for the predicted department,
# compared with the resume through cosine similarity.
jobs_top_n = hf.get_top_n_jobs_from_clf(
    df_jobs, pred_applicant, applicant['Resume_c'], vec, sf.cal_cosine_similarity)
_shown_columns = ['title', 'department', 'description_combined']
recommendation_from_clf = df_jobs.iloc[jobs_top_n][_shown_columns]
# Keep only the recommendations in the applicant's labelled category.
_in_category = recommendation_from_clf['department'] == applicant['Category']
recommendation_from_clf_filtered = recommendation_from_clf[_in_category]

#### SVM


In [None]:
# train a classification model based on job description
from sklearn.base import clone

# The pipeline gets its own unfitted copy of the vectorizer. Pipeline.fit fits
# its steps in place, so handing it the shared `vec` would refit that object
# on this training split; any later refit of `vec` would then leave this
# pipeline with a vocabulary that no longer matches its trained classifier
# (assumes `vec` is a scikit-learn estimator, as saved earlier in the notebook).
svm_clf = Pipeline([
    ('tf', clone(vec)),
    ('clf', svm_clfs['70']['estimator']),
])

svm_clf.fit(x_train_70, y_train_70)
pred = svm_clf.predict(x_test_15)
# Share of resumes whose predicted department equals their labelled category.
pred_resume = svm_clf.predict(df_resume['Resume_c'])
accuracy = np.mean(pred_resume == df_resume['Category'])
accuracy
# Cell output is the training-split shape, since it is the last expression.
x_train_70.shape

In [None]:
# Report produced by hf.get_classification_report for the SVM predictions on
# the 15% test part, over the sorted distinct department labels.
unique_labels = np.unique(df_jobs['department'])
cf_report = hf.get_classification_report(unique_labels, y_test_15, pred)

# Example applicant: the first resume, with the department the SVM predicts for it.
applicant = df_resume.iloc[0]
pred_applicant = svm_clf.predict([applicant['Resume_c']])[0]

# Jobs the helper selects for that department, then the subset whose
# department equals the applicant's labelled category.
jobs_top_n = hf.get_top_n_jobs_from_clf(
    df_jobs, pred_applicant, applicant['Resume_c'], vec, sf.cal_cosine_similarity)
_shown_columns = ['title', 'department', 'description_combined']
recommendation_from_svm_clf = df_jobs.iloc[jobs_top_n][_shown_columns]
_in_category = recommendation_from_svm_clf['department'] == applicant['Category']
recommendation_from_svm_clf_filtered = recommendation_from_svm_clf[_in_category]

#### KNN

In [None]:
# train a classification model based on job description
from sklearn.base import clone

# The pipeline gets its own unfitted copy of the vectorizer. Pipeline.fit fits
# its steps in place, so handing it the shared `vec` would refit that object
# on this training split and invalidate earlier pipelines built on the same
# object (assumes `vec` is a scikit-learn estimator, as saved earlier).
knn_clf = Pipeline([
    ('tf', clone(vec)),
    ('clf', knn_clfs['80']['estimator']),
])

knn_clf.fit(x_train_80, y_train_80)
pred = knn_clf.predict(x_test_10)
# Share of resumes whose predicted department equals their labelled category.
pred_resume = knn_clf.predict(df_resume['Resume_c'])
accuracy = np.mean(pred_resume == df_resume['Category'])
accuracy


In [None]:
# Report produced by hf.get_classification_report for the KNN predictions on
# the 10% test part, over the sorted distinct department labels.
unique_labels = np.unique(df_jobs['department'])
cf_report = hf.get_classification_report(unique_labels, y_test_10, pred)

# Example applicant: the first resume, with the department KNN predicts for it.
applicant = df_resume.iloc[0]
pred_applicant = knn_clf.predict([applicant['Resume_c']])[0]

# Jobs the helper selects for that department, then the subset whose
# department equals the applicant's labelled category.
jobs_top_n = hf.get_top_n_jobs_from_clf(
    df_jobs, pred_applicant, applicant['Resume_c'], vec, sf.cal_cosine_similarity)
_shown_columns = ['title', 'department', 'description_combined']
recommendation_from_knn_clf = df_jobs.iloc[jobs_top_n][_shown_columns]
_in_category = recommendation_from_knn_clf['department'] == applicant['Category']
recommendation_from_knn_clf_filtered = recommendation_from_knn_clf[_in_category]

### Clustering Model(Kmeans)

In [None]:
# TF-IDF representation of every job description: the matrix returned by the
# vectorizer, plus a dense DataFrame copy for the elbow search below.
_job_texts = df_jobs['description_combined']
tfidf_matrix = vec.transform(_job_texts)
_dense_tfidf = tfidf_matrix.toarray()
df_tfidf = pd.DataFrame(_dense_tfidf)

In [None]:
from sklearnex import unpatch_sklearn
unpatch_sklearn()
from sklearn.cluster import KMeans
from kneed import KneeLocator
import plotly.graph_objects as go
def elbow_method(data, number):
    """Plot the K-Means elbow curve for 1..number clusters and mark the knee.

    A KMeans model is fitted for every cluster count from 1 to ``number``
    inclusive, its inertia (within-cluster sum of squares) is recorded, and
    the curve is shown with a dashed vertical line at the knee located by
    ``KneeLocator``.

    Args:
        data: Feature matrix accepted by ``KMeans.fit``.
        number: Largest cluster count to try (inclusive).
    """
    # Build the x-axis once; it is shared by the fit loop, the knee search and the plot.
    cluster_counts = list(range(1, number + 1))
    wcss = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    kn = KneeLocator(cluster_counts, wcss, curve='convex', direction='decreasing')

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=cluster_counts,
                            y=wcss))
    # KneeLocator leaves `knee` as None when it finds no knee; draw the
    # marker only when there is a position to draw it at.
    if kn.knee is not None:
        fig.add_vline(x=kn.knee, line_width=3, line_dash="dash", line_color="green")

    fig.update_layout(title='Elbow Method',
                      xaxis_title='Number of clusters',
                      yaxis_title='WCSS',
                      title_x=0.5,
                      height=500,
                      width=800)
    fig.show()

In [None]:
# render setting for vscode
import plotly.io as pio
pio.renderers.default = "notebook"
# Elbow curve over 1..15 clusters on the dense job TF-IDF matrix.
elbow_method(df_tfidf, 15)

In [None]:
# Fit K-Means with one cluster per distinct department on the dense job
# TF-IDF matrix, then place the first resume into one of those clusters.
applicant = df_resume.iloc[0]
_n_departments = len(np.unique(df_jobs['department']))
kmean_model = KMeans(n_clusters=_n_departments, random_state=41)
_job_features = pd.DataFrame(vec.transform(df_jobs['description_combined']).toarray())
pred_kmeans_jobs = kmean_model.fit_predict(_job_features)
# Index of the cluster assigned to the applicant's resume.
_applicant_features = vec.transform([applicant['Resume_c']])
pred_kmeans_applicant = kmean_model.predict(_applicant_features)[0]
# Attach each job's cluster label to the job table.
df_jobs['cluster'] = kmean_model.labels_

In [None]:
# Row positions of the jobs the helper selects from the applicant's cluster,
# compared with the resume through cosine similarity.
top_n_recommend_cluster = hf.get_top_n_jobs_from_cluster(
    df_jobs, pred_kmeans_applicant, applicant['Resume_c'], vec, sf.cal_cosine_similarity)
_shown_columns = ['title', 'department', 'description_combined']
recommendation_from_cluster = df_jobs.iloc[top_n_recommend_cluster][_shown_columns]
# Keep only the recommendations in the applicant's labelled category.
_in_category = recommendation_from_cluster['department'] == applicant['Category']
recommendation_from_cluster_filtered = recommendation_from_cluster[_in_category]

## Collaborative filtering

In [None]:
# Assumption: the first half of the resume table stands in for applicants who
# already received offers.
# NOTE(review): an earlier comment described this pool as the last 10% of
# applicants; the slice takes the first 50% — confirm which is intended.
_pool_size = int(len(df_resume) * 0.5)
applicant_pool_with_offer = df_resume[:_pool_size]

In [None]:
# Example applicant (first resume) and the offer-holders who share its category.
applicant = df_resume.iloc[0]
_same_category = applicant_pool_with_offer['Category'] == applicant['Category']
application_pool = applicant_pool_with_offer[_same_category]

In [None]:
# Stack the pool's resumes with the applicant's resume in last position.
# pd.concat replaces Series.append, which pandas 2.0 removed.
temp = pd.concat([application_pool['Resume_c'], pd.Series(applicant['Resume_c'])])
matrix = vec.transform(temp)
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn versions refuse as input.
term_matrix = matrix.toarray()
cossim = sf.cal_cosine_similarity(term_matrix)
# The last row holds the applicant's similarity to every stacked resume.
# Scores equal to 1 (the applicant itself) are dropped, and the positions of
# the ten highest remaining scores are kept, best first.
# NOTE(review): the positions refer to the filtered score vector, i.e. rows of
# application_pool, not rows of df_resume — confirm the consumer expects that.
applicant_row = cossim[-1]
index_similar_applicant = np.asarray(applicant_row[np.where(applicant_row < 1)]).argsort()[::-1][:10]

In [None]:
# Show the positions of the most similar pool applicants as the cell output.
index_similar_applicant

In [None]:
# Job positions returned by the collaborative-filtering helper for the similar
# applicants found above.
# NOTE(review): this call passes knn_clf, while the per-department evaluation
# cells pass svm_clf to the same helper — confirm which classifier is intended.
cf_jobs = hf.get_top_n_jobs_from_cf(df_jobs, df_resume, index_similar_applicant, knn_clf, vec, sf.cal_cosine_similarity, 1)

In [None]:
# Collaborative-filtering recommendations, then the subset whose department
# equals the applicant's labelled category.
_shown_columns = ['title', 'department', 'description_combined']
recommendation_from_cf = df_jobs.iloc[cf_jobs][_shown_columns]
_in_category = recommendation_from_cf['department'] == applicant['Category']
recommendation_from_cf_filtered = recommendation_from_cf[_in_category]

## Overall Recommendation Accuracy

## Similarity

### Cosine Similarity

In [None]:
# Import the function that is actually called. The cell previously imported
# only patch_sklearn and relied on an earlier cell having imported unpatch_sklearn.
from sklearnex import patch_sklearn, unpatch_sklearn
unpatch_sklearn()
from sklearn.svm import SVC
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

def cal_cosine_similarity(term_matrix):
    """Return the cosine similarity between every pair of rows of *term_matrix*.

    The matrix is compared against itself, so entry (i, j) of the result is
    the similarity between row i and row j.
    """
    pairwise_scores = cosine_similarity(term_matrix, Y=term_matrix)
    return pairwise_scores

def cal_jaccard_score(term_matrix):
    """Return the sample-averaged Jaccard score of *term_matrix* against itself.

    NOTE(review): unlike cal_cosine_similarity this passes the same matrix as
    both y_true and y_pred with average='samples', which yields one averaged
    score rather than a row-by-row similarity matrix — confirm this is the
    intended measure before using it as a pairwise similarity.
    """
    score = jaccard_score(y_true=term_matrix, y_pred=term_matrix, average='samples')
    return score

def cal_pearson_score(term_matrix):
    """Return the Pearson correlation coefficients of the rows of *term_matrix*.

    The matrix is passed as both ``x`` and ``y`` to ``np.corrcoef``, so the
    rows are stacked twice and the result has 2n rows and columns for an
    n-row input.

    NOTE(review): the top-left n-by-n block already holds every row-to-row
    correlation — confirm callers expect the doubled shape.
    """
    correlations = np.corrcoef(term_matrix, y=term_matrix)
    return correlations

In [None]:
# Per-department recommendation accuracy with cosine similarity, for the three
# recommenders: SVM classifier, K-Means clusters and collaborative filtering.
_half = int(len(df_resume) * 0.5)
resumes = df_resume[:_half]['Resume_c']
categories = df_resume[:_half]['Category']
sim_method = cal_cosine_similarity
unique_labels = df_jobs['department'].unique()
accuracy_clf = dict()
accuracy_cluster = dict()
accuracy_cf = dict()


def _department_hit_rate(job_positions, department):
    """Return the share of recommended jobs whose department is *department*.

    job_positions are positional row indices into df_jobs; an empty
    recommendation list scores 0.
    """
    rec = df_jobs.iloc[job_positions]['department']
    if len(rec) == 0:
        return 0
    return int((rec == department).sum()) / len(rec)


for department in unique_labels:
    accuracy_clf[department] = []
    accuracy_cluster[department] = []
    accuracy_cf[department] = []
    resumes = df_resume[df_resume['Category'] == department]['Resume_c']
    # Offer-holders of this department; identical for every resume below, so
    # the filter runs once per department instead of once per resume.
    application_pool = applicant_pool_with_offer[applicant_pool_with_offer['Category'] == department]
    for resume in resumes:
        # content-based with svm
        pred_applicant_ = svm_clf.predict([resume])[0]
        job_index = hf.get_top_n_jobs_from_clf(df_jobs, pred_applicant_, resume, vec, sim_method)
        accuracy_clf[department].append(_department_hit_rate(job_index, department))
        # content-based with kmeans
        pred_applicant_ = kmean_model.predict(vec.transform([resume]))[0]
        job_index = hf.get_top_n_jobs_from_cluster(df_jobs, pred_applicant_, resume, vec, sim_method)
        accuracy_cluster[department].append(_department_hit_rate(job_index, department))
        # cf
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        temp = pd.concat([application_pool['Resume_c'], pd.Series(resume)])
        # toarray() returns an ndarray; np.matrix from todense() is refused by
        # recent scikit-learn versions.
        term_matrix = vec.transform(temp).toarray()
        cossim = sim_method(term_matrix)
        index_similar_applicant = np.asarray(cossim[-1][np.where(cossim[-1] < 1)]).argsort()[::-1][:10]
        cf_jobs = hf.get_top_n_jobs_from_cf(df_jobs, df_resume, index_similar_applicant, svm_clf, vec, sim_method, 1)
        # Every resume in this loop is labelled `department`, so that is the
        # target. The previous lookup, categories.iloc[i], indexed the first
        # half of df_resume by the position inside this department and so
        # compared against an unrelated resume's category.
        accuracy_cf[department].append(_department_hit_rate(cf_jobs, department))

# Collapse the per-resume scores into one mean per department.
for department in unique_labels:
    accuracy_clf[department] = np.mean(accuracy_clf[department])
    accuracy_cluster[department] = np.mean(accuracy_cluster[department])
    accuracy_cf[department] = np.mean(accuracy_cf[department])

In [None]:
# One bar trace per recommender; the accuracy dicts were filled in
# unique_labels order, so their values line up with the x axis.
fig = go.Figure()
for _trace_name, _scores in (('SVM', accuracy_clf),
                             ('KMeans', accuracy_cluster),
                             ('CF', accuracy_cf)):
    list_y = list(_scores.values())
    fig.add_trace(go.Bar(x=unique_labels, y=list_y, text=list_y, name=_trace_name))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(title_text='Recommendation Accuracy')
fig.show()

### Pearson Similarity

In [None]:
# Per-department recommendation accuracy with Pearson similarity, for the three
# recommenders: SVM classifier, K-Means clusters and collaborative filtering.
_half = int(len(df_resume) * 0.5)
resumes = df_resume[:_half]['Resume_c']
categories = df_resume[:_half]['Category']
sim_method = sf.cal_pearson_score
unique_labels = df_jobs['department'].unique()
accuracy_clf = dict()
accuracy_cluster = dict()
accuracy_cf = dict()


def _department_hit_rate(job_positions, department):
    """Return the share of recommended jobs whose department is *department*.

    job_positions are positional row indices into df_jobs; an empty
    recommendation list scores 0.
    """
    rec = df_jobs.iloc[job_positions]['department']
    if len(rec) == 0:
        return 0
    return int((rec == department).sum()) / len(rec)


for department in unique_labels:
    accuracy_clf[department] = []
    accuracy_cluster[department] = []
    accuracy_cf[department] = []
    resumes = df_resume[df_resume['Category'] == department]['Resume_c']
    print(resumes.shape)
    # Offer-holders of this department; identical for every resume below, so
    # the filter runs once per department instead of once per resume.
    application_pool = applicant_pool_with_offer[applicant_pool_with_offer['Category'] == department]
    for resume in resumes:
        # content-based with svm
        pred_applicant_ = svm_clf.predict([resume])[0]
        job_index = hf.get_top_n_jobs_from_clf(df_jobs, pred_applicant_, resume, vec, sim_method)
        accuracy_clf[department].append(_department_hit_rate(job_index, department))
        # content-based with kmeans
        pred_applicant_ = kmean_model.predict(vec.transform([resume]))[0]
        job_index = hf.get_top_n_jobs_from_cluster(df_jobs, pred_applicant_, resume, vec, sim_method)
        accuracy_cluster[department].append(_department_hit_rate(job_index, department))
        # cf
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        temp = pd.concat([application_pool['Resume_c'], pd.Series(resume)])
        # toarray() returns a plain ndarray instead of the np.matrix that todense() gives.
        term_matrix = vec.transform(temp).toarray()
        cossim = sim_method(term_matrix)
        index_similar_applicant = np.asarray(cossim[-1][np.where(cossim[-1] < 1)]).argsort()[::-1][:10]
        cf_jobs = hf.get_top_n_jobs_from_cf(df_jobs, df_resume, index_similar_applicant, svm_clf, vec, sim_method, 1)
        # Every resume in this loop is labelled `department`, so that is the
        # target. The previous lookup, categories.iloc[i], indexed the first
        # half of df_resume by the position inside this department and so
        # compared against an unrelated resume's category.
        accuracy_cf[department].append(_department_hit_rate(cf_jobs, department))

# Collapse the per-resume scores into one mean per department.
for department in unique_labels:
    accuracy_clf[department] = np.mean(accuracy_clf[department])
    accuracy_cluster[department] = np.mean(accuracy_cluster[department])
    accuracy_cf[department] = np.mean(accuracy_cf[department])

In [None]:
# One bar trace per recommender; the accuracy dicts were filled in
# unique_labels order, so their values line up with the x axis.
fig = go.Figure()
for _trace_name, _scores in (('SVM', accuracy_clf),
                             ('KMeans', accuracy_cluster),
                             ('CF', accuracy_cf)):
    list_y = list(_scores.values())
    fig.add_trace(go.Bar(x=unique_labels, y=list_y, text=list_y, name=_trace_name))

fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(title_text='Recommendation Accuracy')
fig.show()

### Jaccard Similarity

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

# Job departments followed by the first resume's category.
# pd.concat replaces Series.append, which pandas 2.0 removed.
# NOTE(review): wrapping the Series in a list hands the encoder a single row
# with one column per label — confirm this is the intended shape.
y_true = [pd.concat([df_jobs['department'], pd.Series(df_resume.iloc[0]['Category'])])]
y_true = enc.fit_transform(y_true)

In [None]:
# Job descriptions followed by the first resume's text.
# pd.concat replaces Series.append, which pandas 2.0 removed.
x_pred = pd.concat([df_jobs['description_combined'], pd.Series(df_resume.iloc[0]['Resume_c'])])
# Predict all texts in one call instead of one pipeline call per text, then
# pair every predicted label with a constant 0 as before.
y_pred = [[label, 0] for label in svm_clf.predict(x_pred)]


In [None]:
# Encode the first predicted pair with the encoder fitted on y_true.
# NOTE(review): y_pred[0] is a single [label, 0] pair, i.e. a flat list, while
# OrdinalEncoder.transform is documented for 2-D input — confirm this cell runs.
y_pred = enc.transform(y_pred[0])
# # matrix = vec.transform(temp)
# # term_matrix = matrix.todense()
# sim_matrix = jaccard_score(y_true, y_pred, average='samples')

In [None]:
# Per-department recommendation accuracy with Jaccard similarity. Only the
# SVM-based recommender is evaluated in this cell.
_half = int(len(df_resume) * 0.5)
resumes = df_resume[:_half]['Resume_c']
categories = df_resume[:_half]['Category']
sim_method = sf.cal_jaccard_score
unique_labels = df_jobs['department'].unique()
accuracy_clf = dict()
accuracy_cluster = dict()
accuracy_cf = dict()
for department in unique_labels:
    accuracy_clf[department] = []
    accuracy_cluster[department] = []
    accuracy_cf[department] = []
    resumes = df_resume[df_resume['Category'] == department]['Resume_c']
    for resume in resumes:
        # content-based with svm (the pipeline used here is svm_clf)
        pred_applicant_ = svm_clf.predict([resume])[0]
        job_index = hf.get_top_n_jobs_from_clf(df_jobs, pred_applicant_, resume, vec, sim_method)
        rec = df_jobs.iloc[job_index][['department']]
        rec_filtered = rec[rec['department'] == department]
        accuracy_clf[department].append(len(rec_filtered) / len(rec) if len(rec) != 0 else 0)

for department in unique_labels:
    accuracy_clf[department] = np.mean(accuracy_clf[department])
    # No cluster or CF scores are collected in this cell. Record NaN directly:
    # np.mean over the empty lists gives the same NaN plus a RuntimeWarning.
    accuracy_cluster[department] = np.nan
    accuracy_cf[department] = np.nan

## Result

In [None]:
# Classifier-based recommendations for the example applicant.
recommendation_from_clf

In [None]:
# ... restricted to the applicant's labelled category.
recommendation_from_clf_filtered

In [None]:
# Cluster-based recommendations for the example applicant.
recommendation_from_cluster

In [None]:
# ... restricted to the applicant's labelled category.
recommendation_from_cluster_filtered

In [None]:
# Collaborative-filtering recommendations for the example applicant.
recommendation_from_cf

In [None]:
# ... restricted to the applicant's labelled category.
recommendation_from_cf_filtered

In [None]:
# Stack the unfiltered recommendations of all three recommenders
# (collaborative filtering, classifier, cluster) into one table.
list_all_rec = [
    recommendation_from_cf,
    recommendation_from_clf,
    recommendation_from_cluster,
]
all_recommendation = pd.concat(list_all_rec)
all_recommendation

In [None]:
# Stack the category-filtered recommendations of all three recommenders
# (collaborative filtering, classifier, cluster) into one table.
list_filtered_rec = [
    recommendation_from_cf_filtered,
    recommendation_from_clf_filtered,
    recommendation_from_cluster_filtered,
]
final_recommendation = pd.concat(list_filtered_rec)
final_recommendation

## Error Analysis

#### data 

In [None]:
# Recommendations whose department differs from the applicant's labelled category.
_wrong_department = all_recommendation['department'] != applicant['Category']
false_recommendation = all_recommendation[_wrong_department]
false_recommendation

In [None]:
# Dense TF-IDF rows for the descriptions of the mismatched recommendations.
_false_rec_sparse = vec.transform(false_recommendation['description_combined'])
false_recommendation_matrix = _false_rec_sparse.todense()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import wordcloud

# NOTE(review): these hard-coded values only decide how many figures are drawn
# (one per entry); the values themselves are not used to select anything —
# confirm what they were meant to index.
index_false_rec = [5973, 2295, 962, 948]

# The vocabulary terms present in each mismatched recommendation do not depend
# on the loop below, so they are computed once instead of on every pass.
feature_names = vec.get_feature_names_out()
false_recommendation_word_list = [
    [feature_names[i] for i in np.flatnonzero(row)]
    for row in np.asarray(false_recommendation_matrix)
]

# NOTE(review): every figure is generated from the FIRST recommendation's
# words, as before — confirm whether one cloud per recommendation was intended.
text = ' '.join(false_recommendation_word_list[0])
for it, target in enumerate(index_false_rec):
    wc = wordcloud.WordCloud(collocations=False, background_color='black', max_words=1000,
                            max_font_size=50)
    wc = wc.generate(text)
    fig = plt.figure(num=it)
    plt.axis('off')
    plt.imshow(wc, cmap=None)
    plt.show()

In [None]:
# Word cloud of the vocabulary terms that occur in the applicant's resume.
_resume_tfidf = vec.transform([applicant['Resume_c']])
b = _resume_tfidf.todense()
# Column positions of the non-zero TF-IDF weights, mapped back to their terms.
_, _term_positions = np.nonzero(b)
text = ' '.join(vec.get_feature_names_out()[_term_positions])

wc = wordcloud.WordCloud(
    collocations=False,
    background_color='black',
    max_words=100,
    max_font_size=50,
).generate(text)

fig = plt.figure(num=1)
plt.axis('off')
plt.imshow(wc, cmap=None)
plt.show()

#### bias and variance

In [None]:
from sklearn.decomposition import PCA
# A fractional n_components asks PCA to keep enough components to explain that
# share (95%) of the variance.
pca = PCA(n_components=0.95, iterated_power='auto', svd_solver='auto', random_state=41)
a = vec.transform(df_jobs['description_combined'])
# toarray() returns a plain ndarray; todense() returns np.matrix, which
# recent scikit-learn versions refuse as estimator input.
pca_a = pca.fit(a.toarray())
# Cell output: number of components that were kept.
pca_a.n_components_

In [None]:
import matplotlib.pyplot as plt
# Cumulative explained-variance ratio against the component index. np.cumsum
# computes the running total in one pass instead of re-summing a growing
# prefix for every point.
cumulative_variance = np.cumsum(pca_a.explained_variance_ratio_[:pca_a.n_components_])
plt.plot(range(pca_a.n_components_), cumulative_variance)
plt.show()

In [None]:
from sklearn.decomposition import TruncatedSVD
# Reduce the job TF-IDF matrix to as many components as PCA kept.
svd = TruncatedSVD(random_state=41, n_components=pca_a.n_components_)
_job_tfidf = vec.transform(df_jobs['description_combined'])
data = svd.fit_transform(_job_tfidf)
data

In [None]:
# Linear SVC (C=1) trained on the SVD-reduced features of the 70% training split.
svm_model = SVC(kernel='linear', C=1)  # svm_clf_70
_train_reduced = svd.transform(vec.transform(x_train_70))
svm_clf_reduced = svm_model.fit(_train_reduced, y_train_70)

In [None]:
# Accuracy of the reduced-feature SVC on the 15% test part.
_test_reduced = svd.transform(vec.transform(x_test_15))
pred_department_reduced = svm_clf_reduced.predict(_test_reduced)
np.mean(pred_department_reduced == y_test_15)

In [None]:
from mlxtend.evaluate import bias_variance_decomp

# Bias-variance decomposition of the SVM pipeline on the 70/15/15 split.
# Targets are the ordinal department codes, looked up by the row labels of each
# split and cast to integers.
_y_train_codes = df_jobs.loc[y_train_70.index, 'encoded_department'].to_numpy().astype(int)
_y_test_codes = df_jobs.loc[y_test_15.index, 'encoded_department'].to_numpy().astype(int)

# The department codes are nominal class ids, so the 0-1 loss is used. A
# squared error between class ids would depend on the arbitrary order in which
# the encoder numbered the departments.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        svm_clf, x_train_70.values, _y_train_codes, x_test_15.values, _y_test_codes,
        loss='0-1_loss',
        random_seed=41)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

In [None]:
from mlxtend.evaluate import bias_variance_decomp

# Bias-variance decomposition of the reduced-feature SVC on the 70/15/15 split.
# Inputs are the SVD-reduced TF-IDF features; targets are the ordinal
# department codes, looked up by the row labels of each split and cast to integers.
_x_train_reduced = svd.transform(vec.transform(x_train_70.values))
_x_test_reduced = svd.transform(vec.transform(x_test_15.values))
_y_train_codes = df_jobs.loc[y_train_70.index, 'encoded_department'].to_numpy().astype(int)
_y_test_codes = df_jobs.loc[y_test_15.index, 'encoded_department'].to_numpy().astype(int)

# The department codes are nominal class ids, so the 0-1 loss is used. A
# squared error between class ids would depend on the arbitrary order in which
# the encoder numbered the departments.
avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        svm_clf_reduced, _x_train_reduced, _y_train_codes,
                        _x_test_reduced, _y_test_codes,
        loss='0-1_loss',
        random_seed=41)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)