In [1]:
!pip install scikit-learn



In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [3]:
apps_data = pd.read_csv('filtered_apps.csv')
users_data = pd.read_csv('filtered_users.csv')
jobs_data = pd.read_csv('filtered_jobs.csv')

In [4]:
users.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,554,1,Train,Altamonte Springs,FL,US,32701,Bachelor's,Legal Studies,2006-12-01 00:00:00,2,4.0,No,No,0
1,769,1,Test,Roselle,IL,US,60172,Bachelor's,Radio-Television,2011-05-01 00:00:00,5,5.0,Yes,No,0
2,1697,1,Train,Justice,IL,US,60458,High School,Not Applicable,2007-01-01 00:00:00,4,6.0,,No,0
3,2652,1,Test,Glendale,CA,US,91204,High School,Not Applicable,1987-01-01 00:00:00,3,21.0,No,Yes,3
4,4359,1,Train,Valrico,FL,US,33594,High School,Not Applicable,2010-01-01 00:00:00,3,2.0,Yes,No,0


### Job Recommendation based on item-item

In [None]:
apps = apps_data.copy()
users = users_data.copy()
jobs = jobs_data.copy()

In [6]:

# First, generate dictionaries for mapping old id to new id for users and movies
unique_JobID = apps['JobID'].unique()
unique_UserID = apps['UserID'].unique()
j = 0
user_old2new_id_dict = dict()
user_new2old_id_dict = dict()
for u in unique_UserID:
    user_old2new_id_dict[u] = j
    user_new2old_id_dict[j] = u
    j += 1
j = 0
job_old2new_id_dict = dict()
job_new2old_id_dict = dict()
for i in unique_JobID:
    job_old2new_id_dict[i] = j
    job_new2old_id_dict[j] = i
    j += 1


# Then, use the generated dictionaries to reindex UserID and JobID in the data_df
user_list = apps['UserID'].values
job_list = apps['JobID'].values
for j in range(len(apps)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    job_list[j] = job_old2new_id_dict[job_list[j]]
apps['UserID'] = user_list
apps['JobID'] = job_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(apps)) <= 0.7
train_df = apps[train_index]
test_df = apps[~train_index]

In [7]:
train_df['Applied?'] = 1
test_df['Applied?'] = 1

from scipy.sparse import coo_matrix

# generate train_mat and test_mat
num_users = len(apps['UserID'].unique())
num_jobs = len(apps['JobID'].unique())

train_mat = coo_matrix((train_df['Applied?'].values, (train_df['UserID'].values, train_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()
test_mat = coo_matrix((test_df['Applied?'].values, (test_df['UserID'].values, test_df['JobID'].values)), shape=(num_users, num_jobs)).astype(float).toarray()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Applied?'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Applied?'] = 1


In [9]:
print(train_mat)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


We added the title to the job description column of the data to run the tfidf on it.

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
itemitem_cosine_similarities = cosine_similarity(train_mat.transpose())
print(itemitem_cosine_similarities)

[[1.         0.64285714 0.43643578 ... 0.         0.         0.        ]
 [0.64285714 1.         0.32732684 ... 0.         0.         0.        ]
 [0.43643578 0.32732684 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [37]:
train_mat.shape[1]

6971

In [26]:

itemitem = np.zeros((train_mat.shape[0], train_mat.shape[1]))
valval = np.zeros((train_mat.shape[0], train_mat.shape[1]))

for i in range(0, itemitem.shape[0]):
  for j in range(0, itemitem.shape[1]):
    itemitem[i][j] = -1

twenty_neighbours = np.argsort(itemitem_cosine_similarities, axis = 0)[::-1][1:101]

mean = np.average(train_mat)
b_u = np.average(train_mat, axis = 1)
b_i = np.average(train_mat, axis = 0)

for user in range(train_mat.shape[0]):
  remaining_items = np.where(train_mat[user] == 0)[0]
  remaining_items_rating_dict = dict()
  for item in remaining_items:
    b_ui = b_u[user] + b_i[item] - mean
    num = 0
    den = 0
    for n in twenty_neighbours[:, item]:
      num = num + (itemitem_cosine_similarities[item][n] * (train_mat[user][n] - b_ui))
      den = den + itemitem_cosine_similarities[item][n]
    remaining_items_rating_dict[item] = (num/den) + b_ui
  remaining_items_rating_dict = dict(sorted(remaining_items_rating_dict.items(), key = lambda k: k[1], reverse = True))

  i = 0
  for item in remaining_items_rating_dict:
    itemitem[user][i] = item
    valval[user][i] = remaining_items_rating_dict[item]
    i = i + 1
    if i == train_mat.shape[1]:
      break

  remaining_items_rating_dict[item] = (num/den) + b_ui


In [27]:
print(remaining_items_rating_dict)

{177: 8.673617379884035e-18, 1060: 6.938893903907228e-18, 1107: 6.071532165918825e-18, 252: 5.204170427930421e-18, 891: 4.7704895589362195e-18, 3630: 4.7704895589362195e-18, 3617: 4.336808689942018e-18, 3616: 3.903127820947816e-18, 3537: 3.469446951953614e-18, 3842: 3.469446951953614e-18, 3546: 3.0357660829594124e-18, 3691: 3.0357660829594124e-18, 3734: 3.0357660829594124e-18, 3751: 3.0357660829594124e-18, 3782: 3.0357660829594124e-18, 5295: 2.8189256484623115e-18, 3503: 2.6020852139652106e-18, 3504: 2.6020852139652106e-18, 3733: 2.6020852139652106e-18, 3737: 2.6020852139652106e-18, 3746: 2.6020852139652106e-18, 3498: 2.168404344971009e-18, 3629: 2.168404344971009e-18, 3646: 2.168404344971009e-18, 3741: 2.168404344971009e-18, 3783: 2.168404344971009e-18, 3846: 2.168404344971009e-18, 3799: 1.951563910473908e-18, 3930: 1.951563910473908e-18, 3535: 1.734723475976807e-18, 3544: 1.734723475976807e-18, 3549: 1.734723475976807e-18, 3554: 1.734723475976807e-18, 3788: 1.734723475976807e-18, 379

In [28]:
print(itemitem.shape)

(6223, 6971)


In [29]:
print(valval)

[[4.31764696e-01 4.17338688e-01 4.11126873e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.76800237e-01 1.67874544e-01 1.65090794e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.10069540e-01 1.99259287e-01 1.98216879e-01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [8.67361738e-18 8.23993651e-18 6.93889390e-18 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [6.07153217e-18 6.07153217e-18 6.07153217e-18 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.67361738e-18 6.93889390e-18 6.07153217e-18 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [30]:
predicted = np.zeros((train_mat.shape[0], train_mat.shape[1]))
for x in range(0, itemitem.shape[0]):
  for y in range(0, itemitem.shape[1]):
    if itemitem[x][y]!= 0 and valval[x][y] != np.NaN and valval[x][y] > 0.01:
      predicted[x][int(itemitem[x][y])] = valval[x][y]
print(predicted)
     

[[0.         0.38143858 0.36908104 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [31]:
import math
cnt = 0
sum = 0
for x in range(0, test_mat.shape[0]):
  for y in range(0, test_mat.shape[1]):
    if test_mat[x][y] != 0 and test_mat[x][y] != np.nan:
      a1 = float(test_mat[x][y])
      a2 = predicted[x][y]
      sum = sum + (a1-a2)*(a1-a2)
      #print(sum)
      if a1 == np.nan or a2 == np.nan or sum == np.nan:
        print(x, y)
      cnt = cnt + 1
    if cnt == 10:
      break
print(math.sqrt(sum/cnt))

0.7222767236001382


In [32]:
def find_best_recommendation_for_user(user_id):
    arr = predicted[user_id]
    indexed_arr = list(enumerate(arr))

    # Sort the array of tuples based on probabilities in descending order
    sorted_arr = sorted(indexed_arr, key=lambda x: x[1], reverse=True)

    # Extract the indices of the top 10 probabilities
    top_10_indices = [job_new2old_id_dict[index] for index, _ in sorted_arr[:10]]

    best_jobs = jobs[jobs['JobID'].isin(top_10_indices)]

    print("Top 10 jobs for the user are: \n")
    
    for _, job in best_jobs.iterrows():
        print(job['Title'])

In [35]:
find_best_recommendation_for_user(90)

Top 10 jobs for the user are: 

Office Assistant
Customer Service Representative
Customer Service
Customer Service Representative
Office assistant/Receptionist
Contact Center Agent
Customer Service
Administrative Assistant
Accounts Receivable
Receptionist/Customer Service


### Job Search based on items


We will be applying TF-IDF on the description column of the dataset.

In [None]:
jobs['Title'] = jobs['Title'].fillna('')
jobs['DescCleaned'] = jobs['DescCleaned'].fillna('')
jobs['DescCleaned'] = jobs['Title'] + ' ' + jobs['DescCleaned']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(jobs['Description'])

In [16]:
tfidf_matrix

<6971x332029 sparse matrix of type '<class 'numpy.float64'>'
	with 1328457 stored elements in Compressed Sparse Row format>

In [17]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [18]:
cosine_sim[0]

array([1.        , 0.0652381 , 0.00246659, ..., 0.00964788, 0.01140206,
       0.04338644])

In [19]:
jobs_US_base_line = jobs.reset_index()
titles = jobs_US_base_line['Title']
indices = pd.Series(jobs.index, index=jobs['Title'])

In [20]:
def get_recommendations(query):
    # Calculate the TF-IDF vector representation of the query
    query_tfidf = tf.transform([query])

    cosine_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()

    top_similar_indices = cosine_similarities.argsort()[::-1]

    # Get the top N closest jobs
    N = 10 
    top_N_jobs = top_similar_indices[:N]

    # Printing those top 10 job's cleaned description
    for idx in top_N_jobs:
        print(jobs.iloc[idx]['DescCleaned'])
        print()


In [22]:
get_recommendations('Customer Service Rep')

Customer Service Customer Service heavy duty truck parts company looking for customer service rep.  

Customer Service Rep Customer Service Rep spring into a fantastic career opportunity! | customer service rep | we are a firm located in a popular cincinnati neighborhood, making it difficult to leave even after the day is done! we feel that welcoming a new customer service rep through our doors will help complete our growing company! experience with customer service will be beneficial for you as our new customer service rep. if you ve always wanted a career helping others, you ll fit right in! we offer a competitive benefits package, even for your family! holidays, vacation, great salary, and more! we're looking to begin interviewing immediately, so apply now and become our newest customer service rep!

Customer Service/Sales Customer Service/Salescustomer service rep needed w/citizens & flood experience. bilingual eng/span a must. source - miami herald

Customer Service Representative

In [23]:
get_recommendations('Immediate Opening')

HR Assistant HR Assistantimmediate opening for a human resource assistant

Administrative Assistant Administrative Assistantimmediate opening for an administrative assistant. temp-hire administrative position with a large distribution center.

Medical Receptionst Medical Receptionstimmediate opening! a successful, busy, baltimore county medical practice has an immediate opening for a medical receptionist you will also be a front desk receptionist. previous experience in the medical field is a prerequisite. coding, charge posting, payment and co-payment collection. accounts receivable experience preferred. working knowledge of insurances, computerized registration, billing and collection procedures. full-time position.

Call Center Customer Service Call Center Customer Servicefast paced company in arlington is seeking an ambitious and outgoing call center representative for an immediate opening. this is a full-time temporary position lasting up to a month and is metro accessible.   

Li

In [24]:
get_recommendations('P/T HUMAN RESOURCES ASSISTANT')

Human Resources Assistant Human Resources Assistantfirm is seeking a human resources assistant with exceptional skills to join their team. human resources assistant must possess excellent verbal and written communication skills.

Human Resources Assistant / Jr Generalist Human Resources Assistant / Jr Generalist human resources assistant / jr. generalist growing trading and technology firm is looking for motivated candidates for an human resources assistant / jr generalist to join our new york based team . this person will report to the head of human resources and will provide support to the hr director and the hr team in multiple areas (benefits, payroll, new hire orientation, training, etc.). you will help to manage and update employee information in a centralized hr database. employment verifications, creating employee files, and answering general questions from employees. collegial atmosphere, great place to learn and be part of a team. jump start your career in human resources wit

In [25]:
get_recommendations('TRACTOR TRAILER DRIVER')

Office Administration Part Time - Philadelphia Office Administration Part Time - Philadelphiathe office administration position will require daily interactions with dock workers, tractor trailer drivers, delivery drivers, and customers. strong communication and organizational skills are a must. 1-2 years experience in transportation industry is preferred. this position is for second shift operating between 6 00pm and 2 00am. must have own transportation. job responsibilities receiving and responding to customers regarding international and domestic shipments preparing customs documentation communication of arrival information to customers basic data entry filing of paperwork preparing paperwork for departing loads proactively communicates with customers and resolves issues receive and enter shipments tendered through fax, e-mail, and edi in timely manner  

Administrative Management Specialist Administrative Management Specialist administrative management specialist truckway leasing an