In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [7]:
# apps = pd.read_csv('../apps.tsv', delimiter='\t',encoding='utf-8')
# user_history = pd.read_csv('../user_history.tsv', delimiter='\t',encoding='utf-8')
# jobs = pd.read_csv('../input_data/jobs.tsv', delimiter='\t',encoding='utf-8', on_bad_lines="skip")
# users = pd.read_csv('../input_data/users.tsv' ,delimiter='\t',encoding='utf-8')

apps = pd.read_csv('apps.csv')
jobs = pd.read_csv('jobs.csv')
users = pd.read_csv('users.csv')
user_history = pd.read_csv('work_history.csv')

In [9]:
train_user = users[users.Split=="Train"]
test_user = users[users.Split=="Test"]

In [10]:
import joblib
from tensorflow.keras.models import load_model

job_vectorizer = joblib.load('../Backend Algos/tfidf_data/job_description_tfidf.pkl')
jobdesc_tfidf_matrix = joblib.load('../Backend Algos/tfidf_data/job_description_tfidf_matrix.pkl')

work_history_vectorizer = joblib.load('../Backend Algos/tfidf_data/work_history_tfidf.pkl')

# For USER USER Matching
user_tfidf_matrix = joblib.load('../Backend Algos/tfidf_data/user_tfidf_matrix.pkl')
user_tfidf_vectorizer = joblib.load('../Backend Algos/tfidf_data/user_tfidf_vectorizer.pkl')

model = load_model('../Backend Algos/tfidf_data/keras_model.h5')



In [11]:
jobdesc_tfidf_matrix

<115684x100 sparse matrix of type '<class 'numpy.float64'>'
	with 3494490 stored elements in Compressed Sparse Row format>

In [12]:
jobs.shape

(115684, 11)

In [79]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
def getTopJobs(userIds):
    # print(userIds)
    jobsSet = set()
    for user in userIds:
        jobsAppliedTo = apps[apps['UserID']==user]
        jobsAppliedTo = jobsAppliedTo['JobID'].values
        for job in jobsAppliedTo:
            jobsSet.add(job)
            if (len(jobsSet) > 100):
                break
        if (len(jobsSet) > 100):
                break
    
    return jobsSet

In [17]:

def buildXtrain(jobSet, userProfile, past_work_ex, city, state):
    # Firstly, we create the user_profile to training_data form
    user_feature = np.array(userProfile)
    work_ex_transform = work_history_vectorizer.transform([past_work_ex])
    user_feature = np.concatenate((user_feature, work_ex_transform.toarray()[0]))

    X = np.zeros((1,158))

    # Then, we add the job_transform_tfidf to the user
    for jobId in jobSet:
        jobInfo = jobs[jobs['JobID'] == jobId]
        idx = jobInfo.index.values[0]
        # print(jobdesc_tfidf_matrix.shape)
        feature = np.concatenate((user_feature, jobdesc_tfidf_matrix[idx, :].toarray()[0]))

        if jobInfo['City'].values[0] == city:
            feature = np.append(feature, [1])
        else:
            feature = np.append(feature, [0])

        if jobInfo['State'].values[0] == state:
            feature = np.append(feature, [1])
        else:
            feature = np.append(feature, [0])

        feature = feature.reshape(1,158)
        X = np.concatenate((X, feature), axis=0)

    # Finally, we rank all of them based on the probabilities of model prediction
    return X


In [42]:
def getPredictions(input_data, N):
    input_data['currentlyEmployed'] = 1 if input_data['currentlyEmployed'] == 'Yes' else 0
    input_data['managedOthers'] = 1 if input_data['managedOthers'] == 'Yes' else 0
    degree_mapping = {
        'None': 0,
        'High School': 1,
        'Vocational': 2,
        'Associate\'s': 3,
        'Bachelor\'s': 4,
        'Master\'s': 5,
        'PhD': 6
    }
    degree = input_data['degree']
    input_data['degree'] = degree_mapping.get(input_data['degree'], 0)  

    input_data_list = [input_data[field] for field in ['degree', 'workHistoryCount', 
                                                        'yearsOfExp', 'currentlyEmployed',
                                                        'managedOthers', 'managedHowMany']]
    
    input_data_tf_idf_degree = degree + ' ' + input_data['major'] + ' ' + str(input_data['yearsOfExp'])
    input_data_transformed = user_tfidf_vectorizer.transform([input_data_tf_idf_degree])

    cosine_similarities = cosine_similarity(input_data_transformed, user_tfidf_matrix)
    top_similar_users_indices = cosine_similarities.flatten().argsort()[::-1][:10]
    most_similar_user = users.iloc[top_similar_users_indices]

    # Get the top 100 jobs that similar users have applied in
    top_jobs = getTopJobs(most_similar_user['UserID'].values)

    # Now, re-rank the above 100 jobs and recommend the Top 20
    Xtrain = buildXtrain(top_jobs, input_data_list, input_data['workHistory'], input_data['city'], input_data['state'])

    prediction = model.predict(Xtrain, verbose=0)

    top_jobs_list = list(top_jobs)
    job_predictions = {}

    # Iterate over each job ID in top_jobs and corresponding prediction value
    for i, job_id in enumerate(top_jobs_list):
        prediction_value = prediction[i][0]
        job_predictions[job_id] = prediction_value
    
    sorted_job_predictions = dict(sorted(job_predictions.items(), key=lambda item: item[1], reverse=True))

    recommended_jobs = []
    for job in sorted_job_predictions:
        recommended_jobs.append(job)
        if (len(recommended_jobs) >= N):
            break
    
    return recommended_jobs

In [52]:
hit = []

for _, user in tqdm(test_user.iterrows(), total=len(test_user)):
    input_data = dict()
    input_data['major'] = user['Major']
    input_data['degree'] = user['DegreeType']

    input_data['workHistoryCount'] = user['WorkHistoryCount']
    input_data['managedHowMany'] = user['ManagedHowMany']
    input_data['yearsOfExp'] = user['TotalYearsExperience']

    input_data['currentlyEmployed'] = user['CurrentlyEmployed']
    input_data['managedOthers'] = user['ManagedOthers']
    input_data['city'] = user['City']
    input_data['state'] = user['State']

    id = user['UserID']
    # print(id)
    work_hist = user_history[user_history['UserID'] == id]
    work_hist_concat = ""

    for hist in work_hist.iterrows():
        work_hist_concat += (hist[1]['JobTitle']) + " "

    input_data['workHistory'] = work_hist_concat

    predictions = getPredictions(input_data, 20)

    found_applied = False
    for job in predictions:
        applied_to = apps[(apps['JobID'] == job) & (apps['UserID'] == id)]
        if (len(applied_to) > 0):
            found_applied = True
            break
    
    if found_applied:
        hit.append(1)
    else:
        hit.append(0)

100%|██████████| 552/552 [00:27<00:00, 20.24it/s]


In [53]:
np.mean(hit)

0.5797101449275363