In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [3]:
apps = pd.read_csv('../input_data/apps.tsv', delimiter='\t',encoding='utf-8')
user_history = pd.read_csv('../input_data/user_history.tsv', delimiter='\t',encoding='utf-8')
jobs = pd.read_csv('../input_data/jobs.tsv', delimiter='\t',encoding='utf-8', on_bad_lines="skip")
users = pd.read_csv('../input_data/users.tsv' ,delimiter='\t',encoding='utf-8')

In [13]:
train_user = users[users.Split=="Train"]
test_user = users[users.Split=="Test"]

In [80]:
import joblib
from tensorflow.keras.models import load_model

job_vectorizer = joblib.load('../Backend Algos/tfidf_data/job_description_tfidf.pkl')
jobdesc_tfidf_matrix = joblib.load('../Backend Algos/tfidf_data/job_description_tfidf_matrix.pkl')


work_history_vectorizer = joblib.load('../Backend Algos/tfidf_data/work_history_tfidf.pkl')

# For USER USER CF
user_tfidf_matrix = joblib.load('../Backend Algos/tfidf_data/user_tfidf_matrix.pkl')
user_tfidf_vectorizer = joblib.load('../Backend Algos/tfidf_data/user_tfidf_vectorizer.pkl')

model = load_model('../Backend Algos/tfidf_data/keras_model.h5')
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [79]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [81]:
def getTopJobs(userIds):
    # print(userIds)
    jobsSet = set()
    for user in userIds:
        jobsAppliedTo = apps[apps['UserID']==user]
        jobsAppliedTo = jobsAppliedTo['JobID'].values
        for job in jobsAppliedTo:
            jobsSet.add(job)
            if (len(jobsSet) > 100):
                break
        if (len(jobsSet) > 100):
                break
    
    return jobsSet

In [82]:

def buildXtrain(jobSet, userProfile, past_work_ex, city, state):
    # Firstly, we create the user_profile to training_data form
    user_feature = np.array(userProfile)
    work_ex_transform = work_history_vectorizer.transform([past_work_ex])
    user_feature = np.concatenate((user_feature, work_ex_transform.toarray()[0]))

    X = np.zeros((1,158))

    # Then, we add the job_transform_tfidf to the user
    for jobId in jobSet:
        jobInfo = jobs[jobs['JobID'] == jobId]
        idx = jobInfo.index.values[0]
        print(jobdesc_tfidf_matrix.shape)
        feature = np.concatenate((user_feature, jobdesc_tfidf_matrix[idx, :].toarray()[0]))

        if jobInfo['City'].values[0] == city:
            feature = np.append(feature, [1])
        else:
            feature = np.append(feature, [0])

        if jobInfo['State'].values[0] == state:
            feature = np.append(feature, [1])
        else:
            feature = np.append(feature, [0])

        feature = feature.reshape(1,158)
        X = np.concatenate((X, feature), axis=0)

    # Finally, we rank all of them based on the probabilities of model prediction
    return X


In [73]:
def getPredictions(input_data, N):
    input_data['currentlyEmployed'] = 1 if input_data['currentlyEmployed'] == 'Yes' else 0
    input_data['managedOthers'] = 1 if input_data['managedOthers'] == 'Yes' else 0
    degree_mapping = {
        'None': 0,
        'High School': 1,
        'Vocational': 2,
        'Associate\'s': 3,
        'Bachelor\'s': 4,
        'Master\'s': 5,
        'PhD': 6
    }
    degree = input_data['degree']
    input_data['degree'] = degree_mapping.get(input_data['degree'], 0)  

    input_data_list = [input_data[field] for field in ['degree', 'workHistoryCount', 
                                                        'yearsOfExp', 'currentlyEmployed',
                                                        'managedOthers', 'managedHowMany']]
    
    input_data_tf_idf_degree = degree + ' ' + input_data['major'] + ' ' + str(input_data['yearsOfExp'])
    input_data_transformed = user_tfidf_vectorizer.transform([input_data_tf_idf_degree])

    cosine_similarities = cosine_similarity(input_data_transformed, user_tfidf_matrix)
    top_similar_users_indices = cosine_similarities.flatten().argsort()[::-1][:10]
    most_similar_user = users.iloc[top_similar_users_indices]

    # Get the top 100 jobs that similar users have applied in
    top_jobs = getTopJobs(most_similar_user['UserID'].values)

    # Now, re-rank the above 100 jobs and recommend the Top 20
    Xtrain = buildXtrain(top_jobs, input_data_list, input_data['workHistory'], input_data['city'], input_data['state'])

    prediction = model.predict(Xtrain)

    top_jobs_list = list(top_jobs)
    job_predictions = {}

    # Iterate over each job ID in top_jobs and corresponding prediction value
    for i, job_id in enumerate(top_jobs_list):
        prediction_value = prediction[i][0]
        job_predictions[job_id] = prediction_value
    
    sorted_job_predictions = dict(sorted(job_predictions.items(), key=lambda item: item[1], reverse=True))
    print(sorted_job_predictions)

    return sorted_job_predictions

In [74]:
for user in test_user.iterrows():
    print(user[1]['State'])
    input_data = dict()
    input_data['major'] = user[1]['Major']
    input_data['degree'] = user[1]['DegreeType']

    input_data['workHistoryCount'] = user[1]['WorkHistoryCount']
    input_data['managedHowMany'] = user[1]['ManagedHowMany']
    input_data['yearsOfExp'] = user[1]['TotalYearsExperience']

    input_data['currentlyEmployed'] = user[1]['CurrentlyEmployed']
    input_data['managedOthers'] = user[1]['ManagedOthers']
    input_data['city'] = user[1]['City']
    input_data['state'] = user[1]['State']

    id = user[1]['UserID']
    # print(id)
    work_hist = user_history[user_history['UserID'] == id]
    work_hist_concat = ""

    for hist in work_hist.iterrows():
        work_hist_concat += (hist[1]['JobTitle']) + " "

    input_data['workHistory'] = work_hist_concat

    predictions = getPredictions(input_data, 10)


    break

CA
(115684, 100)
(115684, 100)


IndexError: row index (148648) out of range

In [32]:
user_history

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
0,47,1,Train,1,National Space Communication Programs-Special ...
1,47,1,Train,2,Detention Officer
2,47,1,Train,3,"Passenger Screener, TSA"
3,72,1,Train,1,"Lecturer, Department of Anthropology"
4,72,1,Train,2,Student Assistant
...,...,...,...,...,...
1753896,1472060,7,Train,2,Sales Associate
1753897,1472060,7,Train,3,Sales Associate
1753898,1472089,7,Train,1,Founder and President-Children's Entertainment...
1753899,1472089,7,Train,2,Specification Sales


In [26]:
test_user

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
31,767,1,Test,Murrieta,CA,US,92562,Bachelor's,University Studies/Business,2008-05-01 00:00:00,5,16.0,No,No,0
32,769,1,Test,Roselle,IL,US,60172,Bachelor's,Radio-Television,2011-05-01 00:00:00,5,5.0,Yes,No,0
33,861,1,Test,Morris,IL,US,60450,High School,General Studies,1989-05-01 00:00:00,7,21.0,,No,0
38,1006,1,Test,West Chester,PA,US,19382,High School,Not Applicable,2008-06-01 00:00:00,3,6.0,Yes,No,0
44,1192,1,Test,Cincinnati,OH,US,45255,Bachelor's,Marketing,,5,6.0,Yes,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389556,1467350,7,Test,Derry,NH,US,03038,Associate's,Business Administration,2009-01-01 00:00:00,4,6.0,Yes,No,0
389564,1467535,7,Test,Farmers Branch,TX,US,75244,Bachelor's,,,10,30.0,Yes,No,0
389567,1467586,7,Test,Carrollton,TX,US,75010,Bachelor's,International Studies,2012-05-01 00:00:00,6,14.0,Yes,No,0
389650,1470538,7,Test,Richmond,VA,US,23173,,,,0,,No,Yes,1
