In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [21]:
# Load the raw tab-separated tables; every file shares the same delimiter and encoding.
_tsv_opts = {"delimiter": "\t", "encoding": "utf-8"}
apps = pd.read_csv('../input_data/apps.tsv', **_tsv_opts)
user_history = pd.read_csv('../input_data/user_history.tsv', **_tsv_opts)
# jobs.tsv is the only file read with on_bad_lines="skip": malformed rows are dropped instead of raising.
jobs = pd.read_csv('../input_data/jobs.tsv', on_bad_lines="skip", **_tsv_opts)
users = pd.read_csv('../input_data/users.tsv', **_tsv_opts)
test_users = pd.read_csv('../input_data/test_users.tsv', **_tsv_opts)

**Window = 6**

In this project we use only a subset of the original dataset because of its size: we restrict ourselves to the data from a single window (here, window 6).

In [22]:
# Preview the work-history rows that belong to window 6.
user_history.loc[user_history["WindowID"] == 6]

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
1337041,13,6,Test,1,Pennsylvania Mentor
1337042,13,6,Test,2,Student Worker
1337043,13,6,Test,3,Internship in Adoption Unit
1337044,13,6,Test,4,
1337045,13,6,Test,5,Student Worker - Continuing Education
...,...,...,...,...,...
1530889,1472087,6,Train,3,GloBull Ambassador
1530890,1472087,6,Train,4,Research Assistant for Head Start Study
1530891,1472087,6,Train,5,Volunteer
1530892,1472087,6,Train,6,Customer Service Associate


In [23]:
# Column / null-count summary of the job postings in window 6.
jobs.loc[jobs["WindowID"] == 6].info()

<class 'pandas.core.frame.DataFrame'>
Index: 115998 entries, 861371 to 977368
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   JobID         115998 non-null  int64 
 1   WindowID      115998 non-null  int64 
 2   Title         115996 non-null  object
 3   Description   115997 non-null  object
 4   Requirements  115923 non-null  object
 5   City          115998 non-null  object
 6   State         115998 non-null  object
 7   Country       115998 non-null  object
 8   Zip5          71528 non-null   object
 9   StartDate     115998 non-null  object
 10  EndDate       115998 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.6+ MB


In [24]:
# Column / null-count summary of the users in window 6.
users.loc[users["WindowID"] == 6].info()

<class 'pandas.core.frame.DataFrame'>
Index: 43334 entries, 296639 to 339972
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                43334 non-null  int64  
 1   WindowID              43334 non-null  int64  
 2   Split                 43334 non-null  object 
 3   City                  43334 non-null  object 
 4   State                 43276 non-null  object 
 5   Country               43334 non-null  object 
 6   ZipCode               43142 non-null  object 
 7   DegreeType            32034 non-null  object 
 8   Major                 32428 non-null  object 
 9   GraduationDate        29703 non-null  object 
 10  WorkHistoryCount      43334 non-null  int64  
 11  TotalYearsExperience  41733 non-null  float64
 12  CurrentlyEmployed     40653 non-null  object 
 13  ManagedOthers         43334 non-null  object 
 14  ManagedHowMany        43334 non-null  int64  
dtypes: float64(1), int

In [25]:
# Window-6 US users, keeping only rows where Major, TotalYearsExperience
# and CurrentlyEmployed are all present.
user_set = (
    users.loc[(users["WindowID"] == 6) & (users["Country"] == "US")]
    .dropna(axis=0, subset=["Major", "TotalYearsExperience", "CurrentlyEmployed"])
)
# info() prints its report and returns None, so the displayed tuple ends with None.
user_set["Split"].value_counts(), user_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30006 entries, 296639 to 339971
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                30006 non-null  int64  
 1   WindowID              30006 non-null  int64  
 2   Split                 30006 non-null  object 
 3   City                  30006 non-null  object 
 4   State                 30006 non-null  object 
 5   Country               30006 non-null  object 
 6   ZipCode               29960 non-null  object 
 7   DegreeType            26006 non-null  object 
 8   Major                 30006 non-null  object 
 9   GraduationDate        23379 non-null  object 
 10  WorkHistoryCount      30006 non-null  int64  
 11  TotalYearsExperience  30006 non-null  float64
 12  CurrentlyEmployed     30006 non-null  object 
 13  ManagedOthers         30006 non-null  object 
 14  ManagedHowMany        30006 non-null  int64  
dtypes: float64(1), int

(Split
 Train    28864
 Test      1142
 Name: count, dtype: int64,
 None)

As shown in the preprocess.ipynb file, the number of US applications is very high, so we will only consider US applications.<br>
Furthermore, we will drop all users who do not have a user_history attached to them.

In [27]:
# Window-6 US job postings that have a Description, Requirements and Title.
job_set = (
    jobs.loc[(jobs["WindowID"] == 6) & (jobs["Country"] == "US")]
    .dropna(axis=0, subset=["Description", "Requirements", "Title"])
)
job_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115684 entries, 861371 to 977368
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   JobID         115684 non-null  int64 
 1   WindowID      115684 non-null  int64 
 2   Title         115684 non-null  object
 3   Description   115684 non-null  object
 4   Requirements  115684 non-null  object
 5   City          115684 non-null  object
 6   State         115684 non-null  object
 7   Country       115684 non-null  object
 8   Zip5          71502 non-null   object
 9   StartDate     115684 non-null  object
 10  EndDate       115684 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.6+ MB


In [28]:
# IDs of the users and jobs retained in user_set / job_set.
user_id = user_set.UserID.unique().tolist()
job_id = job_set.JobID.unique().tolist()

# Work history of the retained users, without entries whose JobTitle is missing.
# dropna() is chained so a new frame is returned; calling dropna(inplace=True) on the
# boolean-indexed slice can trigger SettingWithCopyWarning (silenced in this notebook
# by warnings.filterwarnings("ignore")).
work_history = user_history[user_history.UserID.isin(user_id)].dropna(axis=0, subset=["JobTitle"])

# Applications for which both the user and the job are retained.
application_record = apps[(apps.UserID.isin(user_id)) & (apps.JobID.isin(job_id))]

In [29]:
# Users that appear in the work history and users that appear in the applications.
work_user_id = work_history["UserID"].unique()
application_user_id = application_record["UserID"].unique()
# Keep only users present in BOTH sets.
user_set = user_set.loc[
    user_set["UserID"].isin(work_user_id)
    & user_set["UserID"].isin(application_user_id)
]
user_id = user_set["UserID"].unique()
# Trim the dependent tables down to the surviving users.
application_record = application_record.loc[application_record["UserID"].isin(user_id)]
work_history = work_history.loc[work_history["UserID"].isin(user_id)]

In [41]:
# Persist the filtered tables as CSV files in the working directory (row index omitted).
for _path, _frame in (
    ("users.csv", user_set),
    ("application_record.csv", application_record),
    ("work_history.csv", work_history),
    ("jobs.csv", job_set),
):
    _frame.to_csv(_path, index=False)

**Negative-sampling**

In [35]:
# Empty result frame plus the accumulators filled by the negative-sampling loop.
final_apps = pd.DataFrame(columns = ["UserID","JobID","label"])
job_id = job_set.JobID.unique().tolist()
groups = application_record.groupby("UserID")
user_ids = []
job_ids = []
labels = []

print(groups)

# Peek at the first (UserID, applications) group only, then stop.
# The loop variable is `uid` rather than `id` so the builtin id() is not shadowed
# (a module-level loop variable stays bound after the loop ends).
for uid, group in tqdm(groups):
    print(group)
    print()
    print()
    print()
    print(uid)
    break

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x3c850bee0>


  0%|          | 0/25511 [00:00<?, ?it/s]

         UserID  WindowID Split          ApplicationDate   JobID
1247132      13         6  Test  2012-06-19 15:36:38.583  821691



13





In [36]:
# Negative sampling: for every user, keep each distinct applied job as a positive
# (label 1) and draw an equal number of random non-applied jobs as negatives (label 0).
NEGATIVE_SAMPLING_SEED = 42
rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)  # fixed seed -> reproducible negatives

# Start from empty accumulators so re-running this cell does not append duplicates.
user_ids = []
job_ids = []
labels = []

for idx, group in tqdm(groups):
    exist_job = group.JobID.unique().tolist()
    # Count distinct jobs rather than rows: if a user applied to the same job more than
    # once, len(group) would exceed len(exist_job) and the three lists would go out of step.
    size = len(exist_job)
    applied = set(exist_job)  # set membership instead of scanning a list for every job
    candidate_job = [i for i in job_id if i not in applied]
    # Indices are drawn with replacement, so a negative job may repeat for one user.
    sample_job = rng.integers(0, len(candidate_job), size)
    user_ids.extend([idx] * 2 * size)
    job_ids.extend(exist_job)
    job_ids.extend(candidate_job[i] for i in sample_job)
    labels.extend([1] * size)
    labels.extend([0] * size)

# Build the frame in one step from the three equally long lists.
final_apps = pd.DataFrame({"UserID": user_ids, "JobID": job_ids, "label": labels})

100%|██████████| 25511/25511 [02:30<00:00, 169.63it/s]


In [38]:
# Positive (1) and negative (0) label counts in the sampled set.
final_apps["label"].value_counts()

label
1    120436
0    120436
Name: count, dtype: int64

In [39]:
# Write the labelled (UserID, JobID, label) pairs to apps.csv without the row index.
final_apps.to_csv("apps.csv",index=False)

**Cleaning Jobs Dataframe**

In [42]:
# Reload the filtered job table that an earlier cell saved as jobs.csv.
# NOTE: this rebinds `jobs`, which until now held the raw table read from jobs.tsv.
jobs = pd.read_csv("jobs.csv")

In [43]:
import re

# Patterns used to normalise the free-text job fields.
removePattern = r'(<(.*?)>)|(&\w+)'       # <...> tag-like spans (single line) and the "&name" part of entities
addSpacePattern = r'([;:])|(\\r)|(\\n)'   # ';' / ':' and literal backslash-r / backslash-n sequences
removeExtraSpaces = r'(\s\s+?)(?=\S)'     # run of 2+ whitespace characters followed by a non-whitespace character


def clean_text(text):
    """Return a lower-cased, markup-stripped version of ``text``.

    The value is converted with ``str()`` first (so a missing value becomes
    the string ``"nan"``), then:

    1. matches of ``removePattern`` are deleted,
    2. matches of ``addSpacePattern`` are replaced by a single space,
    3. matches of ``removeExtraSpaces`` are collapsed to a single space
       (whitespace at the very end of the string is not matched).
    """
    text = str(text).lower()
    text = re.sub(removePattern, "", text)
    text = re.sub(addSpacePattern, " ", text)
    return re.sub(removeExtraSpaces, " ", text)


# Same cleaning for both text columns.
jobs['DescCleaned'] = jobs['Description'].map(clean_text)
jobs['ReqCleaned'] = jobs['Requirements'].map(clean_text)


In [47]:
# Drop the raw text columns; the cleaned DescCleaned / ReqCleaned columns replace them.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell re-runnable:
# a second run no longer raises KeyError because the columns are already gone.
jobs = jobs.drop(columns=['Description', 'Requirements'], errors="ignore")

In [49]:
# Write the cleaned job table to jobs.csv without the row index.
# NOTE(review): this overwrites the jobs.csv that the cleaning section reads as its input,
# so re-running that section afterwards will load a file without Description/Requirements.
jobs.to_csv("jobs.csv", index=False)