In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import  WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')


from gensim.models import Word2Vec

from tqdm import tqdm
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report

import torch
from torch.utils import data
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lokesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Load the data and perform word segmentation

In [10]:
job_set = pd.read_csv("job_set_cleaned.csv")
user_set = pd.read_csv("user_set_cleaned.csv")
dataset = pd.read_csv("dataset_cleaned.csv")
work_history = pd.read_csv("work_history_cleaned.csv")

In [11]:
punctuation=list('。，？！：%&~（）、；“”&|,.?!:%&~();""#@【】/-\'$+*`[]{}()')
stop_words = stopwords.words("english")
stop_words.extend(["n't","wo","'m","'s","'ve", "'d", "'ll", "``", "''", "--", "..."])
stop_words.extend(punctuation)
wordnet_lematizer = WordNetLemmatizer()

def pretreatment(comment):
    '''
    remove punctuations, numbers and urls
    lower case conversion
    remove stop words
    lemmatization
    '''
    
    token_words = word_tokenize(comment)
    token_words = [w.lower() for w in token_words]
    token_words = [w for w in token_words if w not in stop_words]
    token_words =  pos_tag(token_words) 
    cleaned_word = []
    for word, tag in token_words:
        if word.isdigit():
            continue
        if tag.startswith('NN'):
            word_lematizer =  wordnet_lematizer.lemmatize(word, pos='n')  # n for noun
        elif tag.startswith('VB'): 
            word_lematizer =  wordnet_lematizer.lemmatize(word, pos='v')   # v for verb
        elif tag.startswith('JJ'): 
            word_lematizer =  wordnet_lematizer.lemmatize(word, pos='a')   # a for adjective
        elif tag.startswith('R'): 
            word_lematizer =  wordnet_lematizer.lemmatize(word, pos='r')   # r for pronoun
        else: 
            word_lematizer =  wordnet_lematizer.lemmatize(word)
        cleaned_word.append(word_lematizer)
    
    return cleaned_word

In [13]:
# Fill missing values with empty strings
job_set = job_set.fillna("")

# Concatenate relevant fields
job_set["word"] = job_set["Title"] + " " + job_set["Description"] + " " + job_set["Requirements"]

# Apply pretreatment function to each row
segment = []
for content in tqdm(job_set["word"].values):
    segment.append(pretreatment(content))

# Assign the processed text to a new column
job_set["text"] = segment

# Check the first few rows to ensure everything looks correct
print(job_set.head())


  0%|                                                                                       | 0/115684 [00:00<?, ?it/s]


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Lokesh/nltk_data'
    - 'C:\\Users\\Lokesh\\anaconda3\\nltk_data'
    - 'C:\\Users\\Lokesh\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\Lokesh\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Lokesh\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
job_set.to_csv("job_set_segment.csv",index=False)

# 2. Train a Word2Vec model based on our data

In [None]:
from gensim.models import word2vec, Word2Vec

def train_word2vec(x):
    '''
    param: x is a list contain all the words
    return: the trained model
    '''
    
    model = word2vec.Word2Vec(x, size=200, window=5, min_count=2, workers=8, 
                             iter=10, sg=1)
    return model

In [None]:
# 20-30min
w2v_model = train_word2vec(job_set.text.values)
w2v_model.save('./word2vec.model')

In [None]:
w2v_model = Word2Vec.load('./word2vec.model')

In [None]:
# TF-IDF vectors for user job history
word_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

# 3. Define a class for preprocessing

In [None]:
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        '''
        param: sentences: the list of corpus
               sen_len: the max length of each sentence
               w2v_path: the path storing word emnbedding model 
        '''
        
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        
    def get_w2v_model(self):
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
        
    def add_embedding(self, word):
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
        
    def make_embedding(self, load=True):
        print("Get embedding ...")
        if load:
            print("loading word2vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        for i, word in enumerate(self.embedding.wv.vocab):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    
    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx['<PAD>'])
        assert len(sentence) == self.sen_len
        return sentence
    
    def sentence_word2idx(self):
        '''
        change words in sentences into idx in embedding_matrix
        '''
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx['<UNK>'])
            sentence_idx = self.pad_sentence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    
    def labels_to_tensor(self, y):
        return torch.LongTensor(y)

# 4. Define the dataset and the architecture of the model

In [None]:
class TextCNN(nn.Module):
    def __init__(self, channels, kernel_size, pool_size, dim, method='max'):
        super(TextCNN, self).__init__()
        self.net1 = nn.Sequential(
            nn.Conv2d(1, channels, kernel_size[0]),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.MaxPool2d(pool_size)
        )
        self.net2 = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size[1]),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.AdaptiveMaxPool2d((1, dim))
        )
        if method is 'max':
            self.pool = nn.AdaptiveMaxPool2d((1, dim))
        elif method is 'mean':
            self.pool = nn.AdaptiveAvgPool2d((1, dim))
        else:
            raise ValueError('method {} not exist'.format(method))

    def forward(self, x):
        x = self.net1(x)
        x = self.net2(x).squeeze(2)
        x = self.pool(x).squeeze(1)
        return x

In [None]:
class MLP(nn.Module):
    def __init__(self, input_size, output_size, dropout):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, input_size),
            nn.ReLU(),
            nn.Linear(input_size, output_size),
            nn.Sigmoid()
            
        )

    def forward(self, x):
        x = self.net(x)
        return x

In [None]:
class PJFNN(nn.Module):
    def __init__(self, embedding, input_dim, channels=1, dropout=0.5, fix_embedding=True):
        super(PJFNN, self).__init__()
        self.dim = embedding.size(1)
        self.user_dim = input_dim
        self.channels = channels
        self.embedding = nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False if fix_embedding else True
        # self.emb = nn.Embedding.from_pretrained(
        #    torch.from_numpy(np.load( os.path.join(args['dataset']['path'], 'emb.npy') )),
        #    freeze=False,
        #    padding_idx=0
        #)

        # self.geek_layer = TextCNN(
        #     channels=args['dataset']['max_sent_num']['geek'],
        #     kernel_size=[(5, 1), (3, 1)],
        #     pool_size=(2, 1),
        #     dim=dim,
        #     method='max'
        # )
        self.user_layer = MLP(self.user_dim , 64, dropout=dropout)
        self.linear_transform = nn.Linear(200, 64)
        self.job_layer = TextCNN(
            channels=self.channels,
            kernel_size=[(5, 1), (5, 1)],
            pool_size=(2, 1),
            dim=200,
            method='mean'
        )

        self.mlp = MLP(
            input_size=128,
            output_size=1,
            dropout=dropout
        )


    def forward(self, job, user):
        job = self.embedding(job)
        job = job.unsqueeze(1)
        job = self.job_layer(job)
        user = self.user_layer(user)
        job = self.linear_transform(job)
        x = torch.cat((user,job),dim=1)
        x = self.mlp(x).squeeze(1)
        return x

In [None]:
class PJFNN_LSTM(nn.Module):
    def __init__(self, embedding, input_dim, hidden_dim=256, num_layers=1, dropout=0., fix_embedding=True):
        super(PJFNN_LSTM, self).__init__()
        self.embedding_dim = embedding.size(1)
        self.user_dim = input_dim
        self.embedding = nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = nn.Parameter(embedding)
        self.embedding.weight.requires_grad = False if fix_embedding else True
        # self.emb = nn.Embedding.from_pretrained(
        #    torch.from_numpy(np.load( os.path.join(args['dataset']['path'], 'emb.npy') )),
        #    freeze=False,
        #    padding_idx=0
        #)

        # self.geek_layer = TextCNN(
        #     channels=args['dataset']['max_sent_num']['geek'],
        #     kernel_size=[(5, 1), (3, 1)],
        #     pool_size=(2, 1),
        #     dim=dim,
        #     method='max'
        # )
        self.user_layer = MLP(self.user_dim , 64, dropout=dropout)
        self.linear_transform = nn.Linear(256, 64)
        self.job_layer = nn.LSTM(self.embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout)

        self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(128, 1), nn.Sigmoid())


    def forward(self, job, user):
        job = self.embedding(job)
        job, _ = self.job_layer(job, None)
        job = job[:,-1,:]
        user = self.user_layer(user)
        job = self.linear_transform(job)
        x = torch.cat((user,job),dim=1)
        x = self.classifier(x).squeeze(1)
        return x

In [None]:
class JobUserDataset(data.Dataset):
    '''
    Expected data shape like:(data_num, data_len)
    '''
    def __init__(self, job, user, label):
        self.job = job
        self.user = user
        self.label = label
        
    def __getitem__(self, idx):
        if self.label is None: return self.job[idx], self.user[idx]
        return self.job[idx], self.user[idx], self.label[idx]
    
    def __len__(self):
        return len(self.job)

# 5. Build datasets

In [None]:
train_user = user_set[user_set.Split=="Train"].UserID.values
test_user = user_set[user_set.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [None]:
job_set = pd.read_csv("job_set_segment.csv")
text = []
for i in job_set.text:
    temp = i[1:-1].split(',')
    text.append([t.strip()[1:-1] for t in temp])
job_set["text"] = text

In [None]:
groups = train_data.groupby("UserID")
job_train = []
user_train = np.zeros((1,58))
Y_train = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    job_train.extend(jobs.text.values.tolist())
    user_feature = user_feature.repeat(len(jobs),axis=0)
    user_feature = np.concatenate((user_feature,group[["State","City"]].values),axis=1)
    user_train = np.concatenate((user_train,user_feature), axis=0)
    Y_train.extend(group.label.values.tolist())

In [None]:
groups = test_data.groupby("UserID")
job_test = []
user_test = np.zeros((1,58))
Y_test = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    job_test.extend(jobs.text.values.tolist())
    user_feature = user_feature.repeat(len(jobs),axis=0)
    user_feature = np.concatenate((user_feature,group[["State","City"]].values),axis=1)
    user_test = np.concatenate((user_test,user_feature), axis=0)
    Y_test.extend(group.label.values.tolist())

In [None]:
train_len = len(job_train)
job_train.extend(job_test)
Y_train.extend(Y_test)
sen_len = 200
preprocess = Preprocess(job_train, sen_len, w2v_path="word2vec.model")
embedding = preprocess.make_embedding(load=True)
x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(Y_train)
torch.save(embedding,"./embedding.pt")

In [None]:
train_x = x[:70000]
train_y = y[:70000]
val_x = x[70000:70680]
val_y = y[70000:70680]
test_x = x[70680:]
test_y = y[70680:]
train_user = torch.from_numpy(user_train[1:70001])
val_user = torch.from_numpy(user_train[70001:])
test_user = torch.from_numpy(user_test[1:])
train_dataset = JobUserDataset(train_x, train_user, train_y)
val_dataset = JobUserDataset(val_x, val_user, val_y)
test_dataset = JobUserDataset(test_x, test_user, test_y)
# torch.save(train_dataset,"train.dataset")
# torch.save(val_dataset,"val.dataset")
# torch.save(test_dataset, "test.dataset")

In [None]:
# train_dataset = torch.load("train.dataset")
# val_dataset = torch.load("val.dataset")
# test_dataset = torch.load("test.dataset")
# embedding = torch.load("embedding.pt")
batch_size = 32
train_loader = DataLoader(dataset= train_dataset, batch_size = batch_size, shuffle = False)
val_loader = DataLoader(dataset = val_dataset, batch_size = batch_size, shuffle = False)
test_loader =DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle = False)

# 6. Define functions for training and testing

In [None]:
def training(batch_size, n_epoch, lr, train, valid, model, device, model_name, model_dir="./"):
    # summary model parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print("\nstart training, total parameter:{}, trainable:{}\n".format(total, trainable))
    model.cuda()
    model.train()
    criterion = nn.BCELoss()
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    total_loss, total_acc, best_acc = 0, 0, 0
    train_losses, val_losses = [], []
    train_acc, val_acc = [], []
    pred_label = []
    y_label = []
    
    for epoch in range(n_epoch):
        start_time = time.time()
        total_loss, total_acc = 0, 0
        # training
        for i, (jobs, users, labels) in enumerate(train):
            jobs = jobs.to(device)
            users = users.to(torch.float32)
            users = users.to(device)
            labels = labels.to(device)
            labels = labels.to(torch.float32)
            model.zero_grad()
            outputs = model(jobs, users)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pred_label.extend([0 if i<0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
            y_label.extend(list(labels.cpu().detach().numpy()))
        train_losses.append(total_loss/t_batch)
        train_acc.append(accuracy_score(y_label, pred_label))
        print('[ Epoch{}: {}/{}] '.format(
                epoch+1, i+1, t_batch)) 
        print('\nTrain | Loss:{:.5f} Time:{:.6f}'.format(total_loss/t_batch, time.time()-start_time))
        
        # evaluation
        model.eval()
        with torch.no_grad():
            pred_label = []
            y_label = []
            total_loss, total_acc = 0, 0
            for i, (jobs, users, labels) in enumerate(valid):
                jobs = jobs.to(device)
                users = users.to(torch.float32)
                users = users.to(device)
                labels = labels.to(torch.float32)
                labels = labels.to(device)
                outputs = model(jobs, users)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
                pred_label.extend([0 if i<0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
                y_label.extend(list(labels.cpu().detach().numpy()))
            # print('\nVal | Loss:{:.5f} Time:{:.6f}'.format(total_loss/v_batch, time.time()-start_time))
            val_losses.append(total_loss/v_batch)
            total_acc =  accuracy_score(y_label, pred_label)
            val_acc.append(total_acc)
            print('\nVal | ACC:{:.5f} Time:{:.6f}'.format(total_acc, time.time()-start_time))
            if total_acc > best_acc:
                best_acc = total_acc
                torch.save(model, "{}/{}_ckpt.model".format(model_dir, model_name))
                print('save model with acc {:.3f}'.format(total_acc))
        print('------------------------------------------------------')
        model.train()
    return train_losses, val_losses, train_acc, val_acc

In [None]:
def test(model, loader):
    predictions = []
    y_labels = []
    pred = []
    model.eval()
    for i, (jobs, users, labels) in enumerate(loader):
        jobs = jobs.to(device)
        users = users.to(device, dtype=torch.float32)
        labels = labels.to(device, dtype=torch.float32)
        outputs = model(jobs, users)
        pred.extend(list(outputs.cpu().detach().numpy()))
        predictions.extend([0 if i<0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
        y_labels.extend(list(labels.cpu().detach().numpy()))
        report = classification_report(y_labels,predictions,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    result = pd.DataFrame()
    col_1.append("overall")
    col_2.append(precision_score(y_labels, predictions))
    col_3.append(recall_score(y_labels, predictions))
    col_4.append(f1_score(y_labels, predictions))
    col_5.append(roc_auc_score(y_labels, pred))
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
#     print(result)
    return result

# 7. Train and test the Text-CNN model

In [None]:
fix_embedding = False
input_dim = train_dataset[0][1].shape[0]
model = PJFNN(embedding, input_dim, dropout=0.7, channels=32, fix_embedding=fix_embedding)
# model 

In [None]:
epoch = 10
lr =  0.0005
model_dir = './'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_losses, val_losses, train_acc, val_acc = training(batch_size, epoch, lr, train_loader, val_loader, model, device, "textCNN",  model_dir )

In [None]:
test(torch.load("textCNN_ckpt.model"), test_loader)

# 8. TopN recommendation

In [None]:
ranking_data = pd.read_csv("ranking_data.csv")
groups = ranking_data.groupby("UserID")
job_rank = []
user_rank = np.zeros((1,58))
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    job_rank.extend(jobs.text.values.tolist())
    user_feature = user_feature.repeat(len(jobs),axis=0)
    user_feature = np.concatenate((user_feature,group[["State","City"]].values),axis=1)
    user_rank = np.concatenate((user_rank,user_feature), axis=0)


In [None]:
sen_len = 200
preprocess = Preprocess(job_rank, sen_len, w2v_path="word2vec.model")
embedding = preprocess.make_embedding(load=True)
rank_x = preprocess.sentence_word2idx()

In [None]:
rank_dataset = JobUserDataset(rank_x, user_rank, None)
rank_loader = DataLoader(dataset= rank_dataset, batch_size = 100, shuffle = False)

In [None]:
def test_hit_rate(model,N,num_user):
  hit = 0
  model.eval()
  for jobs, users in rank_loader:
    jobs = jobs.to(device)
    users = users.to(torch.float32)
    users = users.to(device)
    outputs = model(jobs, users)
    pred = outputs.cpu().detach().numpy()
    a = -np.sort(-pred)
    idx = np.argwhere(a==pred[0])[0][0]
    if idx <= N-1:
        hit += 1
  return hit/num_user

In [None]:
num_user = len(ranking_data.UserID.unique())
m = torch.load("textCNN_ckpt.model")

In [None]:
test_hit_rate(m, 1, num_user), test_hit_rate(m, 5, num_user), test_hit_rate(m, 10, num_user), test_hit_rate(m, 20, num_user)

In [None]:
(software, 47)