In [1]:
!nvidia-smi

Mon Nov 22 10:30:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
! pip install -q sentence-transformers polyglot pyicu pycld2 morfessor

[K     |████████████████████████████████| 78 kB 4.8 MB/s 
[K     |████████████████████████████████| 126 kB 32.2 MB/s 
[K     |████████████████████████████████| 299 kB 43.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 41.4 MB 1.4 MB/s 
[K     |████████████████████████████████| 3.1 MB 38.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 35.8 MB/s 
[K     |████████████████████████████████| 1.2 MB 28.0 MB/s 
[K     |████████████████████████████████| 59 kB 7.1 MB/s 
[K     |████████████████████████████████| 596 kB 43.5 MB/s 
[K     |████████████████████████████████| 895 kB 55.0 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for polyglot (setup.py) ... [?25l[?25hdone
  Building wheel for pyicu (PEP 517) ... [?25l[?25hdone
  Building wheel for pycl

In [3]:
import json, ast
import pandas as pd
import re
from sklearn.model_selection import train_test_split

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Building train data for phrase similarity

In [5]:
root_path = "/content/drive/My Drive/Leadbook/ml_challenge/"

In [None]:
with open(root_path + "departments.json", 'r') as fp:
  departments = fp.read()
departments = (departments[1:-2]+",").split("\n")

In [None]:
len(departments)

35

In [None]:
dept_dict = {}
for department in departments:
  dept_dict.update(ast.literal_eval(department)[0])

In [None]:
dept_dict.keys()

dict_keys(['Defense', 'Entertainment', 'Media & Journalism', 'Aviation, marine and shipping', 'Healthcare', 'Mechanical & Heavy Industry', 'Design', 'Art and Photography', 'Electrical and Electronics', 'Government and Agencies', 'Chemicals', 'Construction', 'Clothing,Cosmetics and Fashion', 'Business Services', 'Human Resources', 'Marketing and Advertising', 'Logistics and Transportation', 'Energy and Mining', 'Hotels and Culinary', 'Trade', 'Management', 'Information Technology', 'sports ,fitness ,leisure and Travel', 'Import Export Procurement Dealers and Distributors', 'Social Organisations and NGO', 'Consumer Services', 'Real Estate', 'Customer Services', 'Professional Services', 'Education', 'Agriculture', 'Engineering and Telecommunications', 'planning and quality', 'Financials', 'Others'])

In [None]:
len(dept_dict)

35

In [None]:
def data_preprocessing(phrase_list : list):
  phrase_list = [ re.sub('&', 'and', phrase) for phrase in phrase_list ]
  phrase_list = [ re.sub('/', 'or', phrase) for phrase in phrase_list ]
  phrase_list = [ phrase.lower() for phrase in phrase_list ]
  return phrase_list

In [None]:
department_dict = {}
for key, value_list in dept_dict.items():
  phrase_list = [value for value in value_list]
  phrase_list.append(key)
  phrase_list = data_preprocessing(phrase_list)
  department_dict[key] = phrase_list

In [None]:
len(department_dict)

35

In [None]:
with open(root_path + "departments_processed.json", 'w') as fp:
  json.dump(department_dict,fp)

In [None]:
with open(root_path + "departments_processed.json", 'r') as fp:
  department_dict = json.load(fp)

In [None]:
data = []

for key,phrase1_list in department_dict.items():
  for rep_key,phrase2_list in department_dict.items():
    score = 0.0
    if key == rep_key : score = 0.9
    else : score = 0.1
    triplet_dict = [{"phrase1" : phrase1, "phrase2": phrase2, "score": score} for phrase1 in phrase1_list for phrase2 in phrase2_list if phrase1 != phrase2]
    data.extend(triplet_dict)

In [None]:
df = pd.DataFrame(data=data,columns=["phrase1","phrase2","score"])
df.head()

Unnamed: 0,phrase1,phrase2,score
0,defence,space,0.9
1,defence,military,0.9
2,defence,defense,0.9
3,space,defence,0.9
4,space,military,0.9


In [None]:
len(df)

113204

In [None]:
df.to_csv(root_path + "train.csv",index=False)

# Training

In [None]:
df = pd.read_csv(root_path + "train.csv")
df.head()

Unnamed: 0,phrase1,phrase2,score
0,defence,space,0.9
1,defence,military,0.9
2,defence,defense,0.9
3,space,defence,0.9
4,space,military,0.9


In [None]:
train_df, valid_df = train_test_split(df, test_size=0.1, random_state=42)
len(train_df),len(valid_df)

(101883, 11321)

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader

pretrained_models = ["all-MiniLM-L6-v2",            # 99.66
                     "paraphrase-mpnet-base-v2",    # 99.70
                     "paraphrase-MiniLM-L6-v2",     # 99.67
                     "all-mpnet-base-v2",           # 99.67
                     "bert-base-nli-mean-tokens"]   # 99.71

train_examples = [InputExample(texts=[row['phrase1'],row['phrase2']],label=row['score']) for idx,row in train_df.iterrows()]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

phrase1 = [ phrase for phrase in valid_df['phrase1']]
phrase2 = [ phrase for phrase in valid_df['phrase2']]
scores = [ score for score in valid_df['score']]

evaluator = evaluation.EmbeddingSimilarityEvaluator(phrase1, phrase2, scores)

for model_name in pretrained_models:
  print(f"Finetuning {model_name}")
  model = SentenceTransformer(model_name,device="cuda:0")
  train_loss = losses.CosineSimilarityLoss(model)
  model.fit(train_objectives=[(train_dataloader, train_loss)], 
            epochs=5, 
            warmup_steps=100, 
            evaluator=evaluator, 
            evaluation_steps=500,
            output_path= f"{root_path}/models/{model_name}/")

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Finetuning paraphrase-MiniLM-L6-v2


Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Finetuning all-mpnet-base-v2


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Finetuning bert-base-nli-mean-tokens


Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6368 [00:00<?, ?it/s]

# Inference

In [6]:
def remove_noisy_words(phrase_list : list):
  common_titles = ["ceo ", "coo ", "cfo ", "cio ", "cmo ", "chro ", "cto ",
                 "director ", "chief ", "president ", "vice president ", "vp ",
                 "vice chair ", "board member ", "member ", "team member ", "team captain ",
                 "owner ", "chairman ", "co - chair ", "co - chairman ", "senior "
                 ]
  clean_list = []
  for phrase in phrase_list:
    phr = phrase
    for title in common_titles:
      phr = re.sub(title, '', phr)
    clean_list.append(phr)
  return clean_list

def keep_only_alnum(s): 
    s1 = re.sub(r'[^a-z0-9 ]+', ' ', s.lower())
    return " ".join(s1.split())

def data_preprocessing(phrase_list : list):
  phrase_list = [ re.sub('&', 'and', phrase) for phrase in phrase_list ]
  phrase_list = [ re.sub('/', 'or', phrase) for phrase in phrase_list ]
  phrase_list = [ keep_only_alnum(phrase) for phrase in phrase_list ]
  phrase_list = remove_noisy_words(phrase_list)
  return phrase_list

In [7]:
with open(root_path + "jobtitles_all.txt", 'r') as fp:
  job_titles = fp.read()
job_titles = job_titles.split("\n")

In [8]:
job_titles_processed = data_preprocessing(job_titles)

In [9]:
job_titles_processed[:20]

['art auctioneer',
 'interior architect',
 'supervisor call centre',
 'tv host',
 'fuel and feedstocks trader',
 'global operations program management office',
 'gasoil operations executive',
 'tiler',
 'executive assistant manager sales marketing and services',
 'professional tennis coach',
 'directororshop managerormechanic',
 'business exec',
 'credit analyst',
 'product specialistorm director',
 'supervising engineer',
 'marketing operations manager apac',
 'diploma in mass communications student',
 'freelance elt editor and writer',
 'supply planner asean oceania',
 'lead mech commissioning engineer']

In [10]:
with open(root_path + "departments_processed.json", 'r') as fp:
  department_dict = json.load(fp)

In [11]:
topic_map = {value : key for key,value_list in department_dict.items() for value in value_list}

In [12]:
len(topic_map)

323

In [13]:
from sentence_transformers import SentenceTransformer, util, models
import torch

model_name = "paraphrase-MiniLM-L6-v2"

embedder = SentenceTransformer(f"{root_path}/models/{model_name}/",device="cuda:0")
corpus = list(topic_map.keys())

queries = job_titles_processed

In [14]:
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [15]:
top_k = 20
from tqdm import tqdm
from polyglot.detect import Detector

from polyglot.detect.base import logger as polyglot_logger
polyglot_logger.setLevel("ERROR")

dept_prediction = []

for query in tqdm(queries):
  preds = []
  if len(query.split())==0: 
    dept_prediction.append(preds)
    continue

  try:
    lang_detector = Detector(query)
    if lang_detector.language.name != "English": 
      dept_prediction.append(preds)
      continue 

  except:
    dept_prediction.append(preds)
    continue

  query_embedding = embedder.encode(query, convert_to_tensor=True)
  cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
  top_cos_results = torch.topk(cos_scores, k=top_k)

  for score,idx in zip(top_cos_results[0],top_cos_results[1]):
    if float(score) >= 0.5:
      topic = corpus[int(idx)]
      preds.append(topic_map[topic])
  preds = list(set(preds))

  dept_prediction.append(preds)

100%|██████████| 141564/141564 [21:50<00:00, 108.04it/s]


In [16]:
pred_df = pd.DataFrame()
pred_df["Job Titles"] = job_titles
pred_df["Department Predictions"] = dept_prediction

In [17]:
pred_df.head()

Unnamed: 0,Job Titles,Department Predictions
0,art auctioneer,[Art and Photography]
1,interior architect,[Construction]
2,supervisor - call centre,[Management]
3,tv host,[Entertainment]
4,senior fuel and feedstocks trader,"[Energy and Mining, Logistics and Transportation]"


In [18]:
pred_df.to_csv(root_path + "dept_preds.csv",index=False)