In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# !pip install tensorflow==2.17.0

In [None]:
# !pip install tf-keras

In [None]:
# !pip install git+https://github.com/huggingface/transformers.git

In [None]:
# Load the raw job-title dataset from the Kaggle input directory and take a
# first look at its rows, schema and summary statistics.
df = pd.read_excel('/kaggle/input/organizationalroles/JobLevelData.xlsx')
display(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None".
df.info()
display(df.describe())


In [None]:
# Manually fill in some missing job levels; each value was chosen from the job
# title and internet research.
# Keys are row labels of `df`; the trailing comment shows the title of that row.
manual_levels = {
    29: 'Chief Officer',                    # CINO
    829: 'Individual Contributor/Staff',    # Release of Information Tech II
    1406: 'Manager',                        # Global People Systems, Processes and Information Manager
    1713: 'Individual Contributor/Staff',   # Supplier Quality Enginee
    1785: 'Manager',                        # RC Environmental and Cyber Specialized Subscription Manager
    2182: 'Director',                       # Senior IndependeDirector and Chair of the Customer and Communities Network
}
for row_label, level in manual_levels.items():
    df.loc[row_label, 'Column 1'] = level

# Row 2182 also carries a garbled title ("Senior IndependeDirector ..."); store
# the corrected wording.
df.loc[2182, 'Title'] = 'Senior Independent Director and Chair of the Customer and Communities Network'


In [None]:
# Extra hand-labelled founder/owner titles appended to the dataset.
#
# Each entry is (title, levels). The first level is stored in 'Column 1', the
# second (when present) in 'Column 2'. The column names contain a space, which
# is how the rest of the notebook addresses them; keys written as 'Column1' /
# 'Column2' match no column, so the labels would never reach the frame.
extra_titles = [
    ('CEO & CO-OWNER at Grupa Montownia', ('Owner', 'Chief Officer')),
    ('CEO & Founder at TechWings', ('Owner', 'Chief Officer')),
    ('CEO & CO-founder Smartspell', ('Owner', 'Chief Officer')),
    ('Founder, CTO at WebMakers Software House', ('Owner', 'Chief Officer')),
    ('Founder at TreeTopTrip', ('Owner',)),
    ('Founder & CEO w SIMLAB', ('Owner', 'Chief Officer')),
    ('Founder at Marmot Consulting', ('Owner',)),
    ('Founder and visionar', ('Owner',)),
    ('CEO/Founder of 154Studio', ('Owner', 'Chief Officer')),
    ('CEO & founder at Onee', ('Owner', 'Chief Officer')),
    ('Chief Business Development Officer & Co-Founder at Reality Metaverse', ('Owner', 'Chief Officer')),
    ('Founder & CEO at Academy inDEV', ('Owner', 'Chief Officer')),
]

for extra_title, extra_levels in extra_titles:
    new_row_loc = {'Title': extra_title}
    for position, level in enumerate(extra_levels, start=1):
        new_row_loc[f'Column {position}'] = level

    # Fail loudly on a misspelled column name instead of silently losing labels.
    unknown_columns = set(new_row_loc) - set(df.columns)
    if unknown_columns:
        raise KeyError(f'Columns not present in df: {sorted(unknown_columns)}')

    # Assigning to the not-yet-existing label len(df) appends one row.
    df.loc[len(df)] = new_row_loc

In [None]:
# One-hot (multi-hot) encode the job levels that are spread over the four
# label columns.
columns_to_encode = ['Column 1', 'Column 2', 'Column 3', 'Column 4']

# Every distinct non-null level found in any of the label columns.
# Kept as a set because later cells use len(unique_values).
unique_values = set()
for column in columns_to_encode:
    unique_values.update(df[column].dropna().unique().tolist())

# A row gets Label_<level> = 1 when that level occurs in any label column.
# Levels are visited in sorted order so the column order is the same on every
# run (iteration order of a set of strings is not), and the frame is built in
# one step instead of being grown column by column.
one_hot_encoded = pd.DataFrame(
    {
        f'Label_{value}': df[columns_to_encode].eq(value).any(axis=1).astype(int)
        for value in sorted(unique_values, key=str)
    },
    index=df.index,
)

# From here on `df` holds only the title plus one indicator column per level.
df = pd.concat([df['Title'], one_hot_encoded], axis=1)
display(df.head())

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Basic text clean-up of the titles: lowercase, turn every character that is
# neither a word character nor whitespace into a space, then squeeze runs of
# two or more whitespace characters into a single space. Digits are word
# characters for `\w`, so they are kept.
lowercased_titles = df['Title'].str.lower()
df['Title'] = lowercased_titles.apply(
    lambda title: re.sub(r'\s{2,}', ' ', re.sub(r'[^\w\s]', ' ', title))
)

# Drop English stop words. "it" is taken out of the stop-word set first because
# it is used a lot in job titles.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
stop_words.remove('it')


def _without_stop_words(title):
    """Return `title` without stop words, remaining words joined by single spaces."""
    return ' '.join(word for word in title.split() if word not in stop_words)


df['Title'] = df['Title'].apply(_without_stop_words)

# Normalize the job titles:
# i.e. sr -> senior, jr -> junior, vp -> vice president, etc.
#
# Lookup is per whitespace-separated word. Keys ending in '.' can only match
# text that still contains punctuation; the clean-up earlier in this cell
# replaces punctuation with spaces before the lookup runs.
job_title_dict = {
    # misspellings
    'vise': 'vice',
    'senoir': 'senior',
    'maneger': 'manager',
    'assistent': 'assistant',
    'supervisr': 'supervisor',
    'analist': 'analyst',
    'enginere': 'engineer',
    'developr': 'developer',
    'programer': 'programmer',
    'acountant': 'accountant',
    'lawer': 'lawyer',
    'docter': 'doctor',
    'analista': 'analyst',
    # short forms
    'dev': 'developer',
    'dev.': 'developer',
    'eng': 'engineer',
    'eng.': 'engineer',
    'engr': 'engineer',
    'engr.': 'engineer',
    # title prefixes
    # ('mng' / 'mng.' used to be listed here as 'managing' as well; a dict
    # literal keeps only the last value for a repeated key, so the effective
    # mapping has always been the 'manager' entry in the managers section.)
    'reg': 'regional',
    'reg.': 'regional',
    'assoc': 'associate',
    'assoc.': 'associate',
    'asst': 'assistant',
    'asst.': 'assistant',
    'exec': 'executive',
    'exec.': 'executive',
    'deputy': 'deputy',
    'deputy.': 'deputy',
    # expertise level
    'sr': 'senior',
    'sr.': 'senior',
    'snr': 'senior',
    'snr.': 'senior',
    'sen': 'senior',
    'sen.': 'senior',
    'jr': 'junior',
    'jr.': 'junior',
    'jnr': 'junior',
    'jnr.': 'junior',
    'jun': 'junior',
    'jun.': 'junior',
    'mid': 'middle',
    'mid.': 'middle',
    'mdl': 'middle',
    'mdl.': 'middle',
    # vice president
    'vp': 'vice president',
    'svp': 'senior vice president',
    'evp': 'executive vice president',
    'avp': 'assistant vice president',
    'sevp': 'senior executive vice president',
    'gvp': 'group vice president',
    'dvp': 'divisional vice president',
    'rvp': 'regional vice president',
    'cvp': 'corporate vice president',
    'davp': 'deputy assistant vice president',
    'savp': 'senior assistant vice president',
    'mvp': 'managing vice president',
    'arvp': 'associate regional vice president',
    # c level officers
    'cfo': 'chief financial officer',
    'coo': 'chief operating officer',
    'cto': 'chief technology officer',
    'cio': 'chief information officer',
    'chro': 'chief human resources officer',
    'cdo': 'chief data officer',
    'cmo': 'chief marketing officer',
    'cso': 'chief sales officer',
    'cco': 'chief communications officer',
    'cro': 'chief relationship officer',
    # directors
    'dir': 'director',
    'dir.': 'director',
    # managers
    'mgr': 'manager',
    'mgr.': 'manager',
    'mng': 'manager',
    'mng.': 'manager',
    'mngr': 'manager',
    'mngr.': 'manager',
}

# Function to normalize job titles
def normalize_job_title(title, mapping=None):
    """Expand known abbreviations and misspellings in a job title.

    The title is split on whitespace and every word that is a key of `mapping`
    is replaced by its value; all other words are kept. The words are then
    re-joined with single spaces.

    Args:
        title: The job title to normalize. Values that are not strings
            (e.g. NaN for a missing title) are returned unchanged.
        mapping: Optional dict of word -> replacement. Defaults to the
            module-level `job_title_dict`.

    Returns:
        The normalized title string, or `title` itself if it is not a string.
    """
    if not isinstance(title, str):
        return title
    if mapping is None:
        mapping = job_title_dict
    return ' '.join(mapping.get(word, word) for word in title.split())

# Expand abbreviations/misspellings in every title.
df['Title'] = df['Title'].apply(normalize_job_title)

# Keep the title plus the six job-level indicator columns, in a fixed order.
label_columns = [
    'Label_Individual Contributor/Staff',
    'Label_Vice President',
    'Label_Chief Officer',
    'Label_Owner',
    'Label_Manager',
    'Label_Director',
]
df = df[['Title'] + label_columns]

display(df.head(10))

ROBERTA

In [None]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, AdamWeightDecay 
from sklearn.model_selection import train_test_split

In [None]:
# List the GPU devices TensorFlow can see and report how many there are.
gpus = tf.config.list_physical_devices('GPU')
print(f"Number of GPUs available: {len(gpus)}")

In [None]:
# Settings of an earlier experiment, kept for reference (not active):
# # MODEL_NAME = 'roberta-base'
# MODEL_NAME = 'roberta-small'
# NUM_LABELS = len(unique_values)
# EPOCH = 100
# PATIENCE_CALLBACK = 20
# LR_CALLBACK = 10
# LEARNING_RATE = 0.00001
# BATCH_SIZE = 32
# TOKENIZER_LENGTH = 128

# Active training configuration.
# MODEL_NAME: pretrained checkpoint name passed to the tokenizer and the model.
MODEL_NAME = 'roberta-base'
# NUM_LABELS: size of the model's output layer.
# NOTE(review): this has to equal the number of label columns selected for `y`
# (six are listed there) — confirm unique_values holds exactly those six levels.
NUM_LABELS = len(unique_values)
# EPOCH: maximum number of epochs passed to model.fit; the EarlyStopping
# callback is also passed to model.fit.
EPOCH = 700
# PATIENCE_CALLBACK: `patience` of the EarlyStopping callback.
PATIENCE_CALLBACK = 2
# LR_CALLBACK: `patience` of the ReduceLROnPlateau callback.
LR_CALLBACK = 1
# LEARNING_RATE: initial learning rate of the Adam optimizer.
LEARNING_RATE = 0.00003
# BATCH_SIZE: batch size passed to model.fit.
BATCH_SIZE = 32
# TOKENIZER_LENGTH: every title is truncated/padded to this many tokens.
TOKENIZER_LENGTH = 128

In [None]:
# Load the pretrained tokenizer for MODEL_NAME; shared by `tokenize` (training data)
# and `prepare_input` (inference).
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)


In [None]:
def tokenize(df):
    """Tokenize the `Title` column of `df` for the RoBERTa model.

    All titles are encoded in a single batched call to the module-level
    `tokenizer`; every sequence is truncated or padded to exactly
    `TOKENIZER_LENGTH` tokens, with special tokens added.

    Args:
        df: DataFrame with a `Title` column of strings.

    Returns:
        A tuple `(input_ids, attention_masks)` of int32 NumPy arrays, each of
        shape `(len(df), TOKENIZER_LENGTH)`.
    """
    titles = df['Title'].tolist()

    # Nothing to encode: return correctly shaped empty arrays rather than
    # handing the tokenizer an empty batch.
    if not titles:
        empty = np.zeros((0, TOKENIZER_LENGTH), dtype=np.int32)
        return (empty, empty.copy())

    # One batched call replaces the per-row encode_plus loop; asking for NumPy
    # output avoids building TensorFlow tensors only to convert them back.
    encoded = tokenizer(
        titles,
        max_length=TOKENIZER_LENGTH,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='np',
    )

    # int32 matches the dtype the per-row TensorFlow tensors produced.
    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    attention_masks = np.asarray(encoded['attention_mask'], dtype=np.int32)
    return (input_ids, attention_masks)

In [None]:
# Features: the cleaned titles, kept as a one-column frame because `tokenize`
# reads df['Title']. Targets: the six job-level indicator columns.
target_columns = [
    'Label_Individual Contributor/Staff',
    'Label_Vice President',
    'Label_Chief Officer',
    'Label_Owner',
    'Label_Manager',
    'Label_Director',
]
X = df[['Title']]
y = df[target_columns]

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: features and targets of the training split must have the same number of rows.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Encode both splits into (input_ids, attention_masks) arrays for the model.
train_input_ids, train_attention_masks = tokenize(X_train)
valid_input_ids, valid_attention_masks = tokenize(X_valid)

In [None]:
# Inspect the first training example: its title, token ids and attention mask.
print(X_train.iloc[0])
print(train_input_ids[0])
print(train_attention_masks[0])

In [None]:
# Convert the label frames to float32 arrays for Keras.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is harmless (calling .to_numpy() on the already converted array would fail).
y_train = np.asarray(y_train, dtype=np.float32)
y_valid = np.asarray(y_valid, dtype=np.float32)

In [None]:
# Show the converted label arrays (NumPy abbreviates large arrays when printing).
print(y_train)
print(y_valid)

In [None]:
# X = df['Title'].values.tolist()
# labels = df[['Label_Individual Contributor/Staff', 'Label_Vice President', 'Label_Chief Officer', 'Label_Owner', 'Label_Manager', 'Label_Director']].values.astype(np.float32)

In [None]:
# print(X[:10])
# print(type(X))
# print(labels)
# print(type(labels))

In [None]:
import matplotlib.pyplot as plt

def plot_training_history(history, metrics=None):
    """Plot training and validation curves, one subplot per metric.

    Args:
        history: Object with a `.history` dict that maps a metric name to its
            list of per-epoch values (the value returned by `model.fit`).
        metrics: Optional iterable of metric names to plot. Defaults to
            loss, accuracy, auc, precision, recall and f1_score.

    Metrics that are not present in `history.history` are skipped, and the
    validation curve is drawn only when a `val_<metric>` series exists, so a
    missing series no longer raises `KeyError`.
    """
    if metrics is None:
        metrics = ['loss', 'accuracy', 'auc', 'precision', 'recall', 'f1_score']

    recorded = history.history
    available = [metric for metric in metrics if metric in recorded]
    if not available:
        print('No requested metrics found in the training history; nothing to plot.')
        return

    # squeeze=False always yields a 2-D array of axes, so a single metric works too.
    fig, axs = plt.subplots(len(available), 1, figsize=(12, 4 * len(available)), squeeze=False)
    fig.suptitle('Model Training History')

    for ax, metric in zip(axs[:, 0], available):
        # Plot training & validation metric values
        ax.plot(recorded[metric], label=f'Train {metric}')
        val_key = f'val_{metric}'
        if val_key in recorded:
            ax.plot(recorded[val_key], label=f'Validation {metric}')

        ax.set_title(f'{metric.capitalize()} Over Epochs')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric.capitalize())
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Output locations for full-model checkpoints, weight checkpoints and
# TensorBoard logs.
os.makedirs('./models', exist_ok=True)
os.makedirs('./weights', exist_ok=True)
os.makedirs('./logs', exist_ok=True)

# Use the first GPU when TensorFlow can see one; otherwise run on the CPU
# instead of pinning the strategy to a GPU that may not exist.
device = "/gpu:0" if tf.config.list_physical_devices('GPU') else "/cpu:0"
strategy = tf.distribute.OneDeviceStrategy(device)

with strategy.scope():
    transformer_model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

    # Two fixed-length integer inputs, named like the keys `prepare_input` returns.
    input_ids = tf.keras.layers.Input(shape=(TOKENIZER_LENGTH,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(TOKENIZER_LENGTH,), dtype=tf.int32, name='attention_mask')

    # First element of the classification model's output, squashed with an
    # independent sigmoid per label (multi-label setup: a title can carry
    # several levels, e.g. Owner and Chief Officer).
    embeddings = transformer_model(input_ids, attention_mask=attention_mask)[0]
    output = tf.keras.layers.Activation('sigmoid')(embeddings)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    loss = tf.keras.losses.BinaryCrossentropy()

    acc = tf.keras.metrics.BinaryAccuracy('accuracy')
    auc = tf.keras.metrics.AUC(name='auc')
    precision = tf.keras.metrics.Precision(name='precision')
    recall = tf.keras.metrics.Recall(name='recall')
    # threshold=0.5 makes the metric score every label whose probability exceeds
    # 0.5. With the default (threshold=None) only the arg-max label of each row
    # is counted as predicted, which is wrong for multi-label targets, and this
    # metric drives checkpointing, early stopping and the LR schedule below.
    f1_score = tf.keras.metrics.F1Score(average='macro', threshold=0.5, name='f1_score')

    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=[acc, auc, precision, recall, f1_score]
    )
    model.summary()

    # Checkpoint for saving the full model whenever validation F1 improves
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        './models/best_model.keras',
        monitor='val_f1_score',
        mode='max',
        save_best_only=True,
        save_weights_only=False,
        verbose=1
    )

    # Checkpoint for saving only the weights
    # NOTE(review): Keras 3 requires weights-only checkpoint paths to end in
    # '.weights.h5' — confirm which Keras version this notebook runs under.
    weights_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        './weights/best_model_weights.h5',
        monitor='val_f1_score',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
        verbose=1
    )

    # Stop once validation F1 has not improved for PATIENCE_CALLBACK epochs and
    # roll the model back to its best weights.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_f1_score',
        mode='max',
        patience=PATIENCE_CALLBACK,
        verbose=1,
        restore_best_weights=True
    )

    # Halve the learning rate after LR_CALLBACK epochs without improvement.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_f1_score',
        factor=0.5,
        patience=LR_CALLBACK,
        min_lr=1e-7,
        mode='max',
        verbose=1
    )

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

    history = model.fit(
        (train_input_ids, train_attention_masks),
        y_train,
        validation_data=((valid_input_ids, valid_attention_masks), y_valid),
        epochs=EPOCH,
        batch_size=BATCH_SIZE,
        callbacks=[model_checkpoint, weights_checkpoint, tensorboard_callback, early_stopping, reduce_lr]
    )
    plot_training_history(history)



In [None]:
def prepare_input(text):
    """Tokenize `text` into the two named inputs fed to the Keras model.

    `text` is handed to the module-level `tokenizer` unchanged, truncated or
    padded to `TOKENIZER_LENGTH` tokens, with TensorFlow tensors requested.

    Returns:
        dict with the keys 'input_ids' and 'attention_mask'.
    """
    encoded_input = tokenizer(
        text,
        max_length=TOKENIZER_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    # Keep only the two entries the model takes, in a fixed order.
    wanted_keys = ('input_ids', 'attention_mask')
    return {key: encoded_input[key] for key in wanted_keys}


# Predict for multiple texts at once.
# The example titles are written already lower-cased and without punctuation;
# `prepare_input` only tokenizes, it does not apply the title clean-up used on
# the training data.
texts_to_predict = ["senior security architect information security officer",
                    "principle software engineer director",
                    "embedddd software engineer lead",
                    "senior principal development engineer enterprise and data center infrastructure"]
# The same titles in raw form (not active):
# texts_to_predict = ["Senior Security Architect / Information Security Officer",
#                     "Principle Software Engineer, Director",
#                     "Embedddd Software Engineer Lead",
#                     "Senior Principal Development Engineer Enterprise and Data Center Infrastructure"]
# Tokenize the batch and run the trained model on it.
batch_input = prepare_input(texts_to_predict)
batch_prediction = model.predict(batch_input)



In [None]:
# Show the token ids that were fed to the model, then the type and values of
# the prediction returned by model.predict.
print(batch_input['input_ids'])
print(type(batch_prediction))
print(batch_prediction)

In [None]:
# def create_model():
#     transformer_model = TFRobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

#     input_ids = tf.keras.layers.Input(shape=(TOKENIZER_LENGTH,), dtype=tf.int32, name='input_ids')
#     attention_mask = tf.keras.layers.Input(shape=(TOKENIZER_LENGTH,), dtype=tf.int32, name='attention_mask')

#     embeddings = transformer_model(input_ids, attention_mask=attention_mask)[0]
#     output = tf.keras.layers.Activation('sigmoid')(embeddings)

#     model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

#     return model


# model_tst = create_model()
# model_tst.load_weights('/kaggle/input/infuse-weights/tensorflow2/default/1/best_model_weights.h5')