<a href="https://colab.research.google.com/github/mtmanna/datasci207/blob/main/Tokenizer%20and%20Labeler%20Notebooks/Model_1_Tokenizer_bert_base_with_oversample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Installs

!pip install pydot --quiet
!pip install transformers==4.37.2 --quiet
!pip install -U imbalanced-learn --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#@title Imports
import pandas as pd

import numpy as np
import random
import torch

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, TFBertModel

from transformers import logging
logging.set_verbosity_error()

from collections import Counter

import pickle


In [None]:
# Set seed to 10

seed_value = 10

# Seed every random number generator the notebook touches, in a fixed order:
# Python's `random`, NumPy, PyTorch (CPU, then all CUDA devices) and TensorFlow.
for seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    tf.random.set_seed,
):
    seed_rng(seed_value)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/DataSci 266 Project

column_names = ['index','title','text','level','salary']

#Load data
labeled_data = pd.read_csv('labeled_job_data.csv',header=None, names=column_names, skiprows=1)

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1V3QooLePiHR_DaZhbXQhsjmP1Ez5fv5F/DataSci 266 Project


In [None]:
# Drop rows with no posting text. This mutates labeled_data in place.
labeled_data.dropna(subset=['text'], inplace=True)

In [None]:
#@title Creating Salary Buckets

# UPDATED SALARY BUCKETS 3/23!!

def map_salary_to_bucket(salary):
    """Translate a numeric salary into its salary-bucket label.

    Parameters
    ----------
    salary : number or missing value
        The salary to classify.

    Returns
    -------
    str or float
        ``np.nan`` when the salary is missing or exactly 0; otherwise the label
        of the first bucket whose exclusive upper bound is above the salary,
        or ``'>200k'`` when the salary is 200000 or more.
    """
    if pd.isnull(salary) or salary == 0:
        return np.nan

    # (exclusive upper bound, label) pairs in ascending order; each bucket's
    # lower bound is implied by the previous upper bound.
    bucket_ceilings = (
        (45000, '<45k'),
        (65000, '45-65k'),
        (85000, '65-85k'),
        (110000, '85-110k'),
        (150000, '110-150k'),
        (200000, '150-200k'),
    )
    for ceiling, bucket_label in bucket_ceilings:
        if salary < ceiling:
            return bucket_label
    return '>200k'


# Bucket every row's salary; missing or zero salaries become NaN.
labeled_data['salary_bucket'] = labeled_data['salary'].apply(map_salary_to_bucket)

In [None]:
# Rows per salary bucket; rows without a bucket are not counted by value_counts().
print(labeled_data['salary_bucket'].value_counts())

salary_bucket
45-65k      3681
<45k        3593
65-85k      3324
85-110k     2082
110-150k    1448
>200k        605
150-200k     548
Name: count, dtype: int64


In [None]:
# Rows per seniority level; rows without a level are not counted by value_counts().
print(labeled_data['level'].value_counts())

level
senior            39009
entry             13320
mid               13152
junior             9705
student_intern     5005
executive          1818
Name: count, dtype: int64


In [None]:
# Raise the column display width so long text cells are shown rather than truncated.
pd.set_option('display.max_colwidth', 10000)

In [None]:
# Dictionaries
# Integer code used for a missing label in both tasks.
MISSING_LABEL = -1

# Level name -> integer code. The np.nan entry is kept so the dictionary still
# records that a missing level is encoded as -1, but the encoding below no
# longer relies on looking that key up.
level_key = {
    "student_intern": 0,
    "entry": 1,
    "junior": 2,
    "mid": 3,
    "senior": 4,
    "executive": 5,
    np.nan: MISSING_LABEL
}

# Salary bucket name -> integer code (same convention for the np.nan entry).
salary_key = {
    "<45k": 0,
    "45-65k": 1,
    "65-85k": 2,
    "85-110k": 3,
    "110-150k": 4,
    "150-200k": 5,
    ">200k": 6,
    np.nan: MISSING_LABEL
}


def _encode_label(value, mapping):
    """Return the integer code of ``value`` in ``mapping``, or -1 when it is missing.

    Missing values are detected with ``pd.isnull`` rather than by looking up
    ``np.nan`` in the dictionary: NaN never compares equal to itself, so
    ``mapping[nan]`` only succeeds when the value happens to be the very same
    object as the dictionary key, and raises ``KeyError`` for any other NaN or
    for ``None``.

    Raises
    ------
    KeyError
        If ``value`` is present but is not a key of ``mapping``.
    """
    if pd.isnull(value):
        return MISSING_LABEL
    return mapping[value]


labeled_data['level_labels'] = labeled_data['level'].apply(
    lambda label: _encode_label(label, level_key)
)
labeled_data['salary_labels'] = labeled_data['salary_bucket'].apply(
    lambda label: _encode_label(label, salary_key)
)

In [None]:
#labeled_data.head()

In [None]:
#@title BERT

# Hugging Face checkpoint shared by the tokenizer and the model below.
checkpoint = 'bert-base-cased'
# Load the tokenizer for the checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
# Load the TensorFlow BERT model for the same checkpoint.
# NOTE(review): bert_model is not referenced by any other visible cell; confirm
# whether loading it in this notebook is still needed.
bert_model = TFBertModel.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
#@title Train Test Split

# Per Pascual: we should probably split the data into rows with both labels, rows with only a salary label, and rows with only a level label, then split each subset separately and concatenate the results

# Sample dataset
# sample_df = labeled_data.sample(n=5000)

# # Separate data into subsets - sample
# both_labels = sample_df.dropna(subset=['level', 'salary_bucket'])
# salary_only = sample_df[sample_df['salary_bucket'].notna() & sample_df['level'].isna()]
# level_only = sample_df[sample_df['level'].notna() & sample_df['salary_bucket'].isna()]

# Separate data into subsets - all data
def _shuffle_rows(frame):
    """Return ``frame`` with its rows in a seeded random order and a fresh 0..n-1 index."""
    return frame.sample(frac=1, random_state=seed_value).reset_index(drop=True)


def _split_train_test_val(frame):
    """Split ``frame`` into (train, test, validation) parts of roughly 70/15/15.

    30% of the rows are held out first, then the hold-out is split in half.
    Both splits use ``seed_value`` so re-running the cell reproduces the same
    partition instead of depending on how far the global NumPy RNG has advanced.
    """
    train_part, holdout = train_test_split(frame, test_size=0.3, random_state=seed_value)
    test_part, val_part = train_test_split(holdout, test_size=0.5, random_state=seed_value)
    return train_part, test_part, val_part


# Separate data into subsets - all data
both_labels = labeled_data.dropna(subset=['level', 'salary_bucket'])
salary_only = labeled_data[labeled_data['salary_bucket'].notna() & labeled_data['level'].isna()]
level_only = labeled_data[labeled_data['level'].notna() & labeled_data['salary_bucket'].isna()]

# Shuffle
both_labels = _shuffle_rows(both_labels)
salary_only = _shuffle_rows(salary_only)
level_only = _shuffle_rows(level_only)

# Train / test / validation split for each subset, so every split receives its
# share of both-label, salary-only and level-only rows.
train_both_labels, test_both_labels, val_both_labels = _split_train_test_val(both_labels)
train_salary_only, test_salary_only, val_salary_only = _split_train_test_val(salary_only)
train_level_only, test_level_only, val_level_only = _split_train_test_val(level_only)

# Combine subsets into train, test and validation datasets, reshuffling each
# so the three subsets are interleaved.
train_data = _shuffle_rows(pd.concat([train_both_labels, train_salary_only, train_level_only]))
test_data = _shuffle_rows(pd.concat([test_both_labels, test_salary_only, test_level_only]))
val_data = _shuffle_rows(pd.concat([val_both_labels, val_salary_only, val_level_only]))

# Report the label distribution of every split, separated by a divider line.
count_reports = [
    ("train data salary bucket counts:", train_data['salary_bucket']),
    ("test data salary bucket counts:", test_data['salary_bucket']),
    ("validation data salary bucket counts:", val_data['salary_bucket']),
    ("train data level counts:", train_data['level']),
    ("test data level counts:", test_data['level']),
    ("validation data level counts:", val_data['level']),
]
for position, (heading, labels) in enumerate(count_reports):
    if position:
        print("************************")
    print(heading)
    print(labels.value_counts())

train data salary bucket counts:
salary_bucket
45-65k      2546
<45k        2475
65-85k      2356
85-110k     1497
110-150k     991
>200k        429
150-200k     401
Name: count, dtype: int64
************************
test data salary bucket counts:
salary_bucket
<45k        578
45-65k      543
65-85k      503
85-110k     289
110-150k    209
>200k        88
150-200k     83
Name: count, dtype: int64
************************
validation data salary bucket counts:
salary_bucket
45-65k      592
<45k        540
65-85k      465
85-110k     296
110-150k    248
>200k        88
150-200k     64
Name: count, dtype: int64
************************
train data level counts:
level
senior            27340
entry              9325
mid                9170
junior             6778
student_intern     3508
executive          1284
Name: count, dtype: int64
************************
test data level counts:
level
senior            5855
entry             1991
mid               1989
junior            1450
student_int

In [None]:
#@title export raw training to pickle
# Persist the training split as it is before any over- or under-sampling.
with open('train_no_oversample_df.pickle', mode='wb') as raw_train_file:
    pickle.dump(train_data, raw_train_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#@title Resampling - Level

#Resampling Level
#Resampling Level
# Every column except the level name travels as a "feature", so resampled rows
# keep their salary fields and their level_labels code.
X_train = train_data[['index', 'title', 'text', 'salary', 'salary_bucket', 'salary_labels', 'level_labels']]
y_train = train_data['level']

# Define the classes to undersample and oversample
levels_to_undersample = ['mid', 'junior', 'senior'] #will only undersample senior
levels_to_oversample = ['student_intern', 'executive', 'entry'] #will only oversample student_intern and 'executive'

# Create a mask to identify samples that belong to the classes to be undersampled
level_mask_undersample = np.isin(y_train, levels_to_undersample)

# Create a mask to identify samples that belong to the classes to be oversampled
level_mask_oversample = np.isin(y_train, levels_to_oversample)

# Rows in neither group (in the captured output these are the rows with no level)
# are passed through unchanged.
level_mask_rest = ~level_mask_undersample & ~level_mask_oversample

# Seed both samplers explicitly so re-running the cell reproduces the same rows.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=seed_value)
ros = RandomOverSampler(random_state=seed_value)

# Undersample the 'senior' class ('majority' resamples only the largest class of the group)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train[level_mask_undersample], y_train[level_mask_undersample])

# Oversample the 'executive' and 'student_intern' classes up to the size of 'entry'
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train[level_mask_oversample], y_train[level_mask_oversample])

# Combine the resampled data with the untouched rows. pd.concat keeps column
# names and dtypes, where np.concatenate would turn everything into an
# object array and require the column list to be retyped by hand.
X_resampled = pd.concat([X_train[level_mask_rest], X_resampled_ros, X_resampled_rus], ignore_index=True)
y_resampled = pd.concat([y_train[level_mask_rest], y_resampled_ros, y_resampled_rus], ignore_index=True)

# Attach the level as the last column; assigned by position because both
# pieces were concatenated in the same order.
train_resampled_df = X_resampled.copy()
train_resampled_df['level'] = y_resampled.to_numpy()

print("Original class distribution:", Counter(y_train))
print("Resampled class distribution:", Counter(train_resampled_df['level']))



# print(train_resampled_df['salary_bucket'].value_counts())
# print(train_resampled_df['level'].value_counts())

#print(train_resampled_df.tail(10))

Original class distribution: Counter({'senior': 27340, 'entry': 9325, 'mid': 9170, nan: 8215, 'junior': 6778, 'student_intern': 3508, 'executive': 1284})
Resampled class distribution: Counter({'entry': 9325, 'student_intern': 9325, 'executive': 9325, 'mid': 9170, nan: 8215, 'junior': 6778, 'senior': 6778})


In [None]:
#@title Resampling - salary_buckets

#Resampling buckets
#Resampling buckets
# Start from the level-resampled frame; every column except the bucket name
# travels as a "feature" so resampled rows keep their level fields.
X_train_2 = train_resampled_df[['index', 'title', 'text', 'salary', 'salary_labels', 'level_labels', 'level']]
y_train_2 = train_resampled_df['salary_bucket']

# Define the classes to undersample and oversample
buckets_to_undersample = ['85-110k', '<45k', '45-65k', '65-85k']
buckets_to_oversample = ['110-150k', '150-200k', '>200k']

# Create masks
bucket_mask_undersample = np.isin(y_train_2, buckets_to_undersample)
bucket_mask_oversample = np.isin(y_train_2, buckets_to_oversample)
# Rows in neither group (in the captured output these are the rows with no
# salary bucket) are passed through unchanged.
bucket_mask_rest = ~bucket_mask_undersample & ~bucket_mask_oversample

# Seed both samplers explicitly so re-running the cell reproduces the same rows.
rus = RandomUnderSampler(random_state=seed_value)
ros = RandomOverSampler(random_state=seed_value)

# Undersample every listed bucket down to the size of the smallest of them
# (the default strategy resamples all classes but the minority one).
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train_2[bucket_mask_undersample], y_train_2[bucket_mask_undersample])

# Oversample the 110-150k, 150-200k, and >200k classes up to the largest of them
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train_2[bucket_mask_oversample], y_train_2[bucket_mask_oversample])

# Combine with level resampled data. pd.concat keeps column names and dtypes,
# so the frame no longer has to be rebuilt from an object array with a
# hand-typed column list.
X_resampled = pd.concat([X_train_2[bucket_mask_rest], X_resampled_ros, X_resampled_rus], ignore_index=True)
y_resampled = pd.concat([y_train_2[bucket_mask_rest], y_resampled_ros, y_resampled_rus], ignore_index=True)

print("Original class distribution:", Counter(y_train_2))
print("Resampled class distribution:", Counter(y_resampled))

# Attach the bucket by position (both pieces were concatenated in the same
# order), then restore the column order used by the rest of the notebook.
train_final_df = X_resampled.copy()
train_final_df['salary_bucket'] = y_resampled.to_numpy()
train_final_df = train_final_df.reindex(columns=['index', 'title', 'text', 'salary', 'salary_bucket', 'salary_labels', 'level', 'level_labels'])

# print(train_final_df['salary_bucket'].value_counts())
# print(train_final_df['level'].value_counts())

#print(train_final_df.tail(10))

Original class distribution: Counter({nan: 48982, '<45k': 2603, '45-65k': 2294, '65-85k': 2002, '85-110k': 1357, '110-150k': 942, '>200k': 375, '150-200k': 361})
Resampled class distribution: Counter({nan: 48982, '45-65k': 1357, '65-85k': 1357, '85-110k': 1357, '<45k': 1357, '110-150k': 942, '>200k': 942, '150-200k': 942})


In [None]:
# print(train_final_df['level_labels'].value_counts())
# print(train_final_df['level'].value_counts())
# print(train_final_df['salary_bucket'].unique())
# print(train_final_df['salary_bucket'].value_counts())
# print(train_final_df['salary_labels'].value_counts())

In [None]:
# Report how many training rows carry each integer label code after both
# resampling passes (-1 is the code assigned to a missing label), first for
# level and then for salary.
print("Final class distributions for train dataset:")
print(train_final_df['level_labels'].value_counts())
print(train_final_df['salary_labels'].value_counts())
# Last expression of the cell: total number of rows in the final training frame.
len(train_final_df)

Final class distributions for train dataset:
 1    9289
 5    9271
 3    9143
 0    9094
-1    7032
 4    6725
 2    6682
Name: level_labels, dtype: int64
-1    48982
 3     1357
 2     1357
 1     1357
 0     1357
 5      942
 4      942
 6      942
Name: salary_labels, dtype: int64


57236

In [None]:
# #@title Tokenize

# texts_train = train_final_df['text'].tolist()

# MAX_SEQUENCE_LENGTH = 512

# bert_train_tokenized = bert_tokenizer(
#     texts_train,
#     max_length=MAX_SEQUENCE_LENGTH,
#     truncation=True,
#     padding='max_length',
#     return_tensors='tf'
# )

# bert_train_inputs = [bert_train_tokenized.input_ids,
#                      bert_train_tokenized.token_type_ids,
#                      bert_train_tokenized.attention_mask]

# # Repeat w test data

# texts_test = test_data['text'].tolist()

# bert_test_tokenized = bert_tokenizer(
#     texts_test,
#     max_length=MAX_SEQUENCE_LENGTH,
#     truncation=True,
#     padding='max_length',
#     return_tensors='tf'
# )

# bert_test_inputs = [bert_test_tokenized.input_ids,
#                     bert_test_tokenized.token_type_ids,
#                     bert_test_tokenized.attention_mask]


# # Repeat w val data

# texts_val = val_data['text'].tolist()

# bert_val_tokenized = bert_tokenizer(
#     texts_val,
#     max_length=MAX_SEQUENCE_LENGTH,
#     truncation=True,
#     padding='max_length',
#     return_tensors='tf'
# )

# bert_val_inputs = [bert_val_tokenized.input_ids,
#                     bert_val_tokenized.token_type_ids,
#                     bert_val_tokenized.attention_mask]

In [None]:
# Cast both label-code columns to int in the train, test and validation
# frames (train first, level before salary within each frame).
for split_frame in (train_final_df, test_data, val_data):
    for label_column in ('level_labels', 'salary_labels'):
        split_frame[label_column] = split_frame[label_column].astype(int)

In [None]:
#@title Export pickled data, labels, tokenized inputs

# import pickle

# #export tokenized inputs
# with open('train_inputs_bert.pickle', 'wb') as handle:
#     pickle.dump(bert_train_inputs, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('test_inputs_bert.pickle', 'wb') as handle:
#     pickle.dump(bert_test_inputs, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('val_inputs_bert.pickle', 'wb') as handle:
#     pickle.dump(bert_val_inputs, handle, protocol=pickle.HIGHEST_PROTOCOL)


# # export train, val, test
# with open('train_final_df.pickle', 'wb') as handle:
#     pickle.dump(train_final_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('test_data.pickle', 'wb') as handle:
#     pickle.dump(test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('val_data.pickle', 'wb') as handle:
#     pickle.dump(val_data, handle, protocol=pickle.HIGHEST_PROTOCOL)


# export labels
# Write the training level label codes to disk as a pickled Series.
with open('labels_level_train_baseline.pickle', mode='wb') as level_labels_file:
    pickle.dump(train_final_df['level_labels'], level_labels_file, protocol=pickle.HIGHEST_PROTOCOL)

# with open('labels_salary_train_baseline.pickle', 'wb') as handle:
#     pickle.dump(train_final_df['salary_labels'], handle, protocol=pickle.HIGHEST_PROTOCOL)


# with open('labels_level_test_baseline.pickle', 'wb') as handle:
#     pickle.dump(test_data['level_labels'], handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('labels_salary_test_baseline.pickle', 'wb') as handle:
#     pickle.dump(test_data['salary_labels'], handle, protocol=pickle.HIGHEST_PROTOCOL)


# with open('labels_level_val_baseline.pickle', 'wb') as handle:
#     pickle.dump(val_data['level_labels'], handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('labels_salary_val_baseline.pickle', 'wb') as handle:
#     pickle.dump(val_data['salary_labels'], handle, protocol=pickle.HIGHEST_PROTOCOL)


