In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from google.colab import drive
from transformers import BertForSequenceClassification
from peft import LoraConfig, TaskType
from peft import get_peft_model
from tqdm import tqdm
import pandas as pd

In [2]:
# Load the NAICS 2022 table, derive the industry group by dropping the last two
# digits of the code, and keep only the rows that come with an EXAMPLE text.
NAICS_2022 = (
    pd.read_csv('NAICS_2022.csv')
    .assign(INDUSTRY_GROUP=lambda frame: frame['Code'] // 100)
    .dropna(subset=['EXAMPLE'])
    .reset_index(drop=True)
)

def extract_examples(text):
    """Split a semicolon-separated EXAMPLE string into a list of cleaned examples.

    The text is lower-cased, every occurrence of the boilerplate
    'for example, ' is removed, and the remainder is split on ';'.
    Each piece has runs of whitespace collapsed to a single space and its
    surrounding whitespace removed; pieces that end up empty (for instance
    the one after a trailing ';') are dropped.

    Parameters
    ----------
    text : str
        Raw example text in which individual examples are separated by ';'.

    Returns
    -------
    list of str
        The cleaned examples, in their original order. Empty when the text
        contains no example at all.
    """
    text = text.lower().replace('for example, ', '')
    # Split on the separator first, then normalise each piece on its own, so a
    # ';' in the middle of a token ("a;b") separates two examples instead of
    # gluing them together, and no piece keeps a leading space.
    pieces = (' '.join(piece.split()) for piece in text.split(';'))
    return [piece for piece in pieces if piece]


# Turn every EXAMPLE string (examples are separated by semicolons) into a list of examples.
NAICS_2022['EXAMPLE_SPLIT'] = NAICS_2022['EXAMPLE'].apply(extract_examples)

# Work on a copy and give every individual example its own row.
INPUT_DF = NAICS_2022.copy()
INPUT_DF_EXPLODE = INPUT_DF.explode('EXAMPLE_SPLIT').reset_index(drop=True)

# Keep only the NAICS codes that occur on more than one row after exploding.
value_counts = INPUT_DF_EXPLODE['Code'].value_counts()
values_to_keep = value_counts.loc[value_counts.gt(1)].index
INPUT_DF_EXPLODE = INPUT_DF_EXPLODE.loc[
    INPUT_DF_EXPLODE['Code'].isin(values_to_keep)
].reset_index(drop=True)

# Integer labels for the industry groups (NAICS code without its last two digits),
# numbered from 0 in order of first appearance.
# (The original notes mention 312 industry groups; that count is not re-checked here.)
mapping_IG = {
    group: label
    for label, group in enumerate(INPUT_DF_EXPLODE['INDUSTRY_GROUP'].unique())
}
INPUT_DF_EXPLODE['industry_group_label'] = INPUT_DF_EXPLODE['INDUSTRY_GROUP'].map(mapping_IG)

# Integer labels for the full NAICS codes, built the same way.
mapping_NAICS = {
    code: label
    for label, code in enumerate(INPUT_DF_EXPLODE['Code'].unique())
}
INPUT_DF_EXPLODE['naics_label'] = INPUT_DF_EXPLODE['Code'].map(mapping_NAICS)

# Label -> original value lookups, used to decode predictions.
mapping_IG_reverse = dict(zip(mapping_IG.values(), mapping_IG.keys()))
mapping_NAICS_reverse = dict(zip(mapping_NAICS.values(), mapping_NAICS.keys()))

# Columns needed downstream.
INPUT_DF_EXPLODE_SELECT = INPUT_DF_EXPLODE[[
    'Class definition',
    'EXAMPLE_SPLIT',
    'industry_group_label',
    'naics_label',
]]



In [3]:
# Display the NAICS table, which now also carries INDUSTRY_GROUP and EXAMPLE_SPLIT.
NAICS_2022

Unnamed: 0,Level,Hierarchical structure,Code,Parent,Class title,Superscript,Class definition,EXAMPLE,INDUSTRY_GROUP,EXAMPLE_SPLIT
0,5,Canadian industry,111110,11111,Soybean farming,,This Canadian industry comprises establishment...,"soybean farming, field and seed production; s...",1111,"[soybean farming, field and seed production, ..."
1,5,Canadian industry,111120,11112,Oilseed (except soybean) farming,US,This Canadian industry comprises establishment...,canola (rapeseed) farming; linseed (flaxseed)...,1111,"[canola (rapeseed) farming, linseed (flaxseed..."
2,5,Canadian industry,111130,11113,Dry pea and bean farming,US,This Canadian industry comprises establishment...,bean or pea farming (field crop); dry bean fa...,1111,"[bean or pea farming (field crop), dry bean f..."
3,5,Canadian industry,111140,11114,Wheat farming,,This Canadian industry comprises establishment...,"wheat (i.e., spring, winter and durum) growin...",1111,"[wheat (i.e., spring, winter and durum) growin..."
4,5,Canadian industry,111150,11115,Corn farming,US,This Canadian industry comprises establishment...,"corn (except sweet corn) farming, field and s...",1111,"[corn (except sweet corn) farming, field and s..."
...,...,...,...,...,...,...,...,...,...,...
918,5,Canadian industry,913910,91391,"Other local, municipal and regional public adm...",CAN,This Canadian industry comprises establishment...,"adult education program, local; amusement tax...",9139,"[adult education program, local, amusement ta..."
919,5,Canadian industry,914111,91411,First Nations public administration,CAN,This Canadian industry comprises establishment...,First Nations Band Council; First Nations Sel...,9141,"[first nations band council, first nations se..."
920,5,Canadian industry,914112,91411,MÃ©tis public administration,CAN,This Canadian industry comprises establishment...,Métis Self-Government;,9141,[métis self-government]
921,5,Canadian industry,914113,91411,Inuit public administration,CAN,This Canadian industry comprises establishment...,Inuit Self-Government;,9141,[inuit self-government]


In [4]:
# Report whether PyTorch can see a CUDA device.
print("GPU is available!" if torch.cuda.is_available() else "GPU is not available. Using CPU.")

GPU is available!


In [5]:
# Number of (example, label) rows available for the train/validation split.
len(INPUT_DF_EXPLODE_SELECT)

19138

In [6]:
# Hold out 10% of the examples for validation. The split is stratified on the
# NAICS label and seeded so it is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    INPUT_DF_EXPLODE_SELECT.loc[:, 'EXAMPLE_SPLIT'],
    INPUT_DF_EXPLODE_SELECT.loc[:, 'naics_label'],
    stratify=INPUT_DF_EXPLODE_SELECT.loc[:, 'naics_label'],
    test_size=0.1,
    random_state=42,
)

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each split in a single call, returning padded/truncated PyTorch tensors.
tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors='pt')
train_encodings = tokenizer(X_train.tolist(), **tokenizer_kwargs)
valid_encodings = tokenizer(X_valid.tolist(), **tokenizer_kwargs)

In [8]:
# Turn the integer class labels of both splits into tensors.
train_labels, valid_labels = (
    torch.tensor(split_labels.tolist()) for split_labels in (y_train, y_valid)
)


In [9]:
# Use the GPU when one is available, otherwise the CPU, and move every encoding
# tensor and both label tensors onto that device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
train_encodings = {name: tensor.to(device) for name, tensor in train_encodings.items()}
valid_encodings = {name: tensor.to(device) for name, tensor in valid_encodings.items()}
train_labels, valid_labels = train_labels.to(device), valid_labels.to(device)

In [10]:
# Inspect the tokenized training tensors (input_ids, token_type_ids, attention_mask).
train_encodings

{'input_ids': tensor([[  101,  3871,  9620,  ...,     0,     0,     0],
         [  101,  2512,  1011,  ...,     0,     0,     0],
         [  101,  3221,  3688,  ...,     0,     0,     0],
         ...,
         [  101, 17462, 11382,  ...,     0,     0,     0],
         [  101,  3806,  2240,  ...,     0,     0,     0],
         [  101,  7254,  7390,  ...,     0,     0,     0]], device='cuda:0'),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

In [11]:
from torch.utils.data import TensorDataset, DataLoader

# Bundle the three BERT input tensors with the labels. The positional order
# (input_ids, token_type_ids, attention_mask, labels) is the order in which the
# training loop unpacks each batch.
train_dataset = TensorDataset(
    *(train_encodings[name] for name in ('input_ids', 'token_type_ids', 'attention_mask')),
    train_labels,
)


In [12]:
# Mini-batch size shared by the loaders created below.
batch_size = 16

# Training loader: the examples are reshuffled each time the loader is iterated.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)


In [13]:
# Validation set, with tensors in the same positional order as the training set:
# input_ids, token_type_ids, attention_mask, labels.
valid_dataset = TensorDataset(valid_encodings['input_ids'],
                              valid_encodings['token_type_ids'],
                              valid_encodings['attention_mask'],
                              valid_labels)

# Evaluation gains nothing from shuffling. A fixed order keeps the composition of
# every batch (including the short last one) identical across epochs, so the
# per-batch-averaged validation loss is reproducible.
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [14]:
# Size of the classification head. It is taken from the full label mapping rather
# than from the labels that happen to land in the training split: labels run from
# 0 to len(mapping_NAICS) - 1, and the head must cover every one of them even if
# a class were missing from y_train.
NUM_CLASS = len(mapping_NAICS)

In [15]:
# Pretrained BERT with a classification head sized for NUM_CLASS labels.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=NUM_CLASS)

# Wrap the model with LoRA adapters configured for sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=1,
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)

# Optimizer over the model parameters, then move the model to the selected device.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.to(device)

# Parameter counts: trainable ones first, then all of them (frozen included).
param_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]

num_trainable_params = sum(size for size, trainable in param_sizes if trainable)
print(f"Number of trainable parameters: {num_trainable_params}")

num_total_params = sum(size for size, _ in param_sizes)
print(f"Total number of parameters: {num_total_params}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 963173
Total number of parameters: 111113674


In [16]:
# Share of parameters that are trainable. Computed from the counts above instead
# of numbers pasted from the previous output, so it stays correct when the model
# or the number of classes changes.
num_trainable_params / num_total_params

0.008668357055676154

In [17]:
# Print the module tree of the LoRA-wrapped model to check where the adapters were inserted.
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): Linear(
                    in_features=768, out_features=768, bias=True
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_

In [17]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [18]:
import torch
import torch.nn as nn
from torch.nn.parallel import DataParallel


# Replicate the model across GPUs only when more than one is visible. The
# isinstance guard keeps the cell idempotent: re-running it does not wrap an
# already wrapped model a second time.
if torch.cuda.device_count() > 1 and not isinstance(model, DataParallel):
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = DataParallel(model)

# Put the model on the same device as the data. `device` falls back to the CPU
# when no GPU is present, whereas an unconditional .cuda() would raise there.
model = model.to(device)

# Now you can use the model as usual


In [19]:
import torch

# Limit GPU memory growth
#torch.cuda.set_per_process_memory_fraction(0.8)  # Adjust the fraction as needed


In [20]:
#pip install nvidia-ml-py3

In [21]:
from transformers import  AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, balanced_accuracy_score
from tqdm import tqdm
import os
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is presumably read when the CUDA allocator is
# first initialised. Tensors were already moved to the GPU in earlier cells, so this
# assignment may have no effect in the current kernel. Confirm, and set it before the
# first CUDA call if it is needed.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Number of training epochs
NUM_EPOCHS = 10

# Linear decay of the learning rate over all optimisation steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * NUM_EPOCHS,
)
# Not used by the loop below, which relies on the loss returned by the model; kept
# because other cells may reference it.
criterion = torch.nn.CrossEntropyLoss()

# Per-epoch histories.
train_losses = []
valid_losses = []
f1_scores = []
balanced_accuracies = []

for epoch in range(NUM_EPOCHS):
    torch.cuda.empty_cache()
    model.train()
    total_train_loss = 0.0

    # Progress bar over the training batches of this epoch.
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}', leave=False)

    for batch in progress_bar:
        input_ids, token_type_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
        # When the model is wrapped in DataParallel, the loss comes back with one
        # value per replica. Reduce it to a scalar so backward() and item() work.
        # On a single device .mean() leaves the scalar loss unchanged.
        loss = outputs.loss.mean()

        loss.backward()
        # Gradient clipping to prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Read the loss value once and reuse it for the running total and the bar.
        batch_loss = loss.item()
        total_train_loss += batch_loss
        progress_bar.set_postfix({'Training Loss': batch_loss})

    # Mean of the per-batch training losses.
    average_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(average_train_loss)

    # Validation
    model.eval()
    total_valid_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in valid_dataloader:
            input_ids, token_type_ids, attention_mask, labels = batch

            outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
            # Same scalar reduction as in training (see above).
            loss = outputs.loss.mean()

            total_valid_loss += loss.item()

            # Predicted class = index of the largest logit.
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch validation losses.
    average_valid_loss = total_valid_loss / len(valid_dataloader)
    valid_losses.append(average_valid_loss)

    # Validation metrics for this epoch.
    f1 = f1_score(all_labels, all_preds, average='weighted')
    f1_scores.append(f1)
    balanced_acc = balanced_accuracy_score(all_labels, all_preds)
    balanced_accuracies.append(balanced_acc)

    # Print training and validation statistics
    print(f'Epoch {epoch + 1}/{NUM_EPOCHS}:')
    print(f'Training Loss: {average_train_loss}')
    print(f'Validation Loss: {average_valid_loss}')
    print(f'F1 Score: {f1}')
    print(f'Balanced Accuracy: {balanced_acc}')
    print('-' * 50)


                                                                                                                       

Epoch 1/10:
Training Loss: 5.117507210681033
Validation Loss: 3.8808722416559855
F1 Score: 0.19573424290374772
Balanced Accuracy: 0.10704944080443266
--------------------------------------------------


                                                                                                                       

Epoch 2/10:
Training Loss: 3.323916776930721
Validation Loss: 2.771032762527466
F1 Score: 0.36362078660379943
Balanced Accuracy: 0.22078864550704902
--------------------------------------------------


                                                                                                                       

Epoch 3/10:
Training Loss: 2.328649739369044
Validation Loss: 2.1659298479557036
F1 Score: 0.47114322473796044
Balanced Accuracy: 0.3244386536328263
--------------------------------------------------




Epoch 4/10:
Training Loss: 1.6736489460771573
Validation Loss: 1.7529956579208374
F1 Score: 0.585653893478821
Balanced Accuracy: 0.4518503914693145
--------------------------------------------------




Epoch 5/10:
Training Loss: 1.2225299327873806
Validation Loss: 1.4976436267296473
F1 Score: 0.629536683443063
Balanced Accuracy: 0.5034422592209454
--------------------------------------------------




Epoch 6/10:
Training Loss: 0.9037067760110676
Validation Loss: 1.369629900654157
F1 Score: 0.6703024451013009
Balanced Accuracy: 0.5782403151818304
--------------------------------------------------




Epoch 7/10:
Training Loss: 0.684348363280573
Validation Loss: 1.288301892951131
F1 Score: 0.6941387532141048
Balanced Accuracy: 0.6150531811770873
--------------------------------------------------




Epoch 8/10:
Training Loss: 0.5369050269920513
Validation Loss: 1.2360669496158758
F1 Score: 0.7109500545154364
Balanced Accuracy: 0.6415063162227869
--------------------------------------------------




Epoch 9/10:
Training Loss: 0.4424993259661045
Validation Loss: 1.1889331533263128
F1 Score: 0.7144773510015616
Balanced Accuracy: 0.6416194302205747
--------------------------------------------------


                                                                                                                       

Epoch 10/10:
Training Loss: 0.39136002734372655
Validation Loss: 1.1835569436351459
F1 Score: 0.7203075191047965
Balanced Accuracy: 0.6496843297745551
--------------------------------------------------




In [23]:
# Save the model's state dict to disk.
# NOTE(review): if the model was wrapped in DataParallel earlier, the saved keys
# presumably carry a "module." prefix. Confirm that the loading code expects this.
torch.save(model.state_dict(), 'Bert_TM_NAICS_model.pth')

In [24]:
# Write each per-epoch history to its own text file, one value per line.
metric_histories = {
    'train_losses_bert_naics.txt': train_losses,
    'valid_losses_bert_naics.txt': valid_losses,
    'f1_scores_bert_naics.txt': f1_scores,
    'balanced_accuracies_bert_naics.txt': balanced_accuracies,
}

for file_name, history in metric_histories.items():
    with open(file_name, 'w') as file:
        file.writelines(f"{item}\n" for item in history)

# Prediction of TM dataset based on NICE classes

In [26]:
# Load the trademark sample to be classified from its Excel workbook.
Sample_TM_800_Unique_Files = pd.read_excel('Sample_TM_800_Unique_Files.xlsx')

In [27]:
# Drop the rows without a TEXT value and renumber the index from 0, so that
# positions and index labels coincide when predictions are attached later.
Sample_TM_800_Unique_Files = Sample_TM_800_Unique_Files.dropna(subset=['TEXT']).reset_index(drop=True)

In [28]:
# Texts to classify, as a plain Python list in row order.
texts = Sample_TM_800_Unique_Files['TEXT'].tolist()

In [29]:
# Tokenize the texts with the tokenizer created for training; the commented-out
# line below would load the same 'bert-base-uncased' tokenizer again.
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
test_encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')

In [30]:
# Select the device again (GPU when available, otherwise CPU) and move the test tensors onto it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
test_encodings = {name: tensor.to(device) for name, tensor in test_encodings.items()}

In [31]:
# Unlabelled dataset for inference. The positional order (input_ids,
# token_type_ids, attention_mask) is the order in which the prediction loop
# unpacks each batch.
test_dataset = TensorDataset(
    *(test_encodings[name] for name in ('input_ids', 'token_type_ids', 'attention_mask'))
)

In [32]:
batch_size = 16

# No shuffling: predictions must come out in the same order as the rows they are
# attached to afterwards.
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [33]:
# Put the model in evaluation mode and collect the predicted class index of every test row.
model.eval()
test_preds = []

with torch.no_grad():  # inference only, no gradients needed
    for input_ids, token_type_ids, attention_mask in test_dataloader:
        batch_logits = model(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        ).logits

        # Predicted class = index of the largest logit in each row.
        test_preds.extend(batch_logits.argmax(dim=1).cpu().numpy())

In [34]:
# Attach the predicted class indices; they are in row order because the test loader is not shuffled.
Sample_TM_800_Unique_Files['naics_label_pred'] = test_preds

In [35]:
def map_values(value):
    """Translate a predicted class index back to its NAICS code.

    Values that are not keys of mapping_NAICS_reverse are returned unchanged.
    """
    if value in mapping_NAICS_reverse:
        return mapping_NAICS_reverse[value]
    return value

# Decode every predicted class index into a NAICS code.
Sample_TM_800_Unique_Files['naics_pred'] = Sample_TM_800_Unique_Files['naics_label_pred'].apply(map_values)

In [37]:
# Load the CSV holding the predictions of the other models (its roberta/bert columns are copied over in the next cell).
Sample_TM_800_Unique_Files_BERT_ROBERTA_NAICS_IG_PREDICT = pd.read_csv('Sample_TM_800_Unique_Files_BERT_ROBERTA_NAICS_IG_PREDICT.csv')

In [38]:
# Copy the stored prediction columns over by position (not by index label).
# NOTE(review): this assumes both tables have the same rows in the same order; confirm.
for column_name in ('industry_group_roberta_pred', 'industry_group_bert_pred', 'naics_roberta_pred'):
    Sample_TM_800_Unique_Files[column_name] = list(
        Sample_TM_800_Unique_Files_BERT_ROBERTA_NAICS_IG_PREDICT[column_name]
    )

In [39]:
# Name the freshly predicted column after the model that produced it.
Sample_TM_800_Unique_Files = Sample_TM_800_Unique_Files.rename(columns={'naics_pred': 'naics_bert_pred'})

In [40]:
# Keep the identifying columns, the two manual NAICS codings, the text and the four model predictions.
Sample_TM_800_Unique_Files_select = Sample_TM_800_Unique_Files.loc[:, [
    'UNIQUE_FILE_NUMBER',
    'NICE_CLASS_CODE',
    'LEGAL_NAME_UPD',
    'NAICS Marianne',
    'NAICS Sebastien',
    'TEXT',
    'industry_group_roberta_pred',
    'industry_group_bert_pred',
    'naics_roberta_pred',
    'naics_bert_pred',
]]

In [41]:
# Export the selected columns to CSV, without the index column.
Sample_TM_800_Unique_Files_select.to_csv('Sample_TM_800_Unique_Files_BERT_ROBERTA_NAICS_IG_PREDICT_FULL.csv', index=False)

In [70]:
# Show rows 10 to 19 (by position) for a manual look at the predictions.
Sample_TM_800_Unique_Files_select.iloc[10:20]

Unnamed: 0,UNIQUE_FILE_NUMBER,NICE_CLASS_CODE,LEGAL_NAME_UPD,NAICS Marianne,NAICS Sebastien,TEXT,industry_group_roberta_pred,industry_group_bert_pred,naics_roberta_pred,naics_bert_pred
10,2204660_0,36,glentel inc,524124.0,524129,Providing extended warranties on cellular phon...,5241,5173,524129,524129
11,2204660_0,37,glentel inc,449211.0,449211,"Developing, supplying and advising with respec...",5179,5182,811210,334511
12,2204660_0,38,glentel inc,513140.0,449211,Provision of information and advice with respe...,5179,5182,334410,334110
13,2204660_0,42,glentel inc,518210.0,449211,Providing technical support services in the fo...,5182,5182,561420,517310
14,2187843_0,36,lussier dale parizeau inc,524111.0,524210,insurance services,5242,5242,524299,524299
15,2171874_0,35,ten tree international inc,813310.0,813310,promotion of the planting of trees and contrib...,5619,5413,541990,541611
16,2136129_0,5,hexo operations inc,312310.0,459993,Nutritional supplements containing cannabis ex...,3123,3123,312310,312310
17,2136129_0,34,hexo operations inc,312310.0,459993,Dried cannabis for recreational use; cannabis ...,3123,3123,312310,312310
18,2151540_0,9,12783185 canada inc,541514.0,518210,Downloadable computer software for accessing d...,3346,5182,334610,513211
19,2151540_0,41,12783185 canada inc,459120.0,449212,Organization of e-sports competitions; enterta...,5121,5162,512110,713299


In [None]:
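# Manual review notes (plain text, not executable; kept as comments so that running the cell does not raise a SyntaxError):
# Roberta = 9(retail),11**(naics),12,18,19,20,22,26,30*,34*,37,162,87**,85**,99**,104**,106,
# bert = 15,19,23,27**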

In [106]:
# Position of the row to inspect in detail.
k = 11
Sample_TM_800_Unique_Files_select.iloc[k]

UNIQUE_FILE_NUMBER                                                     2204660_0
NICE_CLASS_CODE                                                               37
LEGAL_NAME_UPD                                                       glentel inc
NAICS Marianne                                                          449211.0
NAICS Sebastien                                                           449211
TEXT                           Developing, supplying and advising with respec...
industry_group_roberta_pred                                                 5179
industry_group_bert_pred                                                    5182
naics_roberta_pred                                                        811210
naics_bert_pred                                                           334511
Name: 11, dtype: object

In [107]:
# Full, untruncated TEXT of the row with index label k (k is set in the previous cell).
Sample_TM_800_Unique_Files_select['TEXT'][k]

'Developing, supplying and advising with respect to the selection, installation, operation, maintenance, repair, refurbishing and repurchase of wireless communications devices, namely, wireless phones, mobile phones, handheld computers and tablets'

In [114]:
# Industry group to look up.
IG = 5179

# Print the class title of every NAICS row in that industry group. The column is
# filtered once and iterated directly, instead of re-filtering the whole table
# on every loop step.
for class_title in NAICS_2022.loc[NAICS_2022.INDUSTRY_GROUP == IG, 'Class title']:
    print(class_title)

Telecommunications resellers
All other telecommunications


In [115]:
# Print the class definition of every NAICS row in industry group IG (set in the
# previous cell). The column is filtered once and iterated directly, instead of
# re-filtering the whole table on every loop step.
for class_definition in NAICS_2022.loc[NAICS_2022.INDUSTRY_GROUP == IG, 'Class definition']:
    print(class_definition)

This Canadian industry comprises establishments primarily engaged in providing telecommunications and/or video entertainment services over network facilities operated by others.
This Canadian industry comprises establishments primarily engaged in operating telecommunications networks or providing telecommunication services not elsewhere classified


In [116]:
# Print the list of examples of every NAICS row in industry group IG (set in an
# earlier cell). The column is filtered once and iterated directly, instead of
# re-filtering the whole table on every loop step.
for example_list in NAICS_2022.loc[NAICS_2022.INDUSTRY_GROUP == IG, 'EXAMPLE_SPLIT']:
    print(example_list)

['internet service provider (isp), resale', ' long distance telecommunication resellers', ' microwave communications resellers', ' mobile virtual network operators (mvno)', ' non-facilities based internet service provider (isp)', ' resellers, long-distance telephone communications (except satellite)', ' resellers, telephone communications (except satellite)', ' reselling dial-up or broadband internet service provider (isp) services', ' sale of cellular telephone plans using network facilities operated by others', ' telecommunication resellers', ' voice over internet protocol (voip) services, access-independent (non-managed)']
['radar station operation', ' satellite earth stations facilities operators', ' satellite or missile tracking stations, operated on a contract basis', ' satellite telemetry operation on a contract or fee basis', ' satellite terminal stations', ' satellite tracking stations', ' selling of prepaid phone cards', ' selling of prepaid telephone calling cards', ' teleme

In [117]:
# NAICS code to look up.
NAICS = 334511

# Print the class title of every row with that code. The column is filtered once
# and iterated directly, instead of re-filtering the whole table on every loop step.
for class_title in NAICS_2022.loc[NAICS_2022.Code == NAICS, 'Class title']:
    print(class_title)

Navigational and guidance instruments manufacturing


In [118]:
# Print the class definition of every row whose code is NAICS (set in the
# previous cell). The column is filtered once and iterated directly, instead of
# re-filtering the whole table on every loop step.
for class_definition in NAICS_2022.loc[NAICS_2022.Code == NAICS, 'Class definition']:
    print(class_definition)

This Canadian industry comprises establishments primarily engaged in manufacturing navigational and guidance equipment.


In [119]:
# Print the list of examples of every row whose code is NAICS (set in an earlier
# cell). The column is filtered once and iterated directly, instead of
# re-filtering the whole table on every loop step.
for example_list in NAICS_2022.loc[NAICS_2022.Code == NAICS, 'EXAMPLE_SPLIT']:
    print(example_list)



In [140]:
# 1 when BERT's predicted industry group equals Sebastien's industry group, else 0.
# NOTE(review): Sample_TM_800_Unique_Files_select_sebastien and its 'IG_Sebastien'
# column are not created in any visible cell; presumably they come from a removed
# cell. Confirm before running on a fresh kernel.
Sample_TM_800_Unique_Files_select_sebastien['ind_IG_Sebastien_bert_IG'] = (Sample_TM_800_Unique_Files_select_sebastien['IG_Sebastien'] == Sample_TM_800_Unique_Files_select_sebastien['industry_group_bert_pred']).astype(int)

In [144]:
# 1 when the industry group implied by RoBERTa's predicted NAICS code (code // 100)
# equals Sebastien's industry group, else 0.
Sample_TM_800_Unique_Files_select_sebastien['ind_IG_Sebastien_roberta_naics'] = (Sample_TM_800_Unique_Files_select_sebastien['IG_Sebastien'] == Sample_TM_800_Unique_Files_select_sebastien['naics_roberta_pred']//100).astype(int)

In [146]:
# 1 when the industry group implied by BERT's predicted NAICS code (code // 100)
# equals Sebastien's industry group, else 0.
Sample_TM_800_Unique_Files_select_sebastien['ind_IG_Sebastien_bert_naics'] = (Sample_TM_800_Unique_Files_select_sebastien['IG_Sebastien'] == Sample_TM_800_Unique_Files_select_sebastien['naics_bert_pred']//100).astype(int)

In [148]:
# 1 when RoBERTa's predicted NAICS code equals Sebastien's NAICS code (cast to int), else 0.
Sample_TM_800_Unique_Files_select_sebastien['ind_naics_Sebastien_roberta'] = (Sample_TM_800_Unique_Files_select_sebastien['NAICS Sebastien'].astype(int) == Sample_TM_800_Unique_Files_select_sebastien['naics_roberta_pred']).astype(int)

In [150]:
# 1 when BERT's predicted NAICS code equals Sebastien's NAICS code (cast to int), else 0.
Sample_TM_800_Unique_Files_select_sebastien['ind_naics_Sebastien_bert'] = (Sample_TM_800_Unique_Files_select_sebastien['NAICS Sebastien'].astype(int) == Sample_TM_800_Unique_Files_select_sebastien['naics_bert_pred']).astype(int)

In [None]:
# 1 when RoBERTa's predicted industry group equals Sebastien's industry group,
# else 0 (same construction as 'ind_IG_Sebastien_bert_IG').
#
# This never-executed cell used to overwrite 'ind_naics_Sebastien_roberta' (the
# exact-NAICS flag for RoBERTa) with an industry-group comparison of the BERT
# predictions, which would corrupt that column on a top-to-bottom run. Meanwhile
# 'ind_IG_Sebastien_roberta_IG' is read by later cells but is not assigned
# anywhere else in the visible notebook, so it is defined here instead.
Sample_TM_800_Unique_Files_select_sebastien['ind_IG_Sebastien_roberta_IG'] = (Sample_TM_800_Unique_Files_select_sebastien['IG_Sebastien'] == Sample_TM_800_Unique_Files_select_sebastien['industry_group_roberta_pred']).astype(int)

In [153]:
# Fraction of rows whose 'ind_IG_Sebastien_roberta_IG' flag is 1.
# NOTE(review): no executed cell in the visible notebook assigns this column; confirm where it is created.
Sample_TM_800_Unique_Files_select_sebastien.ind_IG_Sebastien_roberta_IG.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.3058252427184466

In [154]:
# Fraction of rows where BERT's predicted industry group equals Sebastien's.
Sample_TM_800_Unique_Files_select_sebastien.ind_IG_Sebastien_bert_IG.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.2912621359223301

In [155]:
# Fraction of rows where the industry group implied by RoBERTa's NAICS prediction equals Sebastien's.
Sample_TM_800_Unique_Files_select_sebastien.ind_IG_Sebastien_roberta_naics.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.3106796116504854

In [156]:
# Fraction of rows where the industry group implied by BERT's NAICS prediction equals Sebastien's.
Sample_TM_800_Unique_Files_select_sebastien.ind_IG_Sebastien_bert_naics.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.3155339805825243

In [157]:
# Fraction of rows where RoBERTa's predicted NAICS code equals Sebastien's code exactly.
Sample_TM_800_Unique_Files_select_sebastien.ind_naics_Sebastien_roberta.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.25728155339805825

In [158]:
# Fraction of rows where BERT's predicted NAICS code equals Sebastien's code exactly.
Sample_TM_800_Unique_Files_select_sebastien.ind_naics_Sebastien_bert.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.25728155339805825

In [159]:
# 1 when the two exact-NAICS hit flags are equal (both models match Sebastien's code,
# or both miss it), else 0. This compares the hit flags, not the predicted codes
# themselves, so two different wrong predictions also yield 1.
Sample_TM_800_Unique_Files_select_sebastien['ind_bert_roberat_naics'] = (Sample_TM_800_Unique_Files_select_sebastien['ind_naics_Sebastien_roberta'] == Sample_TM_800_Unique_Files_select_sebastien['ind_naics_Sebastien_bert']).astype(int)

In [163]:
# Fraction of rows where BERT and RoBERTa have the same exact-NAICS hit flag
# (both match Sebastien's code or both miss it). The flags are compared, not the
# predicted codes, so this is not the rate at which the two models predict the same NAICS.
Sample_TM_800_Unique_Files_select_sebastien.ind_bert_roberat_naics.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.9029126213592233

In [167]:
# The four industry-group hit flags, each compared against Sebastien's industry group.
binary_columns_IG = [
    'ind_IG_Sebastien_roberta_IG',
    'ind_IG_Sebastien_bert_IG',
    'ind_IG_Sebastien_roberta_naics',
    'ind_IG_Sebastien_bert_naics',
]

# 1 when at least one of those flags is set for the row, else 0.
Sample_TM_800_Unique_Files_select_sebastien['indicator_IG'] = (
    Sample_TM_800_Unique_Files_select_sebastien[binary_columns_IG]
    .any(axis='columns')
    .astype(int)
)

In [169]:
# Fraction of rows where at least one of the four industry-group flags in
# binary_columns_IG is 1, i.e. at least one prediction lands in Sebastien's industry group.
Sample_TM_800_Unique_Files_select_sebastien.indicator_IG.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.4174757281553398

In [170]:
# The two exact-NAICS hit flags, each compared against Sebastien's NAICS code.
binary_columns_naics = [
    'ind_naics_Sebastien_roberta',
    'ind_naics_Sebastien_bert',
]

# 1 when at least one of those flags is set for the row, else 0.
Sample_TM_800_Unique_Files_select_sebastien['indicator_naics'] = (
    Sample_TM_800_Unique_Files_select_sebastien[binary_columns_naics]
    .any(axis='columns')
    .astype(int)
)

In [172]:
# Fraction of rows where at least one of the two models (RoBERTa or BERT) predicts
# exactly Sebastien's NAICS code.
Sample_TM_800_Unique_Files_select_sebastien.indicator_naics.sum()/len(Sample_TM_800_Unique_Files_select_sebastien)

0.3058252427184466

In [173]:
#Sample_TM_800_Unique_Files_select_sebastien