In [None]:
import pandas as pd 
import numpy as np 
import os
import yaml
from tqdm import tqdm
import re
import json  
PATH_DATA = 'data'


In [None]:

sex_freq = pd.read_csv(os.path.join(PATH_DATA,'firstname_with_sex.csv'),sep = ';')
sex_data = pd.read_csv(os.path.join(PATH_DATA,'transcriptions_with_sex.csv'),sep = ',')

In [None]:

def read_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data
    
    
def read_yaml(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def transform_tags(tags):
    tags_clean = {}
    for col in tags :
        tag = tags[col]['start']
        tags_clean[tag] = col
    return tags_clean




def parse_observation(observation, tags):
    parts = re.split('(' + '|'.join(re.escape(key) for key in tags.keys()) + ')', observation)
    parts = [part for part in parts if part]
    result = {col: None for col in tags.values()}  # Initialize all columns with None
    
    tag = None  # Keep track of the current tag
    for part in parts:
        if part in tags:  # If the part is a tag
            tag = tags[part]
        elif tag:  # If the part is not a tag, it is a value for the current tag
            result[tag] = part
            tag = None  # Reset tag for the next iteration
            
    return result


def parse_page(page, tags,name_page):
    page_parsed = page.split('\n')
    all = []
    for obs in page_parsed:
        res = parse_observation(obs,tags)
        all.append(res)
    all  = pd.DataFrame(all)
    all['name_page'] = name_page
    all = all.set_index('name_page',append=True)
    all = all.swaplevel()
    all.index.names = ['page','id']
    return all


def parse_all_pages(data,tags):
    
    results =[]
    for page in tqdm(list(data.keys())) :
        name_page = page.split('-')[-1].replace('.jpg','')
        page_clean = parse_page(data[page],tags,name_page)
        results.append(page_clean)
        
    return pd.concat(results,axis=0).dropna(axis=0,how = 'all').dropna(axis=1,how = 'all')

all_pages = read_json('data/entities.json')
tags = read_yaml('data/tokens.yml')
tags_clean = transform_tags(tags)
all_pages_clean = parse_all_pages(all_pages,tags_clean).dropna(axis=0,how = 'all')
all_pages_clean

In [None]:
gender_map = {
    'Garçon': 'homme',
    'Garçon ': 'homme',
    'Homme marié': 'homme',
    'Homme marié ': 'homme',
    'Veuf': 'homme',
    'Veuf ': 'homme',
    'Fille': 'femme',
    'Fille ': 'femme',
    'Femme mariée': 'femme',
    'Femme mariée ': 'femme',
    'Veuve': 'femme',
    'Veuve ': 'femme',
    None: None
}
all_pages_clean['sex_clean'] = all_pages_clean['civil_status'].map(gender_map)

In [None]:
### certains prenoms ont les trois prénoms, garder que le premier 
all_pages_clean['firstname'] = all_pages_clean['firstname'].apply(lambda x : x.split(' ')[0] if x else None)
all_pages_clean['firstname'] = all_pages_clean['firstname'].apply(lambda x : x.split('-')[0] if x else None)
all_pages_clean['firstname'] = all_pages_clean['firstname'].apply(lambda x : x.replace('ï','i') if x else None)
all_pages_clean['firstname_lower'] = all_pages_clean['firstname'].apply(lambda x : str.lower(x) if x else None)
all_pages_clean['firstname_lower']= all_pages_clean['firstname_lower'].apply(lambda x : x.replace(' ','') if x else None)


In [None]:
name_freq = pd.read_csv(os.path.join(PATH_DATA,'firstname_with_sex.csv'), sep = ';')
name_freq['total'] = name_freq[['male','female']].sum(1)
name_freq['freq_male'] = name_freq['male'] / name_freq['total']
name_freq['freq_female'] = name_freq['female'] / name_freq['total']
## ni link, ni prenom, ni civil_status, ni sex
to_drop = all_pages_clean['sex_clean'].isna() & (all_pages_clean['link'].isna()) & (all_pages_clean['firstname_lower'].isna())
all_pages_clean = all_pages_clean[~to_drop]



def forward_fill_idem(data,column):
    
    data[column] = data[column].replace({'idem ': 'idem'})
    data[column] = data[column].replace({None:0})
    data[column] = data[column].replace({'idem':None})
    data[column] = data[column].ffill()
    data[column] = data[column].replace({0:None})
    return data 


# test = forward_fill_idem(all_pages_clean,'link')
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'link'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'nationality'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'occupation'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'surname'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'surname_household'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'employer'))

In [None]:
clean_data  = all_pages_clean.merge(name_freq.set_index('firstname')[['freq_male','freq_female']],left_on='firstname_lower',right_index=True,how = 'left')

In [None]:

from wordcloud import WordCloud
from stop_words import get_stop_words
import matplotlib.pyplot as plt
french_stopwords = get_stop_words('french')
text = ' '.join(clean_data['link'].dropna())
wordcloud = WordCloud(stopwords=french_stopwords, background_color="white").generate(text)

plt.figure(figsize=(8, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Model Bert

In [None]:
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
def encode_label(x):
    
    if x :
        if x =='homme':
            return 1
        if x == 'femme':
            return 0
    else :
        return None

def encode_freq(x):
    
    if x > 0.5 :
        return 'prenom homme'
    elif x <=0.5:
        return 'prenom femme'
    else :
        return None

In [608]:
data_model = clean_data.copy()
data_model['target'] = data_model['sex_clean'].apply(encode_label)
data_model['freq_name_class'] = data_model['freq_male'].apply(encode_freq)
data_model = data_model[data_model['sex_clean'].notna()]
features = ['civil_status','firstname_lower','link','occupation','freq_name_class']

data_model = pd.DataFrame(data_model[features].fillna('').apply(lambda row: ' '.join(row.values.astype(str)), axis=1)).join(data_model['target']).reset_index(drop=True)
data_model.columns = ['TITLE','ENCODE_CAT']
data_model

Unnamed: 0,TITLE,ENCODE_CAT
0,Garçon cyrille menuisier prenom homme,1.0
1,Garçon auguste vitrier prenom homme,1.0
2,Garçon pierre vitrier prenom homme,1.0
3,Homme marié alexandre prop re prenom homme,1.0
4,Fille caroline domestique prenom femme,0.0
...,...,...
10700,Veuve thérèse prop re cultivatrice prenom femme,0.0
10701,Garçon augustin son fils même profession pre...,1.0
10702,Garçon jean sabotier prenom homme,1.0
10703,Homme marié pierre prop re cultivateur journa...,1.0


In [609]:

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [610]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [611]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.TITLE[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [612]:
train_size = 0.8
train_dataset=data_model.sample(frac=train_size,random_state=200)
test_dataset=data_model.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

print("FULL Dataset: {}".format(data_model.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (10705, 2)
TRAIN Dataset: (8564, 2)
TEST Dataset: (2141, 2)


In [639]:
import torch
from transformers import DistilBertModel

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
    
    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Assume 'device' is defined (e.g., 'cuda' for GPU or 'cpu' for CPU)
model = DistillBERTClass()
model.to(device)

# For binary classification, use the sigmoid activation function at the output layer during inference,
# not directly in the model's forward method. You can apply it after getting the logits from the model.
# This is especially useful when you're using BCEWithLogitsLoss, as it expects logits.

# Example usage during training
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [640]:


def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct


def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs.squeeze(),targets.float())
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 


for epoch in range(EPOCHS):
    train(epoch)

RuntimeError: all elements of input should be between 0 and 1

## version kaggle

https://www.kaggle.com/code/lalwaniabhishek/using-bert-for-binary-text-classification

In [658]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data_model, test_size=0.2, random_state=42)

In [659]:
train_df_bert = pd.DataFrame({
    'id':range(len(train_df)),
    'label':train_df['ENCODE_CAT'],
    'alpha':['a']*train_df.shape[0],
    'text': train_df['TITLE'].replace(r'\n', ' ', regex=True)
})

test_df_bert = pd.DataFrame({
    'id':range(len(test_df)),
    'label':test_df['ENCODE_CAT'],
    'alpha':['a']*test_df.shape[0],
    'text': test_df['TITLE'].replace(r'\n', ' ', regex=True)
})
#Getting the relevant data files
train_df_bert.to_csv('data/train.tsv', sep='\t', index=False, header=False)
test_df_bert.to_csv('data/test.tsv', sep='\t', index=False, header=False)
print('Training and testing files are generated')

Training and testing files are generated


In [660]:
from __future__ import absolute_import, division, print_function

import csv
import os
import sys

csv.field_size_limit(2147483647) # Increase CSV reader's field limit incase we have long text.


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class BinaryClassificationProcessor(DataProcessor):
    """Processor for binary classification dataset."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
#Getting some helper functions and classes
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def convert_example_to_feature(example_row):
    # return example_row
    example, label_map, max_seq_length, tokenizer, output_mode = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)

In [661]:
#Instantiating model parameters and directories
DATA_DIR = 'data/'
BERT_MODEL = 'bert-base-cased'
TASK_NAME = 'imdb'
OUTPUT_DIR = f'outputs/{TASK_NAME}/'
REPORTS_DIR = f'reports/{TASK_NAME}_evaluation_report/'
CACHE_DIR = 'cache/'
MAX_SEQ_LENGTH = 256

TRAIN_BATCH_SIZE = 24
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 10
RANDOM_SEED = 42
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_PROPORTION = 0.1
OUTPUT_MODE = 'classification'

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"
output_mode = OUTPUT_MODE
cache_dir = CACHE_DIR

if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
        REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
        os.makedirs(REPORTS_DIR)
if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)
    
#Making the required directories
if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(OUTPUT_DIR))
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    

#Using a processor to read the data from training file
processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
train_examples_len = len(train_examples)
print(len(train_examples))

8564


In [None]:
name_freq = pd.read_csv(os.path.join(PATH_DATA,'firstname_with_sex.csv'), sep = ';')
name_freq['total'] = name_freq[['male','female']].sum(1)
name_freq['freq_male'] = name_freq['male'] / name_freq['total']
name_freq['freq_female'] = name_freq['female'] / name_freq['total']
## ni link, ni prenom, ni civil_status, ni sex
to_drop = all_pages_clean['sex_clean'].isna() & (all_pages_clean['link'].isna()) & (all_pages_clean['firstname_lower'].isna())
all_pages_clean = all_pages_clean[~to_drop]



def forward_fill_idem(data,column):
    
    data[column] = data[column].replace({'idem ': 'idem'})
    data[column] = data[column].replace({None:0})
    data[column] = data[column].replace({'idem':None})
    data[column] = data[column].ffill()
    data[column] = data[column].replace({0:None})
    return data 


# test = forward_fill_idem(all_pages_clean,'link')
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'link'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'nationality'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'occupation'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'surname'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'surname_household'))
all_pages_clean= all_pages_clean.groupby(level=0,as_index=False,group_keys=False).apply(lambda x : forward_fill_idem(x,'employer'))

In [None]:
clean_data  = all_pages_clean.merge(name_freq.set_index('firstname')[['freq_male','freq_female']],left_on='firstname_lower',right_index=True,how = 'left')

In [None]:

from wordcloud import WordCloud
from stop_words import get_stop_words
import matplotlib.pyplot as plt
french_stopwords = get_stop_words('french')
text = ' '.join(clean_data['link'].dropna())
wordcloud = WordCloud(stopwords=french_stopwords, background_color="white").generate(text)

plt.figure(figsize=(8, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Model Bert

In [None]:
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
def encode_label(x):
    
    if x :
        if x =='homme':
            return 1
        if x == 'femme':
            return 0
    else :
        return None

def encode_freq(x):
    
    if x > 0.5 :
        return 'prenom homme'
    elif x <=0.5:
        return 'prenom femme'
    else :
        return None

In [None]:
data_model = clean_data.copy()
data_model['target'] = data_model['sex_clean'].apply(encode_label)
data_model['freq_name_class'] = data_model['freq_male'].apply(encode_freq)
data_model = data_model[data_model['sex_clean'].notna()]
features = ['civil_status','firstname_lower','link','occupation','freq_name_class']

data_model = pd.DataFrame(data_model[features].fillna('').apply(lambda row: ' '.join(row.values.astype(str)), axis=1)).join(data_model['target']).reset_index(drop=True)
data_model.columns = ['TITLE','ENCODE_CAT']
data_model

Unnamed: 0,TITLE,ENCODE_CAT
0,Garçon cyrille menuisier prenom homme,1.0
1,Garçon auguste vitrier prenom homme,1.0
2,Garçon pierre vitrier prenom homme,1.0
3,Homme marié alexandre prop re prenom homme,1.0
4,Fille caroline domestique prenom femme,0.0
...,...,...
10700,Veuve thérèse prop re cultivatrice prenom femme,0.0
10701,Garçon augustin son fils même profession pre...,1.0
10702,Garçon jean sabotier prenom homme,1.0
10703,Homme marié pierre prop re cultivateur journa...,1.0


In [None]:

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.TITLE[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [None]:
train_size = 0.8
train_dataset=data_model.sample(frac=train_size,random_state=200)
test_dataset=data_model.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

print("FULL Dataset: {}".format(data_model.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (10705, 2)
TRAIN Dataset: (8564, 2)
TEST Dataset: (2141, 2)


In [None]:
import torch
from transformers import DistilBertModel

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
    
    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Assume 'device' is defined (e.g., 'cuda' for GPU or 'cpu' for CPU)
model = DistillBERTClass()
model.to(device)

# For binary classification, use the sigmoid activation function at the output layer during inference,
# not directly in the model's forward method. You can apply it after getting the logits from the model.
# This is especially useful when you're using BCEWithLogitsLoss, as it expects logits.

# Example usage during training
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:


def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct


def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs.squeeze(),targets.float())
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 


for epoch in range(EPOCHS):
    train(epoch)

RuntimeError: all elements of input should be between 0 and 1

In [None]:
# for _,data in enumerate(training_loader, 0):
#     ids = data['ids'].to(device, dtype = torch.long)
#     mask = data['mask'].to(device, dtype = torch.long)
#     targets = data['targets'].to(device, dtype = torch.long)
    
outputs = model(ids, mask)

In [None]:
loss_function(outputs.squeeze(),targets.float())

tensor(1.0892, grad_fn=<BinaryCrossEntropyBackward0>)