In [None]:
#import

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score

In [None]:
#creating dataset, cleaning 'NaN' values and dropping subject_line and ambiguous individuals

path_file = 'transcriptions_with_sex.csv'
data = pd.read_csv(path_file)
data = data.dropna()
data = data.drop('subject_line', axis=1)
data = data[data['sex'] != 'ambigu']

In [None]:
#labelling men with '0' and women with '1'

def labelization(x):
    y = 0
    if x == 'homme':
        y = 0
    if x == 'femme':
        y = 1
    return y
data['labels'] = data['sex'].apply(labelization)

In [None]:
#splitting dataset between training (80%) and testing (20%)

df_train, df_test = train_test_split(data, test_size=0.2, random_state=42)

In [36]:
#converting datasets to list

text_train_1 = df_train['groundtruth'].to_list()
text_train_2 = df_train['prediction'].to_list()
labels_train = df_train['labels'].to_list()

text_test_1 = df_test['groundtruth'].to_list()
text_test_2 = df_test['prediction'].to_list()
labels_test = df_test['labels'].to_list()

In [3]:
#setting up the tokenizer

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [37]:
#encoding the text data from 'groundtruth'

train_encodings = tokenizer(text_train_1, truncation=True, padding=True)
test_encodings = tokenizer(text_test_1, truncation=True, padding=True)

In [38]:
#encoding the text data from 'prediction'

train_encodings_p = tokenizer(text_train_2, truncation=True, padding=True)
test_encodings_p = tokenizer(text_test_2, truncation=True, padding=True)

In [39]:
#converting the data into data well-tuned for torch

class dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = dataset(train_encodings, labels_train)
test_dataset = dataset(test_encodings, labels_test)


train_dataset_p = dataset(train_encodings, labels_train)
test_dataset_p = dataset(test_encodings, labels_test)

In [40]:
#setting up the model and training it on 'groundtruth'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [41]:
#setting up the model and training it on 'prediction'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_p = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model_p.to(device)
model_p.train()

train_loader = DataLoader(train_dataset_p, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model_p.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [42]:
#testing the accuracy of 'groundtruth' model

test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1)

        all_predictions.extend(predictions.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9524


In [67]:
#testing the accuracy of 'prediction' model

test_loader = DataLoader(test_dataset_p, batch_size=16, shuffle=False)

all_predictions = []
all_labels = []

with torch.no_grad(): 
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_p(input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1)

        all_predictions.extend(predictions.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.6667


In [57]:
#creating a dataset containing all the first names in french population and their ratio in the population

= 'firstname_with_sex.csv'
data_2 = pd.read_csv(path_file_2,sep = ';')
data_h = data_2.drop('female', axis=1)
data_f = data_2.drop('male', axis=1)
data_h = data_h.sort_values(by='male', ascending=False)
data_f = data_f.sort_values(by='female', ascending=False)
total_h = data_h['male'].sum()
total_f = data_f['female'].sum()
data_h['male'] = data_h['male'].apply(lambda x : x/total_h)
data_f['female'] = data_f['female'].apply(lambda x : x/total_f)

In [62]:
#creating a dataset with 5000 french male first names (probability = ratio in the population)

test_data_h = []
for i in range(5000):
    selected_name_h = np.random.choice(data_h['firstname'], p=data_h['male'])
    test_data_h.append('Firstname : ' + selected_name_h)


labels_h = [0 for i in range(len(test_data_h))]
test_encodings_data_h = tokenizer(test_data_h, truncation=True, padding=True)
test_dataset_h = dataset(test_encodings_data_h,labels_h)

print(test_data_h)


['Firstname : michel', 'Firstname : pierre', 'Firstname : rené', 'Firstname : henri', 'Firstname : louis', 'Firstname : louis', 'Firstname : jean', 'Firstname : jean', 'Firstname : jacques', 'Firstname : auguste', 'Firstname : sébastien', 'Firstname : jean', 'Firstname : alexandre', 'Firstname : silvain', 'Firstname : jacques', 'Firstname : rené', 'Firstname : louis', 'Firstname : pierre', 'Firstname : jean', 'Firstname : henri', 'Firstname : joseph', 'Firstname : prix', 'Firstname : henri', 'Firstname : désiré', 'Firstname : françois', 'Firstname : auguste', 'Firstname : joseph', 'Firstname : alexis', 'Firstname : hyppolite', 'Firstname : pierre', 'Firstname : jean', 'Firstname : louis', 'Firstname : joseph', 'Firstname : julien', 'Firstname : auguste', 'Firstname : sébastien', 'Firstname : louis', 'Firstname : georges', 'Firstname : antoine', 'Firstname : yves', 'Firstname : georges', 'Firstname : ulysse', 'Firstname : françois', 'Firstname : françois', 'Firstname : yves', 'Firstname

In [71]:
#creating a dataset with 5000 french female first names (probability = ratio in the population)

test_data_f = []
for i in range(5000):
    selected_name_f = np.random.choice(data_f['firstname'], p=data_f['female'])
    test_data_f.append('Firstname : ' + selected_name_f)

labels_f = [1 for i in range(len(test_data_f))]
test_encodings_data_f = tokenizer(test_data_f, truncation=True, padding=True)
test_dataset_f = dataset(test_encodings_data_f,labels_f)

print(test_data_f)

['Firstname : elisabeth', 'Firstname : catherine', 'Firstname : marguerite', 'Firstname : madeleine', 'Firstname : marguerite', 'Firstname : perrine', 'Firstname : marie', 'Firstname : jeanne', 'Firstname : joséphine', 'Firstname : thérèse', 'Firstname : jacquette', 'Firstname : petronille', 'Firstname : renée', 'Firstname : catherine', 'Firstname : céline', 'Firstname : suzanne', 'Firstname : clémence', 'Firstname : marguerite', 'Firstname : anne', 'Firstname : ambroise', 'Firstname : marguerite', 'Firstname : amélie', 'Firstname : françoise', 'Firstname : victorine', 'Firstname : joséphine', 'Firstname : nathalie', 'Firstname : jeanne', 'Firstname : marie', 'Firstname : anne', 'Firstname : louise', 'Firstname : catherine', 'Firstname : jeanne', 'Firstname : monique', 'Firstname : magdeleine', 'Firstname : émilie', 'Firstname : anne', 'Firstname : marie', 'Firstname : solange', 'Firstname : jeanne', 'Firstname : antoinette', 'Firstname : marie', 'Firstname : léonarde', 'Firstname : je

In [70]:
#testing accuracy of 'prediction' model on 5000 male firstnames

test_loader = DataLoader(test_dataset_h, batch_size=16, shuffle=False)

all_predictions = []
all_labels = []

with torch.no_grad(): 
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1)

        all_predictions.extend(predictions.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9334


In [72]:
#testing accuracy of 'prediction' model on 5000 female firstnames

test_loader = DataLoader(test_dataset_f, batch_size=16, shuffle=False)

all_predictions = []
all_labels = []

with torch.no_grad(): 
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = outputs.logits.argmax(dim=-1)

        all_predictions.extend(predictions.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9866
