In [51]:
# code aangepast van https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=Dg82ndBA5xlN

In [59]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [60]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cpu


In [61]:
# gebruik pretrained model (cased!): https://huggingface.co/GroNLP/bert-base-dutch-cased
from transformers import AutoTokenizer, AutoModel, TFAutoModel

tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
model = AutoModel.from_pretrained("GroNLP/bert-base-dutch-cased")  # PyTorch

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stre

In [62]:
# inladen data
dataset_path = "~/share/Milena/programma-discriminatie-en-racisme/datasets/"
dataset_zoek = pd.read_csv(dataset_path + 'dataset_met_zoekwoorden.csv')[0:400] # model is te groot om te trainen met volledige dataset zonder GPU

In [63]:
# hoe veel komt discriminatie voor in de dataset?
print(dataset_zoek.columns)

dataset_zoek['discriminatie (ja/nee)?'].value_counts()

Index(['Unnamed: 0', 'bestandsmap', 'bestandsnaam', 'pagina', 'paragraaf',
       'text', 'discriminatie (ja/nee)?', 'aantal discriminerende woorden',
       'discriminerende woorden'],
      dtype='object')


0    394
1      6
Name: discriminatie (ja/nee)?, dtype: int64

In [64]:
# tokenization, padding, masking
# LET OP: TRUNCATION = TRUE KORT DE ZINNEN IN TOT MAX. 512 TEKENS. NOG BESPREKEN
tokenized = dataset_zoek['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation = True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

np.array(padded).shape

attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(400, 512)

In [65]:
# trainen
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
    
features = last_hidden_states[0][:,0,:].numpy()

labels = dataset_zoek['discriminatie (ja/nee)?']

In [66]:
# split dataset in train, test
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [67]:
# trainen model (logistic regression)
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
# testen (logistic model)
lr_clf.score(test_features, test_labels)

0.98

In [77]:
# sentiment analysis
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("wietsedv/bert-base-dutch-cased-finetuned-sentiment")
model = AutoModel.from_pretrained("wietsedv/bert-base-dutch-cased-finetuned-sentiment")  # PyTorch

Some weights of the model checkpoint at wietsedv/bert-base-dutch-cased-finetuned-sentiment were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [73]:
print(dataset_zoek['text'])

0      TER ONDERTEKENING                             ...
1      Datum                                         ...
2                                             Referentie
3      Bijlagen                                      ...
4      Intern OCW afgestemd                          ...
                             ...                        
395    3 Digitaal afstandsonderwijs Informatie voor o...
396         1. Wat kan ik van de     school verwachten?
397    Als uw kind tijdelijk niet of minder naar scho...
398    Hulp op maat De eerste stap die de school moet...
399    Regels voor digitaal afstandsonderwijs Digitaa...
Name: text, Length: 400, dtype: object


In [89]:
pipe = pipeline("text-classification", model="wietsedv/bert-base-dutch-cased-finetuned-sentiment", return_all_scores=True)

for i in range(10, 30):
    print(pipe(dataset_zoek.loc[i, 'text']))
    print(dataset_zoek.loc[i, 'text'])

[[{'label': 'neg', 'score': 0.9999552965164185}, {'label': 'pos', 'score': 4.473637090995908e-05}]]
realiseren van een bestendige studiekeuze en het voorkomen van uitval en studiewissel. De insteek is dat lopende oriëntatietrajecten met relatief kleine aanpassingen aan de wettelijke vereisten kunnen voldoen en dus in ieder geval gedurende de pilot inhoudelijk doorgang kunnen vinden. Deze aanpassingen zitten dan veelal op het vlak van de toepassing van het toelatingsrecht en het opleidingsdomein als inschrijfpositie.
[[{'label': 'neg', 'score': 0.9998977184295654}, {'label': 'pos', 'score': 0.00010232756903860718}]]
Pagina 2 van 2
[[{'label': 'neg', 'score': 0.004039074759930372}, {'label': 'pos', 'score': 0.9959608912467957}]]
TER ONDERTEKENING                                                                           Hoger Onderwijs en                                                                                             Studiefinanciering Aan: MOCW                                

NameError: name 'question' is not defined

NameError: name 'predict' is not defined

TypeError: cannot use a string pattern on a bytes-like object