In [None]:
!pip3 install datasets transformers

In [None]:
!wget https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.training.txt
!wget https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.test.txt

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
# Read both tab-separated splits; row 0 of each file is used as the header.
train_df = pd.read_csv(
    "germeval2018.training.txt",
    sep="\t",
    header=0,
    encoding="utf-8",
)
test_df = pd.read_csv(
    "germeval2018.test.txt",
    sep="\t",
    header=0,
    encoding="utf-8",
)

In [3]:
# Preview the first rows of the raw training split.
train_df.head(n=5)

Unnamed: 0,text,label,label2
0,"@corinnamilborn Liebe Corinna, wir würden dich...",OTHER,OTHER
1,@Martin28a Sie haben ja auch Recht. Unser Twee...,OTHER,OTHER
2,@ahrens_theo fröhlicher gruß aus der schönsten...,OTHER,OTHER
3,@dushanwegner Amis hätten alles und jeden gewä...,OTHER,OTHER
4,@spdde kein verläßlicher Verhandlungspartner. ...,OFFENSE,INSULT


In [4]:
# Drop the unused secondary label column; only `label` is used below.
# Rebinding instead of inplace=True, plus errors="ignore", keeps this cell
# safe to re-run after the column is already gone.
test_df = test_df.drop(columns=['label2'], errors="ignore")
train_df = train_df.drop(columns=['label2'], errors="ignore")

In [5]:
def clean_text(text):
    """Normalise a pandas Series of raw tweets before tokenization.

    Steps, applied in order: remove ``#`` characters, replace links with the
    token ``URL``, remove ``@`` characters, turn every character outside
    ASCII letters, digits, German umlauts/ß and ``()!?`` into a space, and
    collapse runs of whitespace into one space.

    Args:
        text: ``pandas.Series`` of strings.

    Returns:
        A new ``pandas.Series`` holding the cleaned strings.
    """
    # regex=True is passed explicitly: Series.str.replace treats the pattern
    # as a literal string by default in pandas >= 2.0, which would silently
    # turn the pattern-based steps below into no-ops.
    text = text.str.replace(r"\#", "", regex=True)  # strip hashtag markers
    text = text.str.replace(r"http\S+", "URL", regex=True)  # mask links
    text = text.str.replace("@", "", regex=False)  # strip mention markers
    text = text.str.replace(r"[^A-Za-z0-9öäüÖÄÜß()!?]", " ", regex=True)
    # Raw string so that \s reaches the regex engine as a whitespace class.
    text = text.str.replace(r"\s{2,}", " ", regex=True)
    return text

def convert_label(label):
    """Return 1 when ``label`` equals "OFFENSE", otherwise 0."""
    if label == "OFFENSE":
        return 1
    return 0

In [6]:
# Clean the tweet text and encode the labels as integers for both splits.
for _split_df in (train_df, test_df):
    _split_df["text"] = clean_text(_split_df["text"])
    # Only map labels that are not integers yet: running convert_label over
    # already-encoded 0/1 values would turn every label into 0 when this
    # cell is executed a second time.
    if not pd.api.types.is_integer_dtype(_split_df["label"]):
        _split_df["label"] = _split_df["label"].map(convert_label)

In [7]:
# The training split after cleaning: no URLs and no "@" characters.
train_df.head(n=5)

Unnamed: 0,text,label
0,corinnamilborn Liebe Corinna wir würden dich g...,0
1,Martin28a Sie haben ja auch Recht Unser Tweet ...,0
2,ahrens theo fröhlicher gruß aus der schönsten ...,0
3,dushanwegner Amis hätten alles und jeden gewäh...,0
4,spdde kein verläßlicher Verhandlungspartner Na...,1


In [8]:
# Number of training rows whose label is 1.
int((train_df["label"] == 1).sum())


1688

In [23]:
from datasets import Dataset

# Wrap both pandas frames as `Dataset` objects. The `text` column keeps its
# name because preprocess_function reads examples["text"].
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [24]:
# TODO: move the cleaning process onto the Dataset type.
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 5009
})

In [25]:
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric

# Checkpoint name reused for the tokenizer here and for the classifier later.
# Other candidates noted by the author:
#   "oliverguhr/german-sentiment-bert", "bert-base-german-cased"
model_checkpoint = "distilbert-base-german-cased"

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [26]:
# Quick check of the tokenizer on one German and one English sentence.
tokenizer(
    ["Das ist ein Test.", "And this is a second sentence"],
    truncation=True,
)

{'input_ids': [[102, 347, 215, 143, 4369, 566, 103], [102, 6282, 3070, 1420, 120, 6331, 1249, 6661, 155, 730, 103]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [50]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation on and padding off."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding=False, truncation=True)
    return encoded

In [51]:
# Tokenize both splits in batches (train first, then test).
encoded_train_dataset, encoded_test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [52]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two-class sequence classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

batch_size = 16

args = TrainingArguments(
    "test-offsive-language",  # output directory
    evaluation_strategy = "epoch",
    # Save on the same schedule as evaluation: load_best_model_at_end can only
    # restore a checkpoint that was written at an evaluated step, and newer
    # transformers releases reject mismatched save/eval strategies.
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_steps=500,
    logging_steps=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics.
    metric_for_best_model="f1",
)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias

In [53]:
# Keep the F1 metric under a dedicated name: a later cell rebinds `metric`
# to the GLUE/MRPC metric, and compute_metrics must keep scoring with F1
# if evaluation is run again afterwards.
f1_metric = load_metric('f1')
metric = f1_metric  # original name, kept for cells that still read it

def compute_metrics(eval_pred):
    """Compute binary F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one
            score per class along axis 1 (presumably logits from the Trainer).

    Returns:
        The result of the F1 metric's ``compute`` call for the argmax class
        predictions against ``labels``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return f1_metric.compute(predictions=predicted_classes, references=labels, average="binary")

In [54]:
# Assemble the Trainer. Note that the test split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the classifier with the Trainer built in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.606427,0.636856,0.0


  'precision', 'predicted', average, warn_for)


In [45]:
# Score the current model on the eval_dataset that was handed to the Trainer.
trainer.evaluate()

{'eval_loss': 0.5589212775230408, 'eval_f1': 0.627838104639684, 'epoch': 5.0}

In [None]:
# Display the fine-tuned model. Calling it, as `trainer.model()` did, runs a
# forward pass with no inputs and raises instead of showing the model.
trainer.model

In [None]:
from datasets import list_metrics

# List the metric names known to `datasets`. The count is printed explicitly
# because a bare expression in the middle of a cell is never displayed.
metrics_list = list_metrics()
print(len(metrics_list))
print(', '.join(metrics_list))


In [None]:
# Load the GLUE metric in its MRPC configuration and display it.
# NOTE: this rebinds the notebook-level name `metric`, which an earlier cell also assigns.
metric = load_metric('glue', 'mrpc')
metric

In [None]:
# NOTE(review): no predictions/references are passed here and no visible cell
# adds a batch to this metric beforehand; confirm this call is intended as written.
metric.compute(average="micro")