**Install the libraries**

In [4]:
!pip install transformers==4.28.0



In [5]:
from collections import defaultdict
import gdown
import gzip
import json
import random
import pickle

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

In [6]:
#import the libraries
import numpy as np
import pandas as pd

In [7]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset file can be read from it in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Read the tab-separated training corpus from the mounted Drive folder.
# Lines that cannot be parsed are skipped instead of raising an error.
italiantrainingSet = pd.read_csv(
    'drive/MyDrive/italianOriginal.tsv',
    sep='\t',
    on_bad_lines='skip',
)

In [9]:
# Discard the two columns this notebook does not use.
italiantrainingSet = italiantrainingSet.drop(columns=['id', 'stereotype'])

In [10]:
# Rename the text column to 'Tweet' and the hs column to 'Label'.
# The lookup key 'text ' is written with a trailing space.
italiantrainingSet = italiantrainingSet.rename(
    columns={'text ': 'Tweet', 'hs': 'Label'}
)

In [11]:
import re

In [12]:
# remove_urls strips URLs from the tweets; the cleaned text is stored in a separate column
def remove_urls(text):
    """Return ``text`` with every URL-like token removed.

    A URL-like token is anything that starts with ``http://``, ``https://``
    or ``www.`` and runs up to the next whitespace character. Whitespace
    surrounding a removed token is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Keep the raw tweets and store the URL-free version in its own column.
raw_tweets = italiantrainingSet["Tweet"]
italiantrainingSet["tweet_without_url"] = raw_tweets.apply(remove_urls)

In [13]:
italiantrainingSet.head()

Unnamed: 0,Tweet,Label,tweet_without_url
0,"È terrorismo anche questo, per mettere in uno ...",0,"È terrorismo anche questo, per mettere in uno ..."
1,@user @user infatti finché ci hanno guadagnato...,0,@user @user infatti finché ci hanno guadagnato...
2,"Corriere: Tangenti, Mafia Capitale dimenticata...",0,"Corriere: Tangenti, Mafia Capitale dimenticata..."
3,"@user ad uno ad uno, perché quando i migranti ...",0,"@user ad uno ad uno, perché quando i migranti ..."
4,Il divertimento del giorno? Trovare i patrioti...,0,Il divertimento del giorno? Trovare i patrioti...


In [14]:
italiantrainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6837 entries, 0 to 6836
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Tweet              6837 non-null   object
 1   Label              6837 non-null   int64 
 2   tweet_without_url  6837 non-null   object
dtypes: int64(1), object(2)
memory usage: 160.4+ KB




**The Italian Corpus has now been loaded, and unneeded columns have been removed**
It contains 6837 tweets, each labelled 0 or 1 (0 means no hate speech, 1 means hate speech).





In [15]:
# .eq(1) marks every row whose Label is 1 with True; .sum() counts those True values.
count = italiantrainingSet['Label'].eq(1).sum()

print("Number of rows with value 1 in 'Label' column:", count)

Number of rows with value 1 in 'Label' column: 2766


In [16]:
# .eq(0) marks every row whose Label is 0 with True; .sum() counts those True values.
countZero = italiantrainingSet['Label'].eq(0).sum()

print("Number of rows with value 0 in 'Label' column:", countZero)

Number of rows with value 0 in 'Label' column: 4071


In [17]:
italiantrainingSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6837 entries, 0 to 6836
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Tweet              6837 non-null   object
 1   Label              6837 non-null   int64 
 2   tweet_without_url  6837 non-null   object
dtypes: int64(1), object(2)
memory usage: 160.4+ KB


In [18]:
# Model inputs: the URL-free tweet texts as a plain Python list.
X = italiantrainingSet["tweet_without_url"].tolist()

In [19]:
# Targets: the 0/1 labels as a plain Python list, aligned with X.
y = italiantrainingSet['Label'].tolist()

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [21]:
# Alias the raw text and label splits under descriptive names.
train_texts, train_labels = X_train, y_train
test_texts, test_labels = X_test, y_test

In [22]:
len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(5469, 5469, 1368, 1368)

In [23]:
train_labels[0], train_texts[0]

(0,
 "#Ungheria dà 900mln e riceve 4,5mld da #Ue (3,2% Pil). Soldi anche di contribuenti italiani. #Orban, l'amico di #Salvini, si rifiuta di accogliere anche un solo migrante dall'#Italia.   ✂ Tagliamo i fondi ai Paesi che non rispettano gli impegni!   #sfidalacorrente #PiùEuropa2019 URL ")

In [24]:
# Fit the TF-IDF vectorizer on the training tweets only, then apply that same
# fitted vectorizer to the test tweets.
# Note: X_train / X_test are rebound here from the raw text lists to the
# vectorized features; the raw texts remain available as train_texts / test_texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [25]:
#import svm model and metrics
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [26]:

# SVM baseline: a linear-kernel support vector classifier trained on the
# TF-IDF features and used to predict labels for the held-out test split.
# `gamma` is deliberately not passed: it is the coefficient of the
# rbf/poly/sigmoid kernels and has no effect on a linear kernel.
model = SVC(kernel='linear')
model.fit(X_train, train_labels)
predictions = model.predict(X_test)

In [27]:
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.81      0.82      0.81       812
           1       0.73      0.71      0.72       556

    accuracy                           0.78      1368
   macro avg       0.77      0.77      0.77      1368
weighted avg       0.78      0.78      0.78      1368



In [28]:
#calculate F1 score
from sklearn.metrics import f1_score
f1_score(test_labels, predictions)

0.7208029197080292

In [29]:
from sklearn.metrics import matthews_corrcoef
#calculate Matthews correlation coefficient
matthews_corrcoef(test_labels, predictions)

0.5344423635390319

Now we load the Italian BERT model, fine-tune it on our Italian data, and make predictions with it.

In [30]:
# Hugging Face Hub identifier of the pre-trained Italian BERT checkpoint to use.
model_name = 'dbmdz/bert-base-italian-cased'

# Device the model is moved to. Use the GPU ('cuda') when one is available and
# fall back to the CPU otherwise, so the notebook also runs on a CPU-only runtime.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

# Maximum number of tokens in any document sent to BERT.
max_length = 512

# Directory the trained model is saved to. The name itself is arbitrary.
cached_model_directory_name = 'Masteritalianbert-saved'

In [31]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load the tokenizer from the checkpoint named in `model_name` rather than from
# a second hard-coded string, so the tokenizer and the model loaded from
# `model_name` always refer to the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

In [32]:
train_labels

[0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,


In [33]:
# Build the two-way mapping between the dataset's label values and the
# contiguous integer class ids (0, 1, ...) handed to the model.
# The labels are sorted before numbering so the mapping does not depend on
# set iteration order and is identical on every run.
unique_labels = set(train_labels)
label2id = {label: class_id for class_id, label in enumerate(sorted(unique_labels))}
id2label = {class_id: label for label, class_id in label2id.items()}

In [34]:
label2id.keys()

dict_keys([0, 1])

In [35]:
id2label.keys()

dict_keys([0, 1])

In [36]:
# Tokenize both splits with the same options: truncation and padding enabled,
# with at most max_length tokens per text.
tokenizer_options = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

# Translate each label value into its class id via label2id.
train_labels_encoded = [label2id[label] for label in train_labels]
test_labels_encoded = [label2id[label] for label in test_labels]

In [37]:
set(train_labels_encoded)

{0, 1}

In [38]:
set(test_labels_encoded)

{0, 1}

In [39]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence of
    values; ``labels`` is a sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [40]:
train_dataset = MyDataset(train_encodings, train_labels_encoded)
test_dataset = MyDataset(test_encodings, test_labels_encoded)

In [41]:
# Load the pre-trained checkpoint for sequence classification, with num_labels
# set to the number of distinct labels, then move it to the configured device.
# model_name must name the same checkpoint the tokenizer was loaded from.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
)
model = model.to(device_name)

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-italian-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model c

In [42]:
# Fine-tuning configuration handed to the Trainer.
training_args = TrainingArguments(
    # Where results and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=2e-5,               # initial learning rate
    warmup_steps=100,                 # kept low because the dataset is small
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    # Log every 100 steps (kept low because the dataset is small) and evaluate
    # during fine-tuning so that progress is visible.
    logging_steps=100,
    evaluation_strategy='steps',
)

In [43]:
def compute_metrics(pred):
    """Compute the evaluation metrics reported during training.

    ``pred`` provides ``predictions`` and ``label_ids``. The predicted class
    of each example is the argmax of ``predictions`` over the last axis.
    Returns a dict with a single ``'accuracy'`` entry.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [44]:
# Wire the model, the training configuration and the datasets into a Trainer.
# The test set doubles as the evaluation set here (usually this would be a
# separate validation set); compute_metrics supplies the custom metric.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [45]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
100,0.6687,0.609027,0.649854
200,0.636,0.555596,0.696637
300,0.5467,0.515232,0.741959
400,0.5475,0.499811,0.756579
500,0.5002,0.492513,0.756579
600,0.4769,0.46586,0.780702
700,0.4959,0.478357,0.784357
800,0.4259,0.567841,0.733918
900,0.3969,0.491211,0.78655
1000,0.3744,0.486912,0.789474


TrainOutput(global_step=1368, training_loss=0.47404676710653026, metrics={'train_runtime': 630.8979, 'train_samples_per_second': 17.337, 'train_steps_per_second': 2.168, 'total_flos': 1000522954663920.0, 'train_loss': 0.47404676710653026, 'epoch': 2.0})

In [46]:
trainer.save_model(cached_model_directory_name)

In [47]:
trainer.evaluate()

{'eval_loss': 0.47830620408058167,
 'eval_accuracy': 0.7945906432748538,
 'eval_runtime': 19.5282,
 'eval_samples_per_second': 70.052,
 'eval_steps_per_second': 3.533,
 'epoch': 2.0}

In [48]:
predicted_results = trainer.predict(test_dataset)

In [49]:
predicted_results.predictions.shape

(1368, 2)

In [50]:
# For every test example take the index of the largest score, as a flat Python list.
predicted_ids = predicted_results.predictions.argmax(-1).flatten().tolist()
# Map the class ids back to the dataset's original label values.
predicted_labels = [id2label[class_id] for class_id in predicted_ids]

In [51]:
len(predicted_labels)

1368

**Print the results for ItalianBERT + Italian Dataset**

In [52]:
print(classification_report(test_labels,
                            predicted_labels))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       812
           1       0.76      0.71      0.74       556

    accuracy                           0.79      1368
   macro avg       0.79      0.78      0.78      1368
weighted avg       0.79      0.79      0.79      1368



In [53]:
#calculate F1 score
from sklearn.metrics import f1_score
f1_score(test_labels, predicted_labels)

0.7386046511627907

In [54]:
from sklearn.metrics import matthews_corrcoef
#calculate Matthews correlation coefficient
matthews_corrcoef(test_labels, predicted_labels)

0.5706757990105646