# Binary Language Classifier

Detect if an input sentence is in Italian language or not.

In [1]:
import numpy as np
import pandas as pd
import os

Set environment if in google colab to use cuda.

In [2]:
try:
    import google.colab
    googleColab = True
except:
    googleColab = False

print(f"In google colab: {googleColab}")

In google colab: True


In [3]:
setup_environment = True #@param {type:"boolean"}
print_subprocess = False #@param {type:"boolean"}


print("Environment set only if inside google colab.")

if googleColab:
  if setup_environment:
      import subprocess, time
      print("Setting up environment...")
      all_process = [
          ["pip", "install", "datasets==2.4"],
          ["pip", "install", "transformers==4.21.1"]
      ]

      for process in all_process:
          running = subprocess.run(process, stdout=subprocess.PIPE).stdout.decode("utf-8")
          if print_subprocess:
              print(running)

Environment set only if inside google colab.
Setting up environment...


In [4]:
import torch

device = torch.device("cpu")
if torch.cuda.is_available():
    running = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE).stdout.decode("utf-8")
    device = torch.device("cuda:0")
    print(running)
else:
    print("Is CUDA active?: False.")

Sun Oct  2 20:13:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Create directory

In [5]:
rootPath = os.getcwd()

if not googleColab:
    rootPath = os.path.dirname(rootPath)
    
dataPath = os.path.join(rootPath, "data")
logsPath = os.path.join(rootPath, "logs")
modelsPath = os.path.join(rootPath, "models")

nameFile = "Language Detection.csv"
datasetPath = os.path.join(dataPath, nameFile)

In [12]:
if not os.path.isdir(dataPath):
    os.makedirs(dataPath, exist_ok=True)

if not os.path.isfile(datasetPath):
    print(f"Download Language Detection.csv dataset from https://www.kaggle.com/datasets/basilb2s/language-detection and put it in: {dataPath}")
else:
    print(f"'{nameFile}' correctly detected.")

'Language Detection.csv' correctly detected.


In [13]:
os.makedirs(logsPath, exist_ok=True)
os.makedirs(modelsPath, exist_ok=True)

print(f"Data path: {dataPath}\nLogs path: {logsPath}\nModels path: {modelsPath}")

Data path: /content/data
Logs path: /content/logs
Models path: /content/models


## Import dataset

In [14]:
colnames=["text", "language"]
df = pd.read_csv(datasetPath, header=0, names=colnames)
df

Unnamed: 0,text,language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


## Set target based on the task

In [15]:
df.groupby("language") \
    .agg({"language": "count"})

Unnamed: 0_level_0,language
language,Unnamed: 1_level_1
Arabic,536
Danish,428
Dutch,546
English,1385
French,1014
German,470
Greek,365
Hindi,63
Italian,698
Kannada,369


In [16]:
df.loc[ df["language"] == "Italian", "label"] = 1
df.loc[ df["language"] != "Italian", "label"] = 0
df["label"] = df["label"].astype(int)
df

Unnamed: 0,text,language,label
0,"Nature, in the broadest sense, is the natural...",English,0
1,"""Nature"" can refer to the phenomena of the phy...",English,0
2,"The study of nature is a large, if not the onl...",English,0
3,"Although humans are part of nature, human acti...",English,0
4,[1] The word nature is borrowed from the Old F...,English,0
...,...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada,0
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada,0
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada,0
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada,0


In [17]:
df.groupby("label") \
    .agg({"label": "count"})

Unnamed: 0_level_0,label
label,Unnamed: 1_level_1
0,9639
1,698


In [18]:
X = df.loc[:, ["text"]]
y = df.loc[:, ["label"]]

## Train/test split

(for this task we avoid validation set...)

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [20]:
train_df = pd.concat([X_train, y_train], axis=1) #[0:30]
train_df.columns = ["text", "label"]

test_df = pd.concat([X_test, y_test], axis=1) #[0:5]
test_df.columns = ["text", "label"]

In [21]:
from datasets import Dataset, DatasetDict
train_dataset = Dataset.from_dict(train_df)
test_dataset = Dataset.from_dict(test_df)

In [22]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [23]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [24]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [26]:
print(f"Model max length: {tokenizer.model_max_length}\nModel num labels: {model.num_labels}")

Model max length: 512
Model num labels: 2


## Define metrics

In [27]:
from datasets import load_metric
from sklearn import metrics

def compute_metrics(eval_pred):
    load_accurancy = load_metric("accuracy")
    load_f1 = load_metric("f1")

    predictions = np.argmax(eval_pred.predictions, axis=-1)
    accuracy = load_accurancy.compute(predictions=predictions, references=eval_pred.label_ids)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=eval_pred.label_ids, average="weighted")["f1"]

    # Confusion matrix and classification report.
    print(metrics.classification_report(eval_pred.label_ids, predictions))

    return {
        "accuracy": accuracy,
        "f1": f1
    }

## Finetuning BERT

In [28]:
from transformers import TrainingArguments, Trainer
import os

name_model = "finetuning-binary-language-classifier"
output_dir = os.path.join(modelsPath, name_model)

training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,  # 2
    per_device_eval_batch_size = 16,   # 2
    num_train_epochs = 2,
    weight_decay = 0.01,
    save_steps = 500,
    evaluation_strategy = "steps",
    eval_steps = 150,
    logging_dir = logsPath, 
    fp16 = True, # False
    push_to_hub = False
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_test,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

Using cuda_amp half precision backend


In [29]:
torch.cuda.empty_cache()
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7235
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 906


Step,Training Loss,Validation Loss,Accuracy,F1
150,No log,0.028288,0.992263,0.992214
300,No log,0.077375,0.987427,0.986867
450,No log,0.029308,0.99323,0.993075
600,0.038700,0.013365,0.996132,0.996099
750,0.038700,0.02939,0.994197,0.994084
900,0.038700,0.018726,0.995809,0.995796


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2879
           1       0.96      0.93      0.95       223

    accuracy                           0.99      3102
   macro avg       0.98      0.96      0.97      3102
weighted avg       0.99      0.99      0.99      3102



The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2879
           1       1.00      0.83      0.90       223

    accuracy                           0.99      3102
   macro avg       0.99      0.91      0.95      3102
weighted avg       0.99      0.99      0.99      3102



The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2879
           1       1.00      0.91      0.95       223

    accuracy                           0.99      3102
   macro avg       1.00      0.95      0.97      3102
weighted avg       0.99      0.99      0.99      3102



Saving model checkpoint to /content/models/finetuning-binary-language-classifier/checkpoint-500
Configuration saved in /content/models/finetuning-binary-language-classifier/checkpoint-500/config.json
Model weights saved in /content/models/finetuning-binary-language-classifier/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/models/finetuning-binary-language-classifier/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/models/finetuning-binary-language-classifier/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2879
           1       0.99      0.96      0.97       223

    accuracy                           1.00      3102
   macro avg       0.99      0.98      0.99      3102
weighted avg       1.00      1.00      1.00      3102



The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2879
           1       1.00      0.92      0.96       223

    accuracy                           0.99      3102
   macro avg       1.00      0.96      0.98      3102
weighted avg       0.99      0.99      0.99      3102



The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2879
           1       0.98      0.96      0.97       223

    accuracy                           1.00      3102
   macro avg       0.99      0.98      0.98      3102
weighted avg       1.00      1.00      1.00      3102





Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=906, training_loss=0.025769247124526676, metrics={'train_runtime': 296.4342, 'train_samples_per_second': 48.814, 'train_steps_per_second': 3.056, 'total_flos': 1281191020192620.0, 'train_loss': 0.025769247124526676, 'epoch': 2.0})

In [30]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3102
  Batch size = 16


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2879
           1       0.98      0.96      0.97       223

    accuracy                           1.00      3102
   macro avg       0.99      0.98      0.98      3102
weighted avg       1.00      1.00      1.00      3102



{'eval_loss': 0.01873157173395157,
 'eval_accuracy': 0.9958091553836235,
 'eval_f1': 0.9957960563411773,
 'eval_runtime': 14.599,
 'eval_samples_per_second': 212.481,
 'eval_steps_per_second': 13.289,
 'epoch': 2.0}

In [31]:
trainer.save_model(output_dir)

Saving model checkpoint to /content/models/finetuning-binary-language-classifier
Configuration saved in /content/models/finetuning-binary-language-classifier/config.json
Model weights saved in /content/models/finetuning-binary-language-classifier/pytorch_model.bin
tokenizer config file saved in /content/models/finetuning-binary-language-classifier/tokenizer_config.json
Special tokens file saved in /content/models/finetuning-binary-language-classifier/special_tokens_map.json


## Export model if in google colab

In [34]:
# Create zip of the model just created
# exclude all checkpoints
# and move in drive folder

if googleColab:

    !zip -r {rootPath}/{name_model}.zip {modelsPath} -x {output_dir}/checkpoint*/**\*

    from google.colab import drive
    try:
        drive_path = "/content/drive"
        drive.mount(drive_path, force_remount=False)
        !mv {rootPath}/{name_model}.zip drive/MyDrive
    except:
      print("...error mounting drive")

  adding: content/models/ (stored 0%)
  adding: content/models/finetuning-binary-language-classifier/ (stored 0%)
  adding: content/models/finetuning-binary-language-classifier/config.json (deflated 49%)
  adding: content/models/finetuning-binary-language-classifier/pytorch_model.bin (deflated 7%)
  adding: content/models/finetuning-binary-language-classifier/training_args.bin (deflated 48%)
  adding: content/models/finetuning-binary-language-classifier/vocab.txt (deflated 49%)
  adding: content/models/finetuning-binary-language-classifier/checkpoint-500/ (stored 0%)
  adding: content/models/finetuning-binary-language-classifier/tokenizer_config.json (deflated 44%)
  adding: content/models/finetuning-binary-language-classifier/special_tokens_map.json (deflated 42%)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
