### Install Dependincies

In [1]:
!pip install --upgrade transformers
!pip install --upgrade accelerate
!pip install torch torchvision torchaudio

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.1
    Uninstalling transformers-4.41.1:
      Successfully uninstalled transformers-4.41.1
Successfully installed transformers-4.41.2
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 k

In [None]:
!pip show transformers
!pip show accelerate
!pip show torchvision
!pip show torchaudio

### Connect to drive to save or load models

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !ls '/content/drive/My Drive/' | grep '\.zip$'

### Downloading the Dataset

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Download the dataset
!gdown --id 1NdvIddoyYy2idsAWxJ8lodKfD-PZhmyL

Downloading...
From: https://drive.google.com/uc?id=1NdvIddoyYy2idsAWxJ8lodKfD-PZhmyL
To: /content/in_domain_train.tsv
100% 429k/429k [00:00<00:00, 103MB/s]


In [4]:
# Read into a pandas dataframe
df = pd.read_csv("in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])[['label', 'sentence']]
df.head()

Unnamed: 0,label,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [5]:
df.label.value_counts()

label
1    6023
0    2528
Name: count, dtype: int64

In [6]:
df_correct = pd.read_csv('data.csv')

In [7]:
df_correct.head()

Unnamed: 0,label,sentence
0,0,"He was, I take it, the most perfect reasoning..."
1,0,"He never spoke of the softer passions, save w..."
2,0,They were admirable things for the observer--...
3,0,But for the trained reasoner to admit such in...
4,0,"Grit in a sensitive instrument, or a crack in..."


In [8]:
df_correct.label.value_counts()

label
0    3495
Name: count, dtype: int64

In [9]:
balanced_df = pd.concat([df, df_correct])

In [10]:
balanced_df.head()

Unnamed: 0,label,sentence
0,1,"Our friends won't buy this analysis, let alone..."
1,1,One more pseudo generalization and I'm giving up.
2,1,One more pseudo generalization or I'm giving up.
3,1,"The more we study verbs, the crazier they get."
4,1,Day by day the facts are getting murkier.


In [11]:
balanced_df.label.value_counts()

label
1    6023
0    6023
Name: count, dtype: int64

### Model Building, Training and Evaluation

In [12]:
import torch
from transformers import DistilBertTokenizer, AutoTokenizer
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [13]:
X = list(balanced_df.sentence)
y = list(balanced_df.label)

In [14]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [15]:
set(y)

{0, 1}

In [50]:
from sklearn.utils.class_weight import compute_class_weight

# Calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)

In [16]:
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=len(set(y)), problem_type="single_label_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Lets build custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
      text = str(self.texts[idx])
      label = torch.tensor(self.labels[idx])

      encoding = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors='pt')
      # encoding = self.tokenizer(text, truncation=True, padding=True, return_tensors='pt')

      return {
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': label,
          # 'loss': torch.tensor(0.0),   # Placeholder for loss calculation
      }

In [18]:
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def binary_classification_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    y_pred = (probs.numpy() >= threshold).astype(int)
    y_true = labels

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, probs.numpy())

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc
    }

    return metrics

In [20]:
import os

os.makedirs('/content/results', exist_ok=True)
os.makedirs('/content/logs', exist_ok=True)

In [21]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience = 4)  # Adjust patience as needed


# Define custom Trainer class with modified loss function
class TrainerWithCustomLoss(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# Define evaluation metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


# Define TrainingArguments
args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='/content/results',
    num_train_epochs=8,
    save_steps=300,
    save_total_limit=2,
    warmup_steps=150,
    weight_decay=1e-4,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=300,
    logging_dir='/content/logs',
    learning_rate=1e-5,  # Adjust the learning rate as needed
)


# Initialize Trainer
trainer = TrainerWithCustomLoss(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)



In [23]:
# Train the model
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
300,No log,0.470069,0.782158,0.742094,0.869852,0.80091
600,0.458100,0.41415,0.804979,0.775556,0.862438,0.816693
900,0.458100,0.398961,0.814938,0.779476,0.882208,0.827666
1200,0.450400,0.397647,0.819502,0.766234,0.923394,0.837505
1500,0.373500,0.426525,0.823237,0.793155,0.878089,0.833464
1800,0.373500,0.531681,0.809129,0.837209,0.771005,0.802744
2100,0.316500,0.494467,0.829461,0.780966,0.919275,0.844495
2400,0.316500,0.494314,0.83278,0.782578,0.925041,0.847867


Step,Training Loss,Validation Loss


TrainOutput(global_step=2400, training_loss=0.3893111610412598, metrics={'train_runtime': 372.6594, 'train_samples_per_second': 206.859, 'train_steps_per_second': 25.868, 'total_flos': 715191485343744.0, 'train_loss': 0.3893111610412598, 'epoch': 1.991701244813278})

In [24]:
trainer.evaluate()

{'eval_loss': 0.39764708280563354,
 'eval_accuracy': 0.8195020746887967,
 'eval_precision': 0.7662337662337663,
 'eval_recall': 0.9233937397034596,
 'eval_f1': 0.8375046694060515,
 'eval_runtime': 9.6073,
 'eval_samples_per_second': 250.852,
 'eval_steps_per_second': 31.435,
 'epoch': 1.991701244813278}

In [41]:
tokenizer.save_pretrained('/content/drive/MyDrive/distilbert-tokenizer')

('/content/drive/MyDrive/distilbert-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/distilbert-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/distilbert-tokenizer/vocab.txt',
 '/content/drive/MyDrive/distilbert-tokenizer/added_tokens.json')

In [42]:
model.save_pretrained("/content/drive/My Drive/distilbert-finetuned-binary-classifier")

In [None]:
!zip -r /content/model.zip /content/distilbert-finetuned-binary-classifier

#### Make predictions

In [27]:
text = "give him her sududu."

encoding = tokenizer(text, return_tensors='pt')
encoding.to(trainer.model.device)

outputs = trainer.model(**encoding)

In [28]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3862,  0.3495]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [30]:
import numpy as np

# Get the index of the maximum probability
pred_index = np.argmax(outputs)

# Assuming the classes are named 'Class 0' and 'Class 1'
pred_class = 'Class 0' if pred_index == 0 else 'Class 1'

print(pred_class)

Class 0


In [58]:
import torch
import numpy as np

def predict_class(text, tokenizer, model):
    # Tokenize the text
    encoding = tokenizer(text, return_tensors='pt')

    # Move the input tensors to the model's device
    encoding.to(model.device)

    # Get the model's prediction
    with torch.no_grad():
        outputs = model(**encoding)

    # Get the index of the maximum probability
    pred_index = np.argmax(outputs.logits.cpu().numpy())

    # Assuming the classes are named 'Class 0' and 'Class 1'
    pred_class = f'"{text}" is grammatically CORRECT' if pred_index == 0 else f'"{text}" is grammatically INCORRECT'

    return pred_class

In [32]:
# Example usage
text = "Thank you, see you soon"
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

Thank you, see you soon is grammatically CORRECT


In [35]:
"Give up when you know the war is ended"
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

Thank you, see you soon is grammatically CORRECT


In [36]:
"Give up are fine and goods"
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

Thank you, see you soon is grammatically CORRECT


In [49]:
"Our friends won't buy this analysis, let alone the next one we propose."
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

Thank you, see you soon is grammatically CORRECT


#### load the model

In [52]:
# !unzip '/content/drive/My Drive/distilbert-finetuned-binary-classifier.zip'

unzip:  cannot find or open /content/drive/My Drive/distilbert-finetuned-binary-classifier.zip, /content/drive/My Drive/distilbert-finetuned-binary-classifier.zip.zip or /content/drive/My Drive/distilbert-finetuned-binary-classifier.zip.ZIP.


In [53]:
from transformers import DistilBertForSequenceClassification, AutoModelForSequenceClassification

# Load the model
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/distilbert-finetuned-binary-classifier', problem_type="single_label_classification")

# Make sure to set the model to evaluation mode
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [54]:
# Initialize Trainer
trainer = TrainerWithCustomLoss(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [55]:
trainer.evaluate()

{'eval_loss': 0.39764708280563354,
 'eval_accuracy': 0.8195020746887967,
 'eval_precision': 0.7662337662337663,
 'eval_recall': 0.9233937397034596,
 'eval_f1': 0.8375046694060515,
 'eval_runtime': 10.3402,
 'eval_samples_per_second': 233.071,
 'eval_steps_per_second': 29.206}

In [59]:
text = "Thank you, see you soon"
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

"Thank you, see you soon" is grammatically CORRECT


In [60]:
text = "me and ali is in the pool alone"
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

"me and ali is in the pool alone" is grammatically CORRECT


In [65]:
text = "me is ali is in the pool alone."
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

"me is ali is in the pool alone." is grammatically CORRECT


In [66]:
text = "do it once again and less se his cow in the road."
predicted_class = predict_class(text, tokenizer, trainer.model)
print(predicted_class)

"do it once again and less se his cow in the road." is grammatically CORRECT
