<a href="https://colab.research.google.com/github/mathml-ai/AI-Text-Detector/blob/main/Grammar_tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#   Install Unsloth library
! pip install unsloth --upgrade --no-cache-dir

# Install Flash Attention 2 for softcapping support
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

Collecting unsloth
  Downloading unsloth-2025.3.10-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.3/59.3 kB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.3.8 (from unsloth)
  Downloading unsloth_zoo-2025.3.9-py3-none-any.whl.metadata (17 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.17-py3-none-any.whl.metadata (9.5 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9 (from unsloth)
  Dow

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = None,        # None for auto detection.
    load_in_4bit = True, # Use 4bit quantization to reduce memory usage.
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.10: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # LoRA attention dimension
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha =32,  # Alpha parameter for LoRA scaling
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # Rank stabilized LoRA
    loftq_config = None, # LoRA-Fine-Tuning-Aware Quantization
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.10 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# Dataset loading

In [4]:
import numpy as np
import pandas as pd

In [5]:
df1=pd.read_csv('/content/df1.csv')

In [6]:
df1

Unnamed: 0,_id,task,src,tgt
0,1,gec,Remove all grammatical errors from this text: ...,"For example, countries with a lot of deserts c..."
1,2,gec,Improve the grammaticality: As the number of p...,"As the number of people grows, the need for a ..."
2,3,gec,Improve the grammaticality of this sentence: B...,Besides some technological determinists that a...
3,4,gec,Remove all grammatical errors from this text: ...,Safety is one of the crucial problems that man...
4,5,gec,Fix grammaticality in this sentence: On one ha...,"On the one hand, more and more viruses and hac..."
...,...,...,...,...
69066,69067,clarity,Rewrite this sentence for clarity: The Habsbur...,"During the Habsburg's period, Spain ushered in..."
69067,69068,clarity,Rewrite the sentence more clearly: The Habsbur...,The Habsburgyears also ushered in the Spanish ...
69068,69069,clarity,"Make this sentence more readable: In 2019, he ...","In 2019, he was traded to the Astros in a bloc..."
69069,69070,clarity,"Use clearer wording: In 2019, he was traded to...","In 2019, he was traded to the Astros in a bloc..."


In [7]:
df1['src'][0]

'Remove all grammatical errors from this text: For example, countries with a lot of deserts can terraform their desert to increase their habitable land and using irrigation to provide clean water to the desert.'

In [8]:
df1['tgt'][0]

'For example, countries with a lot of deserts can transform their desert to increase their habitable land and use irrigation to provide clean water to the desert.'

In [9]:
df1_grpd=df1.groupby(by='task')

# Few shots prompts  

In [10]:
few_shot_prompts = []

for task_name, group in df1_grpd:
    examples = group[['src', 'tgt']].head(5).to_dict(orient='records')  # Extract 5 examples

    examples_str = "\n\n".join([f"Input: {ex['src']}\nOutput: {ex['tgt']}" for ex in examples])

    Prompt = f"""You are an advanced AI assistant skilled in improving text based on the given task. Your
    goal is to perform the english linguistic task as per the instructions. Learn from the examples given
    to you. Then for each given source sentence, find the correct target sentence in the same manner as in
    examples.

Task: {task_name}

Here are some examples:

{examples_str}


"""
    few_shot_prompts.append(Prompt)







In [11]:
task_mapping={'gec':5,'paraphrase':4,'simplification':3,'coherence':2,'neutralize':1,'clarity':0}

In [12]:
from sklearn.model_selection import train_test_split
df_train,df_test=train_test_split(df1,test_size=0.2,stratify=df1['task'])

In [13]:
main_prompts = []

# Task mapping
task_mapping = {'gec': 5, 'paraphrase': 4, 'simplification': 3, 'coherence': 2, 'neutralize': 1, 'clarity': 0}

eos_TOKEN = tokenizer.eos_token  # Define EOS token

for index, row in df_train.iterrows():
    task_name = row['task']
    src_text = row['src']
    tgt_text = row['tgt']

    # Select the appropriate few-shot prompt
    task_index = task_mapping.get(task_name, None)
    if task_index is not None:
        few_shot_prompt = few_shot_prompts[task_index]  # Fetch from predefined prompts

        # Construct the full prompt with clear instructions
        full_prompt = f"""{few_shot_prompt}

Now, process the following input according to the task.
Provide only the correct sentence after 'Output:', without any explanations.

Input: {src_text}
Output: {tgt_text} {eos_TOKEN}
"""

        # Store the complete prompt
        main_prompts.append(full_prompt)





In [14]:
main_prompts[0]

"You are an advanced AI assistant skilled in improving text based on the given task. Your\n    goal is to perform the english linguistic task as per the instructions. Learn from the examples given\n    to you. Then for each given source sentence, find the correct target sentence in the same manner as in\n    examples.\n\nTask: paraphrase\n\nHere are some examples:\n\nInput: Reword this sentence: Item 5.1.2 shall be amended to read:\nOutput: Point 5.1.2 is replaced by the following:\n\nInput: Reword this text: She stopped when she saw his expression.\nOutput: Seeing the look on his face, she paused.\n\nInput: Reword this sentence: D'Hara is well served, at long last.\nOutput: After a long time, D'Hara is well managed.\n\nInput: Paraphrase: All right, fine, yes, yes, it's me.\nOutput: Okay, okay, yeah, yeah, I'm the one.\n\nInput: Rewrite this sentence: Someone must've knocked it on the ground.\nOutput: Somebody must have smashed him to earth.\n\n\n\n\nNow, process the following input ac

In [15]:
train_dataset = pd.DataFrame({
    'instruction':main_prompts,
    'output':df_train['tgt']
})

In [16]:
train_dataset

Unnamed: 0,instruction,output
50426,You are an advanced AI assistant skilled in im...,We'il never have enough food if we stay here.
29427,You are an advanced AI assistant skilled in im...,the renaissance: 1500-1660
7080,You are an advanced AI assistant skilled in im...,"I looked at them, trying to think of something..."
30506,You are an advanced AI assistant skilled in im...,"Anyway, we learn from our mistakes, right?"
68806,You are an advanced AI assistant skilled in im...,The Sullubawa are a Fulani clan in Northern Ni...
...,...,...
38093,You are an advanced AI assistant skilled in im...,After 7 years children have the right to live ...
1185,You are an advanced AI assistant skilled in im...,The old is worthy of every resource spent on t...
16929,You are an advanced AI assistant skilled in im...,My hobby is playing rugby.
45963,You are an advanced AI assistant skilled in im...,"""One never enters, and no one never leaves."""


# Training setup

In [17]:
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from transformers import TrainingArguments

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 100,
    learning_rate = 2e-4,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 42,
    output_dir = "outputs",
    report_to = "none",
)

In [19]:
from datasets import Dataset

# Assuming train_dataset is your DataFrame
hf_train_dataset =Dataset.from_pandas(pd.DataFrame(train_dataset))

# To check if the conversion was successful
print(hf_train_dataset)


Dataset({
    features: ['instruction', 'output', '__index_level_0__'],
    num_rows: 55256
})


In [20]:
hf_train_dataset = hf_train_dataset.map(
    lambda x: {"formatted_text": f"{tokenizer.bos_token} {x['instruction']}\n\nProvide only the correct sentence after 'Output:', without any explanations.\n\nOutput: {x['output']} {eos_TOKEN}"}
)





Map:   0%|          | 0/55256 [00:00<?, ? examples/s]

In [21]:
print("bos_token:", tokenizer.bos_token)
print("chat_template:", getattr(tokenizer, "chat_template", None))


bos_token: <|begin_of_text|>
chat_template: None


In [22]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=hf_train_dataset,
    dataset_text_field="formatted_text",  # Use the combined field
    max_seq_length=512,
    dataset_num_proc=2,
    packing=False,
    args=training_args
)


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["formatted_text"] (num_proc=2):   0%|          | 0/55256 [00:00<?, ? examples/s]

In [23]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 55,256 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080/4,712,566,784 (1.78% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.3832
2,2.3661
3,2.221
4,2.1576
5,1.9124
6,1.893
7,1.4719
8,1.2717
9,1.1744
10,1.1007


TrainOutput(global_step=100, training_loss=0.44663821324706077, metrics={'train_runtime': 1235.5386, 'train_samples_per_second': 0.647, 'train_steps_per_second': 0.081, 'total_flos': 1.6332274237538304e+16, 'train_loss': 0.44663821324706077})

In [24]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k

In [25]:
inference_prompts = []

for index, row in df_test.iterrows():  # Test dataset
    task_name = row['task']
    src_text = row['src']

    inference_prompt = f"""
You are an advanced AI assistant skilled in {task_name}.
Your task is to process the given input and generate the correct output.

Input: {src_text}

Now, generate the required output below after 'Output:'.

Output:
"""  # Model should generate the output here

    inference_prompts.append(inference_prompt)



In [26]:
prompt = prompt

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Tokenize the prompt for inference
inputs = tokenizer(
    [
        prompt  # Use the prompt directly without additional formatting
    ],
    return_tensors="pt"
).to("cuda")

# Generate the output from the model
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Decode the generated output
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the generated output
print(decoded_outputs)

NameError: name 'prompt' is not defined

In [None]:
decoded_outputs

In [27]:
import re

def extract_output(text):
    match = re.search(r'Output:\s(.*)', text, re.DOTALL)
    return match.group(1).strip() if match else ""



final_output = extract_output(decoded_outputs[0])
print(final_output)


NameError: name 'decoded_outputs' is not defined

In [28]:
import re
def generate_output(model, tokenizer):
    """Takes user input, constructs inference prompt, runs model, and returns extracted output."""

    # Get user input
    src_text = input("Enter the source text: ")

    # Construct inference prompt
    inference_prompt = f"""
You are an advanced AI assistant skilled in processing text.
Your task is to generate the correct output for the given input.

Input: {src_text}

Now, generate the correct output below after 'Output:'.

Output:
"""

    # Tokenize the prompt
    inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")

    # Generate model output
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode the generated output
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Extract final output
    final_output = extract_output(decoded_output)

    # Display result
    print("\nFinal Output:", final_output)

# Example usage (Assuming model and tokenizer are already loaded)
# generate_output(model, tokenizer)


In [29]:
generate_output(model, tokenizer)

Enter the source text: Paraphrase the sentence: I need not be as devilish as Satan wants me to be.

Final Output: I don't have to be as bad as Satan wants me to be.


In [31]:
generate_output(model, tokenizer)

Enter the source text: Correct the grammatical errors, I learn this when I is 10.

Final Output: I learned this when I was 10.
