In [1]:
!pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0


In [2]:
import pandas as pd

def convert_csv_to_text(input_csv, output_txt):
    # Load CSV without headers and handle encoding issues
    try:
        df = pd.read_csv(input_csv, header=None, encoding='ISO-8859-1')  # Handle special characters
    except pd.errors.ParserError:
        # Try loading with tab separator if default fails
        df = pd.read_csv(input_csv, header=None, delimiter='\t', encoding='ISO-8859-1')

    print(df.head())  # Debugging: Show the first few rows to check the structure

    with open(output_txt, 'w', encoding='utf-8') as f:  # Ensure the output is UTF-8
        current_question = None
        for index, row in df.iterrows():
            try:
                # Convert each value to string and handle NaN values
                question = str(row[0]) if not pd.isna(row[0]) else ''
                answer = str(row[1]) if not pd.isna(row[1]) else ''

                # Only proceed if both question and answer are non-empty
                if question and answer:
                    # Remove 'Question :' and 'Answer :' prefixes
                    question = question.replace('Question :', '').strip()
                    answer = answer.replace('Answer :', '').strip()

                    if question == current_question:
                        f.write(f"Answer: {answer}\n")
                    else:
                        current_question = question
                        f.write(f"Question: {question}\n")
                        f.write(f"Answer: {answer}\n")
            except KeyError as e:
                print(f"Error accessing row data: {e}")
                print(row)

# Convert CSV files
convert_csv_to_text('/kaggle/input/qa-dataset/aiml_qa_train_1.csv', '/kaggle/working/train.txt')
convert_csv_to_text('/kaggle/input/qa-dataset/aiml_qa_dev_1.csv', '/kaggle/working/val.txt')
convert_csv_to_text('/kaggle/input/qa-dataset/aiml_qa_test_1.csv', '/kaggle/working/test.txt')

                                                   0  \
0  Question :What is the difference between conca...   
1  Question :What is the difference between conca...   
2  Question :Why are derivatives substracted from...   
3  Question :Why are derivatives substracted from...   
4  Question :Describe a process/pipeline for gene...   

                                                   1  
0  Answer :Concatenation combines two tensors by ...  
1  Answer :Concatenation is often used to combine...  
2  Answer :The derivative of the loss function at...  
3  Answer :The intuition behind adjusting the wei...  
4  Answer :Select a pretrained model suitable for...  
                                                   0  \
0  Question : Does the maximum value of 'n' in th...   
1                          Question : How CNN works?   
2  Question : How is NMT trained? Is it common to...   
3  Question : What is the process of learning POS...   
4  Question : how to handle multi lingual situati... 

In [3]:
import random

def augment_data(input_csv, output_csv):
    df = pd.read_csv(input_csv, header=None, encoding='ISO-8859-1')
    augmented_rows = []
    
    for _, row in df.iterrows():
        question = row[0]
        answers = [row[i] for i in range(1, len(row)) if pd.notna(row[i])]
        
        if len(answers) > 1:
            augmented_answers = [answer.replace('Answer:', 'Paraphrased Answer:') for answer in answers]
            for ans in augmented_answers:
                augmented_rows.append([question] + [ans])
    
    augmented_df = pd.DataFrame(augmented_rows, columns=['Question', 'Answer'])
    augmented_df.to_csv(output_csv, index=False, header=False)

# Augment the training data
augment_data('/kaggle/input/qa-dataset/aiml_qa_train_1.csv', '/kaggle/working/augmented_train.csv')

In [4]:
!pip install --upgrade datasets transformers

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m665.3 kB/s[0m eta [36m0:00:00[0m:--:--[0m
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers, datasets
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.0
    Uninstalling transformers-4.44.0:
      Successfully uninstalled transformers-4.44.0
  Attempting uninstall: datasets
    Found existing installation:

In [6]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")

def tokenize_function(data):
    encodings = tokenizer(data['text'], padding="max_length", truncation=True, max_length=512)
    encodings['labels'] = encodings['input_ids']
    return encodings

# Load and tokenize datasets
train_dataset = load_dataset('text', data_files='/kaggle/working/train.txt')['train']
val_dataset = load_dataset('text', data_files='/kaggle/working/val.txt')['train']
test_dataset = load_dataset('text', data_files='/kaggle/working/test.txt')['train']

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1091 [00:00<?, ? examples/s]

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

In [7]:
# Define training arguments with adjusted parameters
training_args = TrainingArguments(
    output_dir="/kaggle/working/output",
    evaluation_strategy="epoch",
    num_train_epochs=5,          # Experiment with fewer epochs
    learning_rate=5e-5,          # Adjust learning rate
    per_device_train_batch_size=8,  # Adjust batch size
    per_device_eval_batch_size=8,   # Adjust batch size
    logging_dir="/kaggle/working/logs",
    save_total_limit=1,
    save_steps=500,             # Save less frequently
    logging_steps=50,           # Log more frequently
    warmup_steps=500,           # Adjust warmup steps
    weight_decay=0.01,
    max_grad_norm=1.0,          # Gradient clipping
    fp16=True,                  # Use mixed precision if supported
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start training
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,5.9144,0.246059
2,0.2707,0.204762
3,0.1588,0.199218
4,0.1455,0.196333
5,0.1438,0.195846


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=345, training_loss=1.0048119876695716, metrics={'train_runtime': 593.6304, 'train_samples_per_second': 9.189, 'train_steps_per_second': 0.581, 'total_flos': 1425348034560000.0, 'train_loss': 1.0048119876695716, 'epoch': 5.0})

In [8]:
# Save the model and tokenizer
model.save_pretrained("/kaggle/working/saved_model")
tokenizer.save_pretrained("/kaggle/working/saved_model")

('/kaggle/working/saved_model/tokenizer_config.json',
 '/kaggle/working/saved_model/special_tokens_map.json',
 '/kaggle/working/saved_model/vocab.json',
 '/kaggle/working/saved_model/merges.txt',
 '/kaggle/working/saved_model/added_tokens.json')

In [9]:
!zip -r /kaggle/working/saved_model_qa.zip /kaggle/working/saved_model

  pid, fd = os.forkpty()


  adding: kaggle/working/saved_model/ (stored 0%)
  adding: kaggle/working/saved_model/model.safetensors (deflated 7%)
  adding: kaggle/working/saved_model/config.json (deflated 52%)
  adding: kaggle/working/saved_model/generation_config.json (deflated 24%)
  adding: kaggle/working/saved_model/merges.txt (deflated 53%)
  adding: kaggle/working/saved_model/vocab.json (deflated 68%)
  adding: kaggle/working/saved_model/special_tokens_map.json (deflated 74%)
  adding: kaggle/working/saved_model/tokenizer_config.json (deflated 55%)


In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('/kaggle/working/saved_model')
model = GPT2LMHeadModel.from_pretrained('/kaggle/working/saved_model')

# Load the test dataset
test_dataset = load_dataset('text', data_files='/kaggle/working/test.txt')['train']

# Print column names to understand the dataset structure
print(test_dataset.column_names)

# Tokenize the test dataset
def tokenize_function(data):
    encodings = tokenizer(data['text'], padding="max_length", truncation=True, max_length=512)
    encodings['labels'] = encodings['input_ids']
    return encodings

test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

['text']


Map:   0%|          | 0/268 [00:00<?, ? examples/s]

In [12]:
print(test_dataset.column_names)

['input_ids', 'attention_mask', 'labels']


In [14]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
import torch

In [15]:
def predict(example):
    input_ids = torch.tensor(example['input_ids']).unsqueeze(0)  # Add batch dimension
    attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0)  # Add batch dimension

    prediction = generate_predictions(input_ids, attention_mask, model, tokenizer)
    return {"predictions": prediction}

def generate_predictions(input_ids, attention_mask, model, tokenizer, max_new_tokens=50):
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Include attention mask
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [16]:
# Apply prediction function to the test dataset
test_predictions = test_dataset.map(predict, batched=False)

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [17]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b46d53143ea3b5c13ecc633eaf160b91ea91ee091155a19326ce8aec2d3797fe
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [18]:
!pip install evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m754.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.3 portalocker-2.10.1 sacrebleu-2.4.3


In [19]:
import evaluate
sacrebleu_metric = evaluate.load("sacrebleu")
rouge_metric = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [27]:
!pip install --upgrade datasets



In [30]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Assuming test_predictions and test_dataset are defined
predictions = [item['predictions'] for item in test_predictions]
references = [tokenizer.decode(item['labels'], skip_special_tokens=True) for item in test_dataset]

# Compute ROUGE scores
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 0.7937453791851754, 'rouge2': 0.7830699181992671, 'rougeL': 0.7933683815253593, 'rougeLsum': 0.7929372513021002}


In [34]:
import torch

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def question_answer(question, max_new_tokens=50):
    model.eval()
    
    # Include instruction to generate only the answer
    prompt = f"Question: {question} Answer:"
    
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs.get('attention_mask', None).to(device)

    try:
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=input_ids.size(1) + max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            temperature=1.0,  # Adjust temperature for diversity
            top_k=50,
            top_p=0.95
        )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        
        # Strip the question and keep only the answer part
        generated_answer = generated_text.replace(f"Question: {question} Answer:", "").strip()
        return generated_answer
    except Exception as e:
        print(f"Error during text generation: {e}")
        return ""

In [36]:
# Move model to the device
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [38]:
question_answer('In machine learning, when is a tanh kernel typically utilized?')

'Yes, it is used for classification tasks.'

In [39]:
question_answer('by adding layers, are we not creating overfitting?')

'Yes, by adding layers, are we not creating overfitting?'

In [40]:
question_answer('Question :is Auto Encoder useful for dimensionality reduction of a numerical data set?')

'Yes, it can be used for dimensionality reduction of a numerical data set.'

In [41]:
question_answer('Question :Can we generate a sentence by using all the n-grams upto the length of sentence?')

'Yes, we can.'

In [42]:
question_answer('Question :What does DNN stand for in the context of machine learning?')

'DNN stands for Deep Neural Network, a type of neural network that is used in machine learning to learn from data.'

In [43]:
question_answer('Is RNN using human assistance for backpropagation?')

'Yes, it is possible to use RNN for backpropagation.'

In [44]:
question_answer('Why is slicing important and when is it used?')

'It is used to extract features from images or text.'