In [None]:
# Mount Google Drive when running in Google Colab.
# Outside Colab (e.g. a local Jupyter kernel) the `google.colab` package does
# not exist, so the import is guarded to keep "Run All" working everywhere.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not in Colab: nothing to mount
else:
    drive.mount('/content/drive')


In [None]:
!pip install transformers datasets torch


In [None]:
# Install the NLP helper libraries imported by the preprocessing cell (nltk, spacy).
!pip install nltk spacy

In [None]:
# Download the small English spaCy model that the preprocessing cell loads
# with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [None]:
import json

# Read the train / validation splits.
# The encoding is given explicitly: without it, open() falls back to the
# platform locale codec (e.g. cp1252 on Windows) and may fail on or mangle
# non-ASCII characters in the questions.
with open(r'D:\LP\Assessment\train.json', encoding='utf-8') as f:  # Update with your path
    train_data = json.load(f)

with open(r'D:\LP\Assessment\dev.json', encoding='utf-8') as f:  # Update with your path
    val_data = json.load(f)

In [None]:
# Extract disfluent and original questions from the dataset
def extract_questions(data):
    """Split a dataset mapping into two parallel lists of questions.

    Args:
        data: Mapping whose values are dicts holding a 'disfluent' and an
            'original' entry. The mapping keys are not used.

    Returns:
        A ``(disfluent_questions, original_questions)`` tuple of lists, in
        the iteration order of ``data``.
    """
    entries = list(data.values())
    disfluent_questions = [entry['disfluent'] for entry in entries]
    original_questions = [entry['original'] for entry in entries]
    return disfluent_questions, original_questions

# Extract questions from training data
# (two parallel lists: disfluent inputs and their original counterparts)
disfluent_train, original_train = extract_questions(train_data)

# Extract questions from validation data
disfluent_val, original_val = extract_questions(val_data)

# Display a sample of extracted questions
# (index 0 of each training list, i.e. the first entry of train_data)
print("Sample Disfluent Training Question:", disfluent_train[0])
print("Sample Original Training Question:", original_train[0])


In [None]:
import json
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
import torch

# spaCy English pipeline; preprocess_text uses it to tokenize questions
nlp = spacy.load("en_core_web_sm")

# Fetch the NLTK corpora up front
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# WordNet lemmatizer and the English stop-word list (as a set for fast lookup)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Filler words / phrases treated as disfluencies (all lower-case)
disfluency_words = {"uh", "um", "ah", "er", "hm", "like", "you know", "i mean"}

# Function to preprocess text
def preprocess_text(text):
    """Remove fillers, stop words and non-alphabetic tokens, then lemmatize.

    Steps:
      1. Drop every entry of ``disfluency_words`` (case-insensitive). Entries
         may span several words ("you know", "i mean"), so the text is scanned
         for word n-grams, longest first, instead of single words only.
      2. Tokenize with spaCy, discard NLTK stop words and tokens that are not
         purely alphabetic, and lemmatize the rest with the WordNet lemmatizer.
      3. Lower-case and collapse whitespace.

    Args:
        text: Raw question string.

    Returns:
        The normalized question as a single space-separated string.
    """
    # 1. Disfluency Detection and Removal
    words = text.split()
    # Longest filler, measured in words, bounds the n-gram window below.
    max_phrase_len = max((len(phrase.split()) for phrase in disfluency_words), default=1)

    kept_words = []
    position = 0
    while position < len(words):
        # Try the longest window first so "you know" wins over a partial match.
        for size in range(min(max_phrase_len, len(words) - position), 0, -1):
            candidate = ' '.join(words[position:position + size]).lower()
            if candidate in disfluency_words:
                position += size  # skip the whole filler phrase
                break
        else:
            kept_words.append(words[position])
            position += 1

    text = ' '.join(kept_words)

    # 2. Tokenization and Lemmatization
    doc = nlp(text)
    tokens = [
        lemmatizer.lemmatize(token.text.lower())
        for token in doc
        if token.text.lower() not in stop_words and token.text.isalpha()
    ]
    text = ' '.join(tokens)

    # 3. Text Normalization
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Load your data
# The encoding is given explicitly: without it, open() falls back to the
# platform locale codec (e.g. cp1252 on Windows) and may fail on or mangle
# non-ASCII characters in the questions.
with open(r'D:\LP\Assessment\train.json', encoding='utf-8') as f:
    train_data = json.load(f)

with open(r'D:\LP\Assessment\dev.json', encoding='utf-8') as f:
    val_data = json.load(f)

# Extract disfluent and original questions from the dataset
def extract_questions(data):
    """Split a dataset mapping into two parallel lists of questions.

    Args:
        data: Mapping whose values are dicts holding a 'disfluent' and an
            'original' entry. The mapping keys are not used.

    Returns:
        A ``(disfluent_questions, original_questions)`` tuple of lists, in
        the iteration order of ``data``.
    """
    entries = list(data.values())
    disfluent_questions = [entry['disfluent'] for entry in entries]
    original_questions = [entry['original'] for entry in entries]
    return disfluent_questions, original_questions

# Extract questions and preprocess the *inputs* only.
# The original questions are the generation targets: running them through
# preprocess_text (stop-word removal, lemmatization, lower-casing) would teach
# the model to emit keyword bags instead of fluent questions, so they are kept
# exactly as they appear in the dataset.
disfluent_train, original_train = extract_questions(train_data)
disfluent_train = [preprocess_text(question) for question in disfluent_train]

# Same treatment for the validation split
disfluent_val, original_val = extract_questions(val_data)
disfluent_val = [preprocess_text(question) for question in disfluent_val]

# Display a sample input/target pair
print("Sample Preprocessed Disfluent Training Question:", disfluent_train[0])
print("Sample Original Training Question:", original_train[0])

# Custom dataset class for handling the data
class QuestionRewriteDataset(Dataset):
    """Pairs of (disfluent question, original question) tokenized for seq2seq training.

    Each item is a dict with:
      * 'input_ids' / 'attention_mask': encoding of ``"fix: " + disfluent question``
      * 'labels': token ids of the original question

    Every tensor is 1-D with length ``max_length`` (padded / truncated by the
    tokenizer). Label padding is left as produced by the tokenizer.

    Args:
        disfluent_questions: Sequence of input strings.
        original_questions: Sequence of target strings, same length as the inputs.
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'
            tensors of shape (1, max_length) when called with
            ``return_tensors="pt"``.
        max_length: Fixed sequence length used for padding and truncation.

    Raises:
        ValueError: If the two question sequences differ in length.
    """

    def __init__(self, disfluent_questions, original_questions, tokenizer, max_length=128):
        if len(disfluent_questions) != len(original_questions):
            raise ValueError(
                "disfluent_questions and original_questions must have the same length "
                f"(got {len(disfluent_questions)} and {len(original_questions)})"
            )
        self.disfluent_questions = disfluent_questions
        self.original_questions = original_questions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.disfluent_questions)

    def _encode(self, text):
        """Tokenize one string to fixed-length PyTorch tensors."""
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        input_text = "fix: " + self.disfluent_questions[idx]  # Prefix with 'fix:'
        target_text = self.original_questions[idx]

        input_encodings = self._encode(input_text)
        target_encodings = self._encode(target_text)

        # squeeze(0) removes only the batch axis added by return_tensors="pt";
        # a bare squeeze() would also collapse the sequence axis when
        # max_length == 1 and hand 0-d tensors to the DataLoader.
        return {
            'input_ids': input_encodings['input_ids'].squeeze(0),
            'attention_mask': input_encodings['attention_mask'].squeeze(0),
            'labels': target_encodings['input_ids'].squeeze(0)
        }

# Pre-trained T5 checkpoint and its tokenizer
model_name = "t5-small"  # You can use 't5-base' or 't5-large' for better performance
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Wrap each split in the custom dataset
train_dataset = QuestionRewriteDataset(
    disfluent_questions=disfluent_train,
    original_questions=original_train,
    tokenizer=tokenizer,
)
val_dataset = QuestionRewriteDataset(
    disfluent_questions=disfluent_val,
    original_questions=original_val,
    tokenizer=tokenizer,
)

# Batch the data; only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Adjust batch size as needed
val_loader = DataLoader(val_dataset, batch_size=8)

# Sanity check: pull one batch and report the tensor shapes
batch = next(iter(train_loader))
print("Sample training input shape:", batch['input_ids'].shape)
print("Sample training target shape:", batch['labels'].shape)


In [None]:
# AdamW comes from PyTorch: the copy that used to live in `transformers` is
# deprecated in favour of torch.optim.AdamW and is no longer importable from
# newer transformers releases.
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm  # For progress bar
import torch

# Training parameters
epochs = 3  # Number of epochs to train ranging from 1 to 9
learning_rate = 5e-5  # Learning rate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device (GPU/CPU)
model.to(device)

# Initialize optimizer.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimizer behaves as before.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# Scheduler for learning rate decay: linear decay to zero over all training
# steps (one step per batch per epoch), without warm-up.
num_training_steps = epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

def _prepare_batch(batch):
    """Move one DataLoader batch to `device` and hide label padding from the loss.

    The dataset pads labels with the tokenizer's pad token id. The model's
    cross-entropy loss ignores only positions equal to -100, so padded
    positions are replaced with -100; otherwise the loss is dominated by
    predicting padding.
    """
    labels = batch['labels'].to(device)
    labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
    return {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
        'labels': labels,
    }


# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Training phase
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader):
        # Forward pass (loss is computed by the model from the labels)
        outputs = model(**_prepare_batch(batch))
        loss = outputs.loss

        # Backward pass, then one optimizer and one scheduler step per batch
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Training Loss: {avg_train_loss:.4f}")

    # Validation phase (no gradients, dropout disabled)
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader):
            outputs = model(**_prepare_batch(batch))
            loss = outputs.loss

            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")

    # Optional: Save the model checkpoint after each epoch
    torch.save(model.state_dict(), f"t5_question_rewrite_epoch_{epoch + 1}.pt")

print("Training complete!")


In [None]:
import sacrebleu

def compute_bleu(predictions, references):
    """Return the corpus-level sacreBLEU score of `predictions`.

    `references` is handed to sacreBLEU as a single reference stream, i.e.
    one reference string per prediction.
    """
    reference_streams = [references]
    return sacrebleu.corpus_bleu(predictions, reference_streams).score

# Collect predictions and references
predictions = []
references = []

model.eval()  # make sure dropout is off even when this cell is run on its own
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # generate() stops after 20 tokens unless told otherwise, which cuts
        # longer questions short. Allow up to 128 tokens, the same limit the
        # dataset uses when tokenizing the references.
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(preds)
        references.extend(refs)

# Evaluate BLEU score
bleu_score = compute_bleu(predictions, references)
print(f"BLEU Score: {bleu_score:.2f}")


In [None]:
from rouge_score import rouge_scorer

def compute_rouge(predictions, references):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over prediction/reference pairs.

    Args:
        predictions: List of generated strings.
        references: List of reference strings, aligned with `predictions`.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to the mean F-measure.
        All three are 0.0 when there are no predictions.

    Raises:
        ValueError: If the two lists differ in length (the averages would
            otherwise be computed over a silently truncated set of pairs).
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']

    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )
    if not predictions:
        # Nothing to score; avoid dividing by zero below.
        return {name: 0.0 for name in metric_names}

    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    totals = dict.fromkeys(metric_names, 0.0)

    for pred, ref in zip(predictions, references):
        # RougeScorer.score takes the reference (target) first, then the prediction.
        pair_scores = scorer.score(ref, pred)
        for name in metric_names:
            totals[name] += pair_scores[name].fmeasure

    return {name: total / len(predictions) for name, total in totals.items()}

# Evaluate ROUGE score
# Uses the `predictions` / `references` lists built by an earlier cell.
rouge_scores = compute_rouge(predictions, references)
print(f"ROUGE Scores: {rouge_scores}")


In [None]:
def compute_accuracy(predictions, references):
    """Fraction of predictions that are string-identical to their reference.

    Args:
        predictions: List of generated strings.
        references: List of reference strings, aligned with `predictions`.

    Returns:
        Exact-match rate in [0, 1]; 0.0 when there are no predictions.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length "
            f"(got {len(predictions)} and {len(references)})"
        )

    total = len(predictions)
    if total == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    correct = sum(p == r for p, r in zip(predictions, references))
    return correct / total

# Evaluate accuracy
# (exact string match between each prediction and its reference; printed as a
# fraction with two decimals, not as a percentage)
accuracy = compute_accuracy(predictions, references)
print(f"Accuracy: {accuracy:.2f}")


In [None]:
# After the validation loop: regenerate predictions and report all metrics
model.eval()
predictions = []
references = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # generate() stops after 20 tokens unless told otherwise, which cuts
        # longer questions short. Allow up to 128 tokens, the same limit the
        # dataset uses when tokenizing the references.
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(preds)
        references.extend(refs)

# Calculate and print evaluation metrics
bleu_score = compute_bleu(predictions, references)
rouge_scores = compute_rouge(predictions, references)
accuracy = compute_accuracy(predictions, references)

print(f"BLEU Score: {bleu_score:.2f}")
print(f"ROUGE Scores: {rouge_scores}")
print(f"Accuracy: {accuracy:.2f}")


In [None]:
# Decoded model inputs, model outputs and decoded labels for the validation set
originals = []

model.eval()
predictions = []
references = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Collect originals: these are the decoded *model inputs*, i.e. the
        # "fix: "-prefixed disfluent questions as fed to the model.
        originals_batch = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
        originals.extend(originals_batch)

        # Generate predictions.
        # generate() stops after 20 tokens unless told otherwise, which cuts
        # longer questions short. Allow up to 128 tokens, the same limit the
        # dataset uses when tokenizing the references.
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        predictions.extend(preds)
        references.extend(refs)

# Now you have originals, predictions, and references for analysis or visualization


In [None]:
import matplotlib.pyplot as plt
import textwrap

def visualize_predictions(originals, predictions, references, num_samples=10):
    """Show input / prediction / reference texts side by side, one row per sample.

    Args:
        originals: List of input strings.
        predictions: List of generated strings.
        references: List of reference strings.
        num_samples: Maximum number of rows to draw. Fewer rows are drawn when
            any of the lists is shorter; nothing is drawn when no sample is
            available.
    """
    # Never index past the end of the shortest list.
    num_samples = min(num_samples, len(originals), len(predictions), len(references))
    if num_samples <= 0:
        return

    # squeeze=False keeps `axs` two-dimensional even for a single row, so the
    # axs[i, j] indexing below works for num_samples == 1 as well.
    fig, axs = plt.subplots(num_samples, 3, figsize=(15, 2 * num_samples),
                            sharex=True, sharey=True, squeeze=False)

    def wrap_text(text, width=50):
        return '\n'.join(textwrap.wrap(text, width=width))

    columns = (('Original', originals), ('Prediction', predictions), ('Reference', references))

    for i in range(num_samples):
        for j, (title, texts) in enumerate(columns):
            axs[i, j].text(0.5, 0.5, wrap_text(texts[i]), fontsize=10, ha='center', va='center')
            axs[i, j].set_title(title)
            axs[i, j].axis('off')  # text-only panels: hide ticks and frame

    plt.tight_layout()
    plt.show()

# Example usage: pass the first ten entries of each list
# (`originals` holds the decoded model inputs collected in the previous cell)
visualize_predictions(originals[:10], predictions[:10], references[:10])



In [None]:
import matplotlib.pyplot as plt
import textwrap

def visualize_predictions(originals, predictions, references, num_samples=10):
    """Show input / prediction / reference texts side by side, one row per sample.

    Args:
        originals: List of input strings.
        predictions: List of generated strings.
        references: List of reference strings.
        num_samples: Maximum number of rows to draw. Fewer rows are drawn when
            any of the lists is shorter; nothing is drawn when no sample is
            available.
    """
    # Never index past the end of the shortest list.
    num_samples = min(num_samples, len(originals), len(predictions), len(references))
    if num_samples <= 0:
        return

    # squeeze=False keeps `axs` two-dimensional even for a single row, so the
    # axs[i, j] indexing below works for num_samples == 1 as well.
    fig, axs = plt.subplots(num_samples, 3, figsize=(18, 3 * num_samples),
                            sharex=True, sharey=True, squeeze=False)

    def wrap_text(text, width=50):
        return '\n'.join(textwrap.wrap(text, width=width))

    columns = (('Original', originals), ('Prediction', predictions), ('Reference', references))

    for i in range(num_samples):
        for j, (title, texts) in enumerate(columns):
            axs[i, j].text(0.5, 0.5, wrap_text(texts[i]), fontsize=10,
                           ha='center', va='center', wrap=True)
            axs[i, j].set_title(title)
            axs[i, j].axis('off')  # text-only panels: hide ticks and frame

    plt.tight_layout()
    plt.show()

# Example usage with corrected data: draw three rows from the full lists
visualize_predictions(originals, predictions, references, num_samples=3)

