In [None]:
# Install the third-party packages this notebook depends on.
# `transformers` supplies the T5 model/tokenizer classes imported below;
# `sentencepiece` is presumably the tokenizer backend T5Tokenizer needs — confirm.
!pip install transformers
!pip install sentencepiece

In [None]:
import pandas as pd
import torch
import tensorflow as tf
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Adafactor, AdamW

In [None]:
# Read the line-delimited JSON reviews file into a DataFrame
df = pd.read_json(
    '/content/drive/MyDrive/Assignments/Project/reviews.json',
    lines=True,
)

# Model input is "<summary>. <reviewText>"; a missing part is treated as an empty string
summary_part = df['summary'].fillna('')
review_part = df['reviewText'].fillna('')
df['text'] = summary_part + '. ' + review_part


def _rating_to_sentiment(rating):
    """Map a star rating to a polarity label: >3 positive, ==3 neutral, anything else negative."""
    if rating > 3:
        return 'positive'
    if rating == 3:
        return 'neutral'
    return 'negative'


# Features and sentiment-polarity targets
X_data = df['text']
y_data = df['overall'].apply(_rating_to_sentiment)

# From here on `df` holds only the two columns used downstream
df = pd.DataFrame({'text': X_data, 'label': y_data})

# Class distribution of the derived labels
label_counts_df = df['label'].value_counts()
print(label_counts_df)

positive    33516
negative    11530
neutral      4954
Name: label, dtype: int64


In [None]:
# Work on a fixed-size, seeded random subset of the reviews
SUBSET_SIZE = 1000
df = df.sample(n=SUBSET_SIZE, random_state=42)

In [None]:
# Hold out 20% of the sampled reviews as the test set (80:20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Carve a validation set (20% of the remaining rows) out of the training portion.
# This split is seeded like the first one: without a random_state the validation
# partition would differ on every run, so results could not be reproduced.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Load the pretrained 't5-small' checkpoint: tokenizer first, then the seq2seq model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Use the GPU whenever one is visible; otherwise the model stays where it was loaded
model = model.to("cuda") if torch.cuda.is_available() else model

# Turn a frame of reviews into token-id tensors for T5
def process_data(df):
    """Tokenize review texts and their labels with the notebook-level ``tokenizer``.

    Args:
        df: Table exposing ``df['text']`` and ``df['label']`` columns. Each text
            is prefixed with ``"review: "``; each label is converted with ``str``.

    Returns:
        ``(input_ids, target_ids)``: the ``input_ids`` of the tokenizer output for
        the prefixed texts (requested with ``padding='max_length'`` and
        ``max_length=512``) and for the labels (requested with ``padding=True``).
    """
    # Options common to both tokenizer calls
    shared_kwargs = dict(truncation=True, max_length=512, return_tensors="pt")

    prompts = []
    for review in df['text'].values:
        prompts.append("review: " + review)
    labels_as_text = list(map(str, df['label'].values))

    encoded_inputs = tokenizer(prompts, padding='max_length', **shared_kwargs)
    encoded_targets = tokenizer(labels_as_text, padding=True, **shared_kwargs)

    return encoded_inputs.input_ids, encoded_targets.input_ids

# Prepare data: tokenize the training and test frames into id tensors
train_input_ids, train_target_ids = process_data(train_df)
test_input_ids, test_target_ids = process_data(test_df)

# Prepare data loaders
train_dataset = TensorDataset(train_input_ids, train_target_ids)
test_dataset = TensorDataset(test_input_ids, test_target_ids)

# Training batches are reshuffled every epoch: DataLoader defaults to shuffle=False,
# which would replay the exact same batch order for all epochs. The seeded generator
# keeps the shuffling repeatable across Restart & Run All.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
# Evaluation needs no shuffling
test_loader = DataLoader(test_dataset, batch_size=16)

# Define Loss and Optimizer
# NOTE: `criterion` is not used by the loop below — the model returns its own
# loss when `labels` is passed. It is kept only so the name stays defined.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Send batches to whatever device the model lives on (resolved once, not per batch)
device = next(model.parameters()).device
pad_token_id = tokenizer.pad_token_id

# Training loop
for epoch in range(10):
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch[0].to(device)
        target_ids = batch[1].to(device)

        # Inputs are padded out to a fixed length; without a mask the encoder
        # attends to the padding positions as if they were real tokens.
        attention_mask = input_ids.ne(pad_token_id).long()
        # Label positions set to -100 are ignored by the model's loss, so padded
        # target positions do not contribute to the gradient. A masked copy is
        # used so the original target ids remain decodable.
        labels = target_ids.masked_fill(target_ids == pad_token_id, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

100%|██████████| 50/50 [00:26<00:00,  1.90it/s]
100%|██████████| 50/50 [00:24<00:00,  2.03it/s]
100%|██████████| 50/50 [00:24<00:00,  2.06it/s]
100%|██████████| 50/50 [00:24<00:00,  2.03it/s]
100%|██████████| 50/50 [00:24<00:00,  2.01it/s]
100%|██████████| 50/50 [00:24<00:00,  2.03it/s]
100%|██████████| 50/50 [00:24<00:00,  2.04it/s]
100%|██████████| 50/50 [00:24<00:00,  2.03it/s]
100%|██████████| 50/50 [00:24<00:00,  2.04it/s]
100%|██████████| 50/50 [00:24<00:00,  2.02it/s]


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np

# Evaluation: generate a label string for every test review and collect it
# alongside the decoded ground-truth label.
model.eval()
all_predictions = []
all_targets = []

# Send batches to whatever device the model lives on
device = next(model.parameters()).device

for batch in tqdm(test_loader):
    input_ids = batch[0].to(device)
    target_ids = batch[1].to(device)

    # Inputs are padded to a fixed length; the mask tells generate() which
    # positions are real tokens so padding does not influence the output.
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)

    predictions = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in outputs]
    targets = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in target_ids]

    all_predictions.extend(predictions)
    all_targets.extend(targets)


100%|██████████| 13/13 [00:02<00:00,  5.13it/s]


In [None]:
# Headline metrics on the collected test predictions, printed one per line
for title, metric in (("Confusion Matrix:", confusion_matrix), ("Accuracy:", accuracy_score)):
    print(title, metric(all_targets, all_predictions))

Confusion Matrix: [[ 25   1  19]
 [  1   6  16]
 [  1   1 130]]
Accuracy: 0.805


In [None]:
# Multi-class precision / recall / F1, each reported with the "weighted" average
weighted_metrics = {
    "Precision:": precision_score,
    "Recall:": recall_score,
    "F1 Score:": f1_score,
}
for caption, score_fn in weighted_metrics.items():
    print(caption, score_fn(all_targets, all_predictions, average='weighted'))

Precision: 0.8145833333333333
Recall: 0.805
F1 Score: 0.7785439068100359


In [None]:
# AUC for multi-class sentiment analysis via one-vs-all binarization of the labels.
# NOTE(review): the scores fed to roc_auc_score are hard 0/1 predictions, not
# probabilities, so this measures per-class separation of the decoded labels only.
unique_labels = np.unique(all_targets)
targets_arr = np.asarray(all_targets)
predictions_arr = np.asarray(all_predictions)

# One indicator column per known label. The model generates free text, so a
# prediction may be a string that is not a known label; comparing against the
# label vector gives such a row all zeros instead of raising IndexError the way
# a `np.where(...)[0][0]` lookup does.
targets_binarized = (targets_arr[:, None] == unique_labels[None, :]).astype(int)
predictions_binarized = (predictions_arr[:, None] == unique_labels[None, :]).astype(int)

# Integer class indices, kept for inspection; -1 marks a prediction outside the label set
targets_int = targets_binarized.argmax(axis=1).tolist()
predictions_int = np.where(
    predictions_binarized.any(axis=1), predictions_binarized.argmax(axis=1), -1
).tolist()

# Compute AUC for each class and average, weighted by class support
auc = roc_auc_score(targets_binarized, predictions_binarized, multi_class='ovr', average='weighted')
print("AUC:", auc)

AUC: 0.7305457284061794


In [None]:
# Encode the validation frame the same way as the train/test frames and batch it
val_tensors = process_data(val_df)
val_input_ids, val_target_ids = val_tensors
val_dataset = TensorDataset(*val_tensors)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)


In [None]:
from sklearn.model_selection import KFold
import numpy as np

# process_data() is defined once, in the model-setup cell above; it is not
# re-declared here so the notebook has a single definition of it.

# Prepare data: encode the whole sampled DataFrame once. Each fold indexes into
# these tensors, so the names `input_ids` / `target_ids` must keep pointing at
# the full dataset for the entire loop.
input_ids, target_ids = process_data(df)  # df is the DataFrame containing 'text' and 'label'

# Prepare KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pad_token_id = tokenizer.pad_token_id

for fold, (train_index, val_index) in enumerate(kf.split(input_ids)):
    print(f"Fold {fold + 1}")

    train_inputs, val_inputs = input_ids[train_index], input_ids[val_index]
    train_targets, val_targets = target_ids[train_index], target_ids[val_index]

    # Prepare data loaders
    train_dataset = TensorDataset(train_inputs, train_targets)
    val_dataset = TensorDataset(val_inputs, val_targets)

    # Reshuffle training batches every epoch; the seeded generator keeps runs repeatable
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True,
        generator=torch.Generator().manual_seed(42),
    )
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Initialize a fresh pretrained model for each fold
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    model = model.to(device)

    # Define Loss and Optimizer
    # NOTE: `criterion` is unused — the model returns its own loss when `labels` is passed.
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop for each fold
    for epoch in range(10):
        model.train()
        for batch in tqdm(train_loader):
            optimizer.zero_grad()

            # Batch tensors get their own names. Rebinding `input_ids` /
            # `target_ids` here would replace the full-dataset tensors with a
            # single batch and make the next fold's indexing fail.
            batch_input_ids = batch[0].to(device)
            batch_target_ids = batch[1].to(device)

            # Mask out padding on the encoder side, and set padded label
            # positions to -100 so the model's loss ignores them.
            batch_attention_mask = batch_input_ids.ne(pad_token_id).long()
            batch_labels = batch_target_ids.masked_fill(batch_target_ids == pad_token_id, -100)

            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                labels=batch_labels,
            )
            loss = outputs.loss

            loss.backward()
            optimizer.step()



In [None]:
def _train_fold_model(train_loader, device, pad_token_id, epochs=10, lr=0.001):
    """Fine-tune a fresh 't5-small' on one fold's training batches and return it.

    Args:
        train_loader: DataLoader yielding ``(input_ids, target_ids)`` batches.
        device: torch device the model and batches are moved to.
        pad_token_id: tokenizer padding id, used to build the attention mask and
            to mask padded label positions with -100.
        epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
    """
    fold_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
    fold_optimizer = torch.optim.Adam(fold_model.parameters(), lr=lr)

    for _ in range(epochs):
        fold_model.train()
        for batch in tqdm(train_loader):
            fold_optimizer.zero_grad()
            batch_inputs = batch[0].to(device)
            batch_targets = batch[1].to(device)
            loss = fold_model(
                input_ids=batch_inputs,
                attention_mask=batch_inputs.ne(pad_token_id).long(),
                labels=batch_targets.masked_fill(batch_targets == pad_token_id, -100),
            ).loss
            loss.backward()
            fold_optimizer.step()

    return fold_model


def _predict_labels(fold_model, loader, device, pad_token_id):
    """Generate a label string per example in ``loader``.

    Returns:
        ``(predictions, targets)``: two equally long lists of decoded strings.
    """
    fold_model.eval()
    predictions, targets = [], []
    for batch in tqdm(loader):
        batch_inputs = batch[0].to(device)
        batch_targets = batch[1].to(device)

        with torch.no_grad():
            generated = fold_model.generate(
                input_ids=batch_inputs,
                attention_mask=batch_inputs.ne(pad_token_id).long(),
            )

        predictions.extend(
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated
        )
        targets.extend(
            tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in batch_targets
        )
    return predictions, targets


# Metrics storage for each fold
all_fold_metrics = []

cv_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cv_pad_token_id = tokenizer.pad_token_id

# Encode the full sampled DataFrame under cell-local names, so that batch-level
# variables can never shadow the tensors the fold indices refer to.
cv_input_ids, cv_target_ids = process_data(df)

for fold, (train_index, val_index) in enumerate(kf.split(cv_input_ids)):
    print(f"Fold {fold + 1}")

    # Each fold gets its own loaders and its own freshly trained model. Evaluating
    # after a separate training loop would score only the last fold's model, on
    # the last fold's validation data, once per iteration.
    train_loader = DataLoader(
        TensorDataset(cv_input_ids[train_index], cv_target_ids[train_index]),
        batch_size=16,
        shuffle=True,
        generator=torch.Generator().manual_seed(42),
    )
    val_loader = DataLoader(
        TensorDataset(cv_input_ids[val_index], cv_target_ids[val_index]),
        batch_size=16,
    )

    model = _train_fold_model(train_loader, cv_device, cv_pad_token_id)

    # Validation loop for this fold
    all_predictions, all_targets = _predict_labels(model, val_loader, cv_device, cv_pad_token_id)

    # Calculate metrics
    accuracy = accuracy_score(all_targets, all_predictions)
    precision = precision_score(all_targets, all_predictions, average='weighted')
    recall = recall_score(all_targets, all_predictions, average='weighted')
    f1 = f1_score(all_targets, all_predictions, average='weighted')

    print(f"Fold {fold + 1} Validation Metrics: ")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Store fold metrics (single-element lists, matching the existing layout)
    fold_metrics = {
        'accuracy': [accuracy],
        'precision': [precision],
        'recall': [recall],
        'f1_score': [f1],
    }
    all_fold_metrics.append(fold_metrics)

# Averaging the metrics over all folds
metric_names = ('accuracy', 'precision', 'recall', 'f1_score')
average_metrics = {key: np.mean([m[key] for m in all_fold_metrics]) for key in metric_names}
print("Average Metrics Across All Folds: ", average_metrics)
