# Importing

In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,LearningRateScheduler
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional,Input,GlobalMaxPooling1D,BatchNormalization,Concatenate,Conv1D
import gensim
import re
import nltk
import string
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertConfig,BertModel
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange  
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.tree import Tree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

# Data Loading and Preprocessing

In [29]:
df = pd.read_csv('/kaggle/input/traintest/train.csv')
df

Unnamed: 0,SampleID,Discussion,Category
0,1,"Without sitting down and doing it manually, yo...",Sports
1,2,All your Search ends with this link.,STEM
2,3,"No, the program you're using is made to be com...",STEM
3,4,Mike Woicik\n\nThe correct answer is: Mike Woi...,Sports
4,5,"No, but not because of why you might think. Wh...",Politics
...,...,...,...
24984,24985,He's got more pull with the horses than most j...,Sports
24985,24986,Yes he did for a big juicy cheeseburger with f...,Market & Economy
24986,24987,I'm not.,Market & Economy
24987,24988,It is sexual harassment because it is offensiv...,Politics


## Normal Preprocessing

In [None]:
# df['Discussion'].dropna()
# category_mapping = {
#     'Politics': 0,
#     'Sports': 1,
#     'Media': 2,
#     'Market & Economy': 3,
#     'STEM': 4
# }
# df['Discussion'] = df['Discussion'].astype(str)
# df['Discussion'] = df['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
# df['Category'] = df['Category'].map(category_mapping)
# df=df.drop('SampleID',axis=1)
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# df['Discussion'] = df['Discussion'].apply(lambda text: ' '.join([word for word in word_tokenize(text) if word.lower() not in stop_words]))
# df['Discussion'][0]

## FOR BERT

In [None]:
df['Discussion'].dropna()
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))
df['Category'] = df['Category'].map(category_mapping)
df=df.drop('SampleID',axis=1)
#stop_words = set(stopwords.words('english'))
#df['Discussion'] = df['Discussion'].astype(str).apply(lambda text: re.sub(r'[^a-zA-Z\s]', ' ', text))
#df['Discussion'] = df['Discussion'].apply(lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split())))

## Augmentation

In [None]:
from nltk.corpus import wordnet

def synonym_replacement(row):
    text = row['Discussion']
    words = text.split()
    augmented_text = []
    for word in words:
        # Get synonyms for each word
        synonyms = wordnet.synsets(word)
        if synonyms:
            # Replace the word with a synonym
            synonym = synonyms[0].lemmas()[0].name()
            augmented_text.append(synonym)
        else:
            augmented_text.append(word)
    return {'augmented': augmented_text, 'category': row['Category']}

random_sample = df.sample(n=5000, random_state=42)  
augmented_data = random_sample.apply(synonym_replacement, axis=1)
augmented_df = pd.DataFrame(augmented_data.tolist())
augmented_df['augmented'] = augmented_df['augmented'].apply(lambda x: " ".join(x))
augmented_df.rename(columns={'augmented': 'Discussion', 'category': 'Category'}, inplace=True)
df = pd.concat([df, augmented_df], ignore_index=True)

In [None]:
df = pd.concat([df, augmented_df], ignore_index=True)

## Aggressive preprocessing

In [None]:
##############Aggressive Preprocessing##################
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    # Remove NaN values
    if pd.isnull(text):
        return ""
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Tokenize and remove stopwords
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    words = [word for word in words if word not in stop_words]
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    # Join back into a single string
    return ' '.join(words)

# Apply preprocessing to the 'Discussion' column
df['Discussion']=df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].dropna().apply(preprocess_text)

# Map categories
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Category'] = df['Category'].map(category_mapping)

# Drop the 'SampleID' column
df = df.drop('SampleID', axis=1)
df['Discussion'][0]
# Display the preprocessed DataFrame


## Tokenizer

In [None]:
#####Tokenization########
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 30000
oov_token = "<OOV>"
padding_type = "post"
trunc_type = "post"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(df['Discussion'])
training_sequences = tokenizer.texts_to_sequences(df['Discussion'])
training_padded = pad_sequences(training_sequences, maxlen=100, padding=padding_type, truncating=trunc_type)
X=np.array(training_padded)
labels = np.array(df['Category'])


## Splitting

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(X, labels, test_size=0.2, random_state=42,stratify=labels)


## Pre-trained Embedding

In [None]:
# # Function to load GloVe vectors
# def load_glove_vectors(filepath):
#     embeddings_index = {}
#     with open(filepath, encoding="utf8") as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             vector = np.asarray(values[1:], dtype="float32")
#             embeddings_index[word] = vector
#     return embeddings_index

# # Load GloVe embeddings
# glove_path = "/kaggle/input/glove-300d/glove.6B.300d.txt"
# glove_vectors = load_glove_vectors(glove_path)



In [None]:
# # Parameters
# vocab_size = len(tokenizer.word_index) + 1
# embedding_dim = 300

# # Initialize embedding matrix
# embedding_matrix = np.zeros((vocab_size, embedding_dim))

# # Populate embedding matrix
# for word, i in tokenizer.word_index.items():
#     if word in glove_vectors:
#         embedding_matrix[i] = glove_vectors[word]

In [None]:
from gensim.models import KeyedVectors
model=KeyedVectors.load_word2vec_format("/kaggle/input/google-news-300/GoogleNews-vectors-negative300.bin", binary=True)

In [None]:
# embedding_dim = 300
# word_index = tokenizer.word_index
# embedding_matrix = np.zeros((vocab_size, embedding_dim))
# for word, i in word_index.items():
#     if i < vocab_size and word in model.key_to_index:
#         embedding_matrix[i] = model[word]

In [None]:
##############################cond2#################
vocab_size = len(tokenizer.word_index) + 1

embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = model[word]
    except KeyError:
        pass

# Models

## BILSTM

In [None]:
##########################BILSTM################71.444score####30000vocab####300 shape ,input_length100 ####
##################71.5+ score #### Model1 Archi######
model2 = Sequential([
    Input(shape=(100,),dtype=np.int32),
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=100,
              trainable=False), 
    Bidirectional(LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    GlobalAveragePooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(5, activation='softmax')  
])
model2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  
              metrics=['accuracy'])
model2.summary()
history = model2.fit(
    x_train, 
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,  
    batch_size=128,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]
)

In [None]:
##########Fine Tuning##############
for layer in model2.layers:
    if isinstance(layer, Embedding):
        layer.trainable = True
history = model2.fit(
    x_train, 
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,  
    batch_size=128,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
        LearningRateScheduler(lambda epoch, lr: lr * 0.1 if epoch > 5 else lr)
    ]
)

In [None]:
model2.evaluate(x_val,y_val)

## GRU

In [None]:
########################Archi2#################
model4=Sequential([
    Input(shape=(100,),dtype=np.int32),
    Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=100,  # Adjust the input length to match your data
    trainable=False),
    GRU(256,return_sequences=True),
    Dropout(0.3),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(5, activation='softmax')
    
])

In [None]:
model4.summary()
model4.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
historyy=model4.fit(x_train, 
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,  
    batch_size=128,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ])

In [None]:
for layer in model4.layers:
    if isinstance(layer, Embedding):
        layer.trainable = True
history2 = model4.fit(
    x_train, 
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,  
    batch_size=128,
    callbacks=[
        EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
        LearningRateScheduler(lambda epoch, lr: lr * 0.1 if epoch > 5 else lr)
    ]
)

In [None]:
model4.evaluate(x_val,y_val)

## BERT

In [None]:
try:
  import transformers
except:
  print("Installing transformers")
  !pip -q install transformers

In [None]:
sentences = df.Discussion.values

# Adding CLS and SEP tokens at the beginning and end of each sentence for BERT
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.Category.values

In [None]:
from transformers import BertTokenizer
try:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    print("Tokenizer downloaded successfully.")
except Exception as e:
    print("An error occurred while downloading the tokenizer.")
    print(str(e))
    import traceback
    print(traceback.format_exc())

In [None]:
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

In [None]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway.
# In the original paper, the authors used a length of 512.
MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [None]:
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)
# Torch tensors are the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
# Initializing a BERT bert-base-uncased style configuration
from transformers import BertModel, BertConfig
configuration = BertConfig()

# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)

# Accessing the model configuration
configuration = model.config

print(configuration)

In [None]:
import torch

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
model = nn.DataParallel(model)
model.to(device)

In [None]:
#This code is taken from:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L102

# Don't apply weight decay to any parameters whose names include these tokens.
# (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
# Separate the `weight` parameters from the `bias` parameters.
# - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
# - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
optimizer_grouped_parameters = [
    # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},

    # Filter for parameters which *do* include those.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# Note - `optimizer_grouped_parameters` only includes the parameter values, not
# the names.

In [None]:
# Displaying a sample of the parameter_optimizer:  layer 3
layer_parameters = [p for n, p in model.named_parameters() if 'layer.3' in n]

In [None]:
# Displaying names of parameters for which weight decay is not applied
no_decay

In [None]:
# Displaying the list of the two dictionaries
small_sample = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)][:2],
     'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)][:2],
     'weight_decay_rate': 0.0}
]

for i, group in enumerate(small_sample):
    print(f"Group {i+1}:")
    print(f"Weight decay rate: {group['weight_decay_rate']}")
    for j, param in enumerate(group['params']):
        print(f"Parameter {j+1}: {param}")

In [None]:
#optimizer = BertAdam(optimizer_grouped_parameters,
#                      lr=2e-5,
#                      warmup=.1)

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3
#optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)
optimizer = AdamW(optimizer_grouped_parameters,
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                 )
# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` contains batched data so `len(train_dataloader)` gives
# us the number of batches.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
#Creating the Accuracy Measurement Function
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
from tqdm import trange
import torch

# Initialize lists to track metrics
train_loss_set = []

# Training loop over epochs
for epoch in trange(epochs, desc="Epoch"):

    # Set the model to training mode
    model.train()

    # Initialize training loss
    tr_loss = 0
    nb_tr_steps = 0

    for step, batch in enumerate(train_dataloader):
        # Move batch data to the specified device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Clear out gradients
        optimizer.zero_grad()

        # Forward pass with loss computation
        outputs = model(input_ids=b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)

        # Ensure loss is reduced to a scalar
        loss = outputs.loss
        if loss.dim() > 0:  # If loss has multiple elements, reduce it
            loss = loss.mean()

        # Append the scalar loss for tracking
        train_loss_set.append(loss.item())

        # Backward pass for gradients
        loss.backward()

        # Update model parameters
        optimizer.step()

        # Update the learning rate
        scheduler.step()

        # Accumulate training loss
        tr_loss += loss.item()
        nb_tr_steps += 1

    # Print average training loss for the epoch
    print(f"Train loss: {tr_loss / nb_tr_steps:.4f}")

    # Validation phase
    model.eval()

    # Initialize validation metrics
    eval_loss = 0
    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        # Move batch data to the specified device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Disable gradient computation for evaluation
        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits  # Extract logits

        # Move logits and labels to CPU for metric calculation
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # Compute accuracy for the current batch
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    # Print validation accuracy for the epoch
    print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")


In [None]:
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

# Test submission for BERT

In [None]:
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')

test_df['Discussion']=test_df['Discussion'].astype(str)
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))

In [None]:
test_df

In [None]:
###same tokenizer###
# Add CLS and SEP tokens to each sentence in the test data
test_sentences = test_df['Discussion'].values
test_sentences = ["[CLS] " + sentence + " [SEP]" for sentence in test_sentences]

# Tokenize the test sentences using the same tokenizer
test_tokenized_texts = [tokenizer.tokenize(sent) for sent in test_sentences]

# Convert tokens to input IDs
test_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in test_tokenized_texts]

# Pad and truncate sequences to MAX_LEN
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
test_attention_masks = []
for seq in test_input_ids:
    seq_mask = [float(i > 0) for i in seq]
    test_attention_masks.append(seq_mask)

# Convert to Torch tensors
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)


In [None]:
# predictions = model.predict(X_test)
# predictions
# Create TensorDataset and DataLoader for test data
test_data = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)



In [None]:
# predicted_class = predictions.argmax(axis=-1)
# predicted_class
# Set the model to evaluation mode
model.eval()

# Store predictions
predictions = []

# Predict
with torch.no_grad():
    for batch in test_dataloader:
        # Move inputs to the same device as the model
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask)

        # Get the logits and move to CPU
        logits = outputs.logits.detach().cpu().numpy()

        # Use argmax to get the predicted label for each instance
        batch_predictions = np.argmax(logits, axis=1)
        predictions.extend(batch_predictions)


In [None]:
len(predictions)

In [None]:
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  
    "Category": predictions           
})
print(submit.head())


In [None]:
submit.to_csv("/kaggle/working/bfinal.csv", index = False)

# Test submission

In [None]:
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')
df['Discussion']=df['Discussion'].astype(str)
test_df['Discussion'] = test_df['Discussion'].dropna().apply(preprocess_text)
# test_df['Discussion'] = test_df['Discussion'].astype(str)
# test_df['Discussion'] = test_df['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
# test_df['Discussion'] = test_df['Discussion'].apply(lambda text: ' '.join([word for word in word_tokenize(text) if word.lower() not in stop_words]))

In [None]:
test_df

In [None]:
sequences_test = tokenizer.texts_to_sequences(test_df['Discussion'])
padded_sequences_test = pad_sequences(sequences_test, maxlen=100, padding='post')

X_test = np.array(padded_sequences_test)

In [None]:
predictions = model2.predict(X_test)
predictions



In [None]:
predicted_class = predictions.argmax(axis=-1)
predicted_class

In [None]:
len(predicted_class)

In [None]:
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  
    "Category": predicted_class  
})
submit.to_csv("/kaggle/working/bilstmagrglo.csv", index = False)

# RoBERTa

In [None]:
# Import necessary libraries
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import trange
import numpy as np
import re
from nltk.corpus import stopwords


# Preprocessing
df['Discussion'].dropna(inplace=True)
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)
# stop_words = set(stopwords.words('english'))
# df['Discussion'] = df['Discussion'].apply(
#     lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split()))
# )

sentences = df.Discussion.values
labels = df.Category.values

# Tokenization using RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the sentences and pad sequences
MAX_LEN = 128
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=MAX_LEN,       # Pad or truncate
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',      # Return pytorch tensors
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2021, test_size=0.1
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2021, test_size=0.1
)

# Create DataLoader
batch_size = 32
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load RoBERTa model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=5,  # Number of categories
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
from transformers import get_scheduler

epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=int(total_steps * 0.1),  # 10% warm-up steps
    num_training_steps=total_steps,
)

# Accuracy calculation
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in trange(epochs, desc="Epoch"):
    # Training phase
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")


In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import trange
import numpy as np
import re

# Preprocessing
df['Discussion'].dropna(inplace=True)
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)

sentences = df.Discussion.values
labels = df.Category.values

# Tokenization using RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize the sentences and pad sequences
MAX_LEN = 128
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2021, test_size=0.1
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2021, test_size=0.1
)

# Create DataLoader
batch_size = 16  # Decrease batch size for better generalization

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load RoBERTa model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=5,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)
epochs = 6  # Train for more epochs

# Scheduler with warm-up
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(len(train_dataloader) * 0.1),
    num_training_steps=len(train_dataloader) * epochs,
)

# Accuracy calculation
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in trange(epochs, desc="Epoch"):
    # Training phase
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Average Training Loss: 0.8174


Epoch:  17%|█▋        | 1/6 [08:46<43:54, 526.80s/it]

Validation Accuracy: 0.7561
Average Training Loss: 0.5951


Epoch:  33%|███▎      | 2/6 [17:45<35:34, 533.59s/it]

Validation Accuracy: 0.7617
Average Training Loss: 0.4735


Epoch:  50%|█████     | 3/6 [26:43<26:47, 535.67s/it]

Validation Accuracy: 0.7661
Average Training Loss: 0.3666


Epoch:  67%|██████▋   | 4/6 [35:40<17:52, 536.40s/it]

Validation Accuracy: 0.7569
Average Training Loss: 0.2878


Epoch:  83%|████████▎ | 5/6 [44:38<08:56, 536.90s/it]

Validation Accuracy: 0.7533
Average Training Loss: 0.2379


Epoch: 100%|██████████| 6/6 [53:36<00:00, 536.02s/it]

Validation Accuracy: 0.7557





In [None]:
# Import necessary libraries
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
import numpy as np
import re
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from tqdm import tqdm

# Load the test dataset
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')

# Preprocessing
test_df['Discussion'] = test_df['Discussion'].astype(str)
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
test_df['Discussion'] = test_df['Discussion'].apply(
    lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split()))
)

# Tokenize test sentences
test_sentences = test_df['Discussion'].values
test_encoded = tokenizer(
    list(test_sentences),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Extract input IDs and attention masks
test_input_ids = test_encoded['input_ids']
test_attention_masks = test_encoded['attention_mask']

# Prepare the DataLoader for inference
test_data = TensorDataset(test_input_ids, test_attention_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Perform inference
model.eval()
predictions = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Inference", leave=False):
        b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits.detach().cpu().numpy()
        batch_predictions = np.argmax(logits, axis=1)
        predictions.extend(batch_predictions)

# Create submission DataFrame
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  # Ensure SampleID column exists in test_df
    "Category": predictions
})

# Save the submission to a CSV file
submit.to_csv("/kaggle/working/disrobta3.csv", index=False)
print(submit.head())


# DistilBERT

In [None]:
# Import necessary libraries
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import trange
import numpy as np
import re
from nltk.corpus import stopwords

# Preprocessing
df['Discussion'].dropna(inplace=True)
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)
stop_words = set(stopwords.words('english'))
df['Discussion'] = df['Discussion'].apply(
    lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split()))
)

sentences = df.Discussion.values
labels = df.Category.values

# Tokenization using DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the sentences and pad sequences
MAX_LEN = 100
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=MAX_LEN,       # Pad or truncate
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',      # Return pytorch tensors
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2021, test_size=0.1
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2021, test_size=0.1
)

# Create DataLoader
batch_size = 32
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load DistilBERT model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=5,  # Number of categories
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
from transformers import get_scheduler

epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Accuracy calculation
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in trange(epochs, desc="Epoch"):
    # Training phase
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")


# RoBERTa Large

In [None]:
# Import necessary libraries
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import trange
import numpy as np
import re
from nltk.corpus import stopwords

# Preprocessing
df['Discussion'].dropna(inplace=True)
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)
stop_words = set(stopwords.words('english'))
df['Discussion'] = df['Discussion'].apply(
    lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split()))
)

sentences = df.Discussion.values
labels = df.Category.values

# Tokenization using RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Tokenize the sentences and pad sequences
MAX_LEN = 100
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=MAX_LEN,       # Pad or truncate
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',      # Return pytorch tensors
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2021, test_size=0.1
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2021, test_size=0.1
)

# Create DataLoader
batch_size = 32
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load RoBERTa model for sequence classification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=5,  # Number of categories
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
from transformers import get_scheduler

epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Accuracy calculation
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training loop
for epoch in trange(epochs, desc="Epoch"):
    # Training phase
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    eval_accuracy = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        eval_accuracy += flat_accuracy(logits, label_ids)
        nb_eval_steps += 1

    print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")


# Testsubmission for BERT optimized

In [None]:
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')
test_df['Discussion']=test_df['Discussion'].astype(str)
test_df['Discussion'] = test_df['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split())))

In [None]:
test_sentences = test_df['Discussion'].values
test_encoded = tokenizer(
    list(test_sentences),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Extract input IDs and attention masks
test_input_ids = test_encoded['input_ids']
test_attention_masks = test_encoded['attention_mask']

In [None]:
test_data = TensorDataset(test_input_ids, test_attention_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Set the model to evaluation mode
model.eval()

# Perform inference
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits.detach().cpu().numpy()
        batch_predictions = np.argmax(logits, axis=1)
        predictions.extend(batch_predictions)


In [None]:
len(predictions)

In [None]:
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  
    "Category": predictions           
})
print(submit.head())


In [None]:
submit.to_csv("/kaggle/working/rot.csv", index = False)

In [None]:
# Preprocessing
df['Discussion'].dropna(inplace=True)
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)
# stop_words = set(stopwords.words('english'))
# df['Discussion'] = df['Discussion'].apply(
#     lambda text: ' '.join(filter(lambda word: word.lower() not in stop_words, text.split()))
# )
from transformers import get_scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sentences = df.Discussion.values
labels = df.Category.values

# Tokenization using RobertaTokenizer
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # Using 'roberta-base' for efficiency

# Tokenize the sentences and pad sequences
MAX_LEN = 128
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=MAX_LEN,       # Pad or truncate
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',      # Return pytorch tensors
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation
from sklearn.model_selection import train_test_split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2021, test_size=0.1
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2021, test_size=0.1
)

# Create DataLoader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Ensemble Models
from transformers import RobertaForSequenceClassification
import numpy as np


model_names = ['roberta-base', 'distilbert-base-uncased']
models = []

# Training hyperparameters
epochs = 3
learning_rate = 2e-5
batch_size = 32

# Train each model
for model_name in model_names:
    print(f"Training model: {model_name}")
    model = RobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=5,  # Number of categories
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop
    for epoch in trange(epochs, desc=f"Epochs for {model_name}"):
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average Training Loss for {model_name}: {avg_train_loss:.4f}")

    models.append(model)

# Generate predictions for each model
def get_model_predictions(model, dataloader):
    model.eval()
    all_logits = []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)

            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            all_logits.append(logits.cpu().numpy())

    return np.concatenate(all_logits, axis=0)

ensemble_logits = []
for model in models:
    logits = get_model_predictions(model, validation_dataloader)
    ensemble_logits.append(logits)

# Average the logits for soft voting
average_logits = np.mean(ensemble_logits, axis=0)

# Get final predictions
ensemble_predictions = np.argmax(average_logits, axis=1)

# Calculate validation accuracy
label_ids = validation_labels.numpy()
accuracy = np.sum(ensemble_predictions == label_ids) / len(label_ids)
print(f"Ensemble Validation Accuracy: {accuracy:.4f}") 


# Ensemble

In [None]:
# Preprocessing
df['Discussion'].dropna(inplace=True)
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
df['Discussion'] = df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)

from transformers import get_scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sentences = df.Discussion.values
labels = df.Category.values

# Tokenization using RobertaTokenizer
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # Using 'roberta-base' for efficiency

# Tokenize the sentences and pad sequences
MAX_LEN = 512
input_ids = []
attention_masks = []

for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,  # Add [CLS] and [SEP]
        max_length=MAX_LEN,       # Pad or truncate
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',      # Return pytorch tensors
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Split the dataset into training and validation
from sklearn.model_selection import train_test_split
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=2021, test_size=0.1
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, input_ids, random_state=2021, test_size=0.1
)

# Create DataLoader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Ensemble Models
from transformers import RobertaForSequenceClassification, BertForSequenceClassification
import numpy as np

model_names = ['roberta-base', 'distilroberta-base']
models = []

# Training hyperparameters
epochs = 3
learning_rate = 2e-5
batch_size = 32
for model_name in model_names:
    print(f"Training model: {model_name}")
    model = RobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=5,
        output_attentions=False,
        output_hidden_states=False
    )
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop without class weights
    for epoch in trange(epochs, desc=f"Epochs for {model_name}"):
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Average Training Loss for {model_name}: {avg_train_loss:.4f}")

    models.append(model)

# Generate predictions for ensemble
def get_model_predictions(model, dataloader):
    model.eval()
    all_logits = []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            all_logits.append(logits.cpu().numpy())

    return np.concatenate(all_logits, axis=0)

ensemble_logits = []
for model in models:
    logits = get_model_predictions(model, validation_dataloader)
    ensemble_logits.append(logits)

# Weighted average for soft voting
weights = [1.0, 0.8]  # Assign higher weight to the better-performing model
average_logits = np.average(ensemble_logits, axis=0, weights=weights)

# Get final predictions
ensemble_predictions = np.argmax(average_logits, axis=1)

# Calculate validation accuracy
label_ids = validation_labels.numpy()
accuracy = np.sum(ensemble_predictions == label_ids) / len(label_ids)
print(f"Ensemble Validation Accuracy: {accuracy:.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Training model: roberta-base


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epochs for roberta-base:  33%|███▎      | 1/3 [38:12<1:16:24, 2292.34s/it]

Average Training Loss for roberta-base: 0.7899


Epochs for roberta-base:  67%|██████▋   | 2/3 [1:16:38<38:20, 2300.65s/it]

Average Training Loss for roberta-base: 0.5907


Epochs for roberta-base: 100%|██████████| 3/3 [1:55:04<00:00, 2301.38s/it]

Average Training Loss for roberta-base: 0.4908
Training model: distilroberta-base





config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epochs for distilroberta-base:  33%|███▎      | 1/3 [19:16<38:33, 1156.54s/it]

Average Training Loss for distilroberta-base: 0.7836


Epochs for distilroberta-base:  67%|██████▋   | 2/3 [38:34<19:17, 1157.27s/it]

Average Training Loss for distilroberta-base: 0.6130


Epochs for distilroberta-base: 100%|██████████| 3/3 [57:51<00:00, 1157.27s/it]

Average Training Loss for distilroberta-base: 0.5316





Ensemble Validation Accuracy: 0.7703


In [None]:
# Preprocess the test dataset
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')
test_df['Discussion'] = test_df['Discussion'].astype(str)
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"http\S+|www\S+|https\S+", "<URL>", text, flags=re.MULTILINE))
#test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"<.*?>", "", text))
#test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"[^a-zA-Z0-9.,!?;\\'\"\\s]", " ", text))
test_df['Discussion'] = test_df['Discussion'].apply(lambda text: re.sub(r"\d+", "<NUMBER>", text))

# Tokenize the test sentences
test_sentences = test_df['Discussion'].values
test_encoded = tokenizer(
    list(test_sentences),
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

# Extract input IDs and attention masks
test_input_ids = test_encoded['input_ids']
test_attention_masks = test_encoded['attention_mask']

# Create DataLoader for test data
test_data = TensorDataset(test_input_ids, test_attention_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Generate predictions for each model in the ensemble
ensemble_logits_test = []

for model in models:
    model.eval()
    all_logits = []

    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)

            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            all_logits.append(logits.cpu().numpy())

    ensemble_logits_test.append(np.concatenate(all_logits, axis=0))

# Average logits for soft voting
average_logits_test = np.mean(ensemble_logits_test, axis=0)

# Get final predictions
test_predictions = np.argmax(average_logits_test, axis=1)






In [None]:
print(test_predictions)

In [None]:
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  # Ensure 'SampleID' exists in test_df
    "Category": test_predictions  # Predictions as category labels
})

# Display the first few rows of the submission DataFrame
print(submit.head())

# Save to a CSV file
submit.to_csv("/kaggle/working/trial5.csv", index=False)

In [29]:
# Preprocessing
df=df.dropna()
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Category'] = df['Category'].map(category_mapping)
df = df.drop('SampleID', axis=1)

from transformers import RobertaTokenizer, BertTokenizer
import torch

# Tokenizers
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function
def tokenize_texts(tokenizer, texts, max_len):
    input_ids = []
    attention_masks = []
    
    for sent in texts:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

MAX_LEN = 128
sentences = df.Discussion.values
labels = torch.tensor(df.Category.values)

# Tokenize for both models
roberta_input_ids, roberta_attention_masks = tokenize_texts(roberta_tokenizer, sentences, MAX_LEN)
bert_input_ids, bert_attention_masks = tokenize_texts(bert_tokenizer, sentences, MAX_LEN)

# Split the dataset
from sklearn.model_selection import train_test_split

def split_data(input_ids, attention_masks, labels):
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        input_ids, labels, random_state=2021, test_size=0.1
    )
    train_masks, val_masks, _, _ = train_test_split(
        attention_masks, input_ids, random_state=2021, test_size=0.1
    )
    return train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels

# Split for RoBERTa
roberta_train_inputs, roberta_val_inputs, roberta_train_masks, roberta_val_masks, roberta_train_labels, roberta_val_labels = split_data(
    roberta_input_ids, roberta_attention_masks, labels
)

# Split for BERT
bert_train_inputs, bert_val_inputs, bert_train_masks, bert_val_masks, bert_train_labels, bert_val_labels = split_data(
    bert_input_ids, bert_attention_masks, labels
)

from transformers import RobertaForSequenceClassification, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import get_scheduler
from tqdm import trange
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32
epochs = 4
learning_rate = 2e-5

# Function to train a model
def train_model(model, train_dataloader, optimizer, scheduler):
    model.to(device)
    model.train()
    
    for epoch in trange(epochs, desc="Epochs"):
        total_loss = 0
        
        for batch in train_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        print(f"Epoch Loss: {total_loss / len(train_dataloader):.4f}")
    return model

# Train RoBERTa
roberta_train_data = TensorDataset(roberta_train_inputs, roberta_train_masks, roberta_train_labels)
roberta_train_dataloader = DataLoader(roberta_train_data, sampler=RandomSampler(roberta_train_data), batch_size=batch_size)

roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)
optimizer = AdamW(roberta_model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(roberta_train_dataloader) * epochs)
roberta_model = train_model(roberta_model, roberta_train_dataloader, optimizer, scheduler)

# Train BERT
bert_train_data = TensorDataset(bert_train_inputs, bert_train_masks, bert_train_labels)
bert_train_dataloader = DataLoader(bert_train_data, sampler=RandomSampler(bert_train_data), batch_size=batch_size)

bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
optimizer = AdamW(bert_model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(bert_train_dataloader) * epochs)
bert_model = train_model(bert_model, bert_train_dataloader, optimizer, scheduler)
# Function to get predictions
def get_predictions(model, dataloader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            predictions.append(logits.cpu().numpy())

    return np.concatenate(predictions, axis=0)

# Validation dataloaders
roberta_val_data = TensorDataset(roberta_val_inputs, roberta_val_masks, roberta_val_labels)
roberta_val_dataloader = DataLoader(roberta_val_data, sampler=SequentialSampler(roberta_val_data), batch_size=batch_size)

bert_val_data = TensorDataset(bert_val_inputs, bert_val_masks, bert_val_labels)
bert_val_dataloader = DataLoader(bert_val_data, sampler=SequentialSampler(bert_val_data), batch_size=batch_size)

# Get logits
roberta_logits = get_predictions(roberta_model, roberta_val_dataloader)
bert_logits = get_predictions(bert_model, bert_val_dataloader)

# Weighted average
weights = [1.0, 0.5]  # Adjust based on performance
ensemble_logits = np.average([roberta_logits, bert_logits], axis=0, weights=weights)
ensemble_predictions = np.argmax(ensemble_logits, axis=1)

# Calculate accuracy
label_ids = roberta_val_labels.numpy()
accuracy = np.sum(ensemble_predictions == label_ids) / len(label_ids)
print(f"Ensemble Validation Accuracy: {accuracy:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Discussion'] = df['Discussion'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Category'] = df['Category'].map(category_mapping)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epochs:

Epoch Loss: 0.7614


Epochs:  50%|█████     | 2/4 [31:46<31:46, 953.46s/it]

Epoch Loss: 0.5583


Epochs:  75%|███████▌  | 3/4 [47:41<15:54, 954.14s/it]

Epoch Loss: 0.4447


Epochs: 100%|██████████| 4/4 [1:03:36<00:00, 954.04s/it]

Epoch Loss: 0.3642



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epochs:  25%|██▌       | 1/4 [15:52<47:37, 952.46s/it]

Epoch Loss: 0.7670


Epochs:  50%|█████     | 2/4 [31:44<31:44, 952.35s/it]

Epoch Loss: 0.5322


Epochs:  75%|███████▌  | 3/4 [47:37<15:52, 952.39s/it]

Epoch Loss: 0.3974


Epochs: 100%|██████████| 4/4 [1:03:29<00:00, 952.44s/it]

Epoch Loss: 0.3044





Ensemble Validation Accuracy: 0.7915


In [35]:
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')
test_df['Discussion'] = test_df['Discussion'].astype(str)

# Tokenize test data for RoBERTa
roberta_test_input_ids, roberta_test_attention_masks = tokenize_texts(
    roberta_tokenizer, test_df['Discussion'].values, MAX_LEN
)

# Tokenize test data for BERT
bert_test_input_ids, bert_test_attention_masks = tokenize_texts(
    bert_tokenizer, test_df['Discussion'].values, MAX_LEN
)

# Create dataloaders
roberta_test_data = TensorDataset(roberta_test_input_ids, roberta_test_attention_masks)
roberta_test_dataloader = DataLoader(roberta_test_data, sampler=SequentialSampler(roberta_test_data), batch_size=batch_size)

bert_test_data = TensorDataset(bert_test_input_ids, bert_test_attention_masks)
bert_test_dataloader = DataLoader(bert_test_data, sampler=SequentialSampler(bert_test_data), batch_size=batch_size)


In [36]:
# Get predictions for the test data
roberta_test_logits = get_predictions(roberta_model, roberta_test_dataloader)
bert_test_logits = get_predictions(bert_model, bert_test_dataloader)

# Weighted average of logits
test_ensemble_logits = np.average([roberta_test_logits, bert_test_logits], axis=0, weights=weights)

# Final predictions
test_predictions = np.argmax(test_ensemble_logits, axis=1)


In [37]:
print(test_predictions)

[3 0 1 ... 3 0 2]


In [38]:
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  # Ensure 'SampleID' exists in test_df
    "Category": test_predictions  # Predictions as category labels
})

# Display the first few rows of the submission DataFrame
print(submit.head())

# Save to a CSV file
submit.to_csv("finalfinal5.csv", index=False)

   SampleID  Category
0         1         3
1         2         0
2         3         1
3         4         4
4         5         3


In [39]:
len(submit)

10557

In [30]:
df=df.dropna()
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}
df['Discussion'] = df['Discussion'].astype(str)
df['Category'] = df['Category'].map(category_mapping)
df=df.drop('SampleID',axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Discussion'] = df['Discussion'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Category'] = df['Category'].map(category_mapping)


In [49]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, MultiHeadAttention, Dense, Dropout, LayerNormalization
from tensorflow.keras.models import Model

class PositionEmbeddingLayer(Layer):
    def __init__(self, max_length, d_model):
        super(PositionEmbeddingLayer, self).__init__()
        self.max_length = max_length
        self.d_model = d_model
        self.position_embeddings = Embedding(input_dim=self.max_length, output_dim=self.d_model)

    def call(self, inputs):
        seq_len = tf.shape(inputs)[1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.position_embeddings(position_ids)
        position_embeddings = tf.expand_dims(position_embeddings, axis=0)
        position_embeddings = tf.tile(position_embeddings, [tf.shape(inputs)[0], 1, 1])
        return position_embeddings


def bert_encoder_block(x, d_model, n_heads, d_ff, dropout=0.1, attention_mask=None):
    attention = MultiHeadAttention(
        num_heads=n_heads,
        key_dim=d_model // n_heads,
        dropout=dropout
    )(x, x, x, attention_mask=attention_mask)
    attention = Dropout(dropout)(attention)
    out1 = LayerNormalization()(x + attention)
    
    ffn_output = Dense(d_ff, activation='gelu')(out1)
    ffn_output = Dense(d_model)(ffn_output)
    ffn_output = Dropout(dropout)(ffn_output)
    
    return LayerNormalization()(out1 + ffn_output)

def build_bert_for_sequence_classification(n_layers, d_model, n_heads, d_ff, vocab_size, max_length, num_classes, dropout=0.1):
    # Input layers
    input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    token_type_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="token_type_ids")
    attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
    
    # Embeddings
    word_embeddings = Embedding(vocab_size, d_model)(input_ids)
    position_embeddings = PositionEmbeddingLayer(max_length, d_model)(input_ids)
    token_type_embeddings = Embedding(2, d_model)(token_type_ids)
    
    x = word_embeddings + position_embeddings + token_type_embeddings
    x = LayerNormalization()(x)
    x = Dropout(dropout)(x)
    
    # Transformer blocks
    for _ in range(n_layers):
        x = bert_encoder_block(x, d_model, n_heads, d_ff, dropout, attention_mask)
    
    # Classification head
    cls_token_output = x[:, 0, :]
    x = Dropout(dropout)(cls_token_output)
    logits = Dense(num_classes)(x)
    
    # Create model
    model = Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=logits,
        name="BERT_Sequence_Classifier"
    )
    return model
# Initialize parameters
MAX_LEN = 128
BATCH_SIZE = 32
N_LAYERS = 12
D_MODEL = 768
N_HEADS = 12
D_FF = 3072
VOCAB_SIZE = 30522
NUM_CLASSES = 5
DROPOUT = 0.1


# Build model
model1 = build_bert_for_sequence_classification(
    n_layers=N_LAYERS,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    d_ff=D_FF,
    vocab_size=VOCAB_SIZE,
    max_length=MAX_LEN,
    num_classes=NUM_CLASSES,
    dropout=DROPOUT
)

# Print model summary
model1.summary()


In [57]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Dropout, LayerNormalization, Lambda
from tensorflow.keras.models import Model
from transformers import RobertaTokenizer

# Parameters
MAX_LEN = 128  # Using your original max length
BATCH_SIZE = 32  # Your original batch size
N_LAYERS = 12
D_MODEL = 768
N_HEADS = 12
D_FF = 3072
NUM_CLASSES = 5
DROPOUT = 0.1

# Initialize tokenizer and get vocabulary size
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
VOCAB_SIZE = tokenizer.vocab_size

# Position embedding layer
def position_embedding_layer(inputs, max_length, d_model):
    def compute_position_embeddings(inputs):
        position_embeddings = Embedding(max_length, d_model)(tf.range(start=0, limit=tf.shape(inputs)[1], delta=1))
        position_embeddings = tf.expand_dims(position_embeddings, axis=0)
        return tf.tile(position_embeddings, [tf.shape(inputs)[0], 1, 1])

    return Lambda(compute_position_embeddings)(inputs)

# RoBERTa encoder block
def roberta_encoder_block(x, d_model, n_heads, d_ff, dropout=0.1, attention_mask=None):
    attention = MultiHeadAttention(
        num_heads=n_heads,
        key_dim=d_model // n_heads,
        dropout=dropout
    )(x, x, x, attention_mask=attention_mask)
    attention = Dropout(dropout)(attention)
    out1 = LayerNormalization()(x + attention)
    
    ffn_output = Dense(d_ff, activation='gelu')(out1)
    ffn_output = Dense(d_model)(ffn_output)
    ffn_output = Dropout(dropout)(ffn_output)
    
    return LayerNormalization()(out1 + ffn_output)

# Build RoBERTa model for sequence classification
def build_roberta_for_sequence_classification(
    n_layers, d_model, n_heads, d_ff, vocab_size, max_length, num_classes, dropout=0.1
):
    # Input layers
    input_ids = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")
    
    # Embeddings
    word_embeddings = Embedding(vocab_size, d_model)(input_ids)
    position_embeddings = position_embedding_layer(input_ids, max_length, d_model)
    x = word_embeddings + position_embeddings
    x = LayerNormalization()(x)
    x = Dropout(dropout)(x)
    
    # Transformer blocks
    for _ in range(n_layers):
        x = roberta_encoder_block(x, d_model, n_heads, d_ff, dropout, attention_mask)
    
    # Classification head with two dense layers and tanh activation
    cls_token_output = x[:, 0, :]
    x = Dropout(dropout)(cls_token_output)
    x = Dense(d_model, activation='tanh')(x)  # First Dense layer with tanh activation
    x = Dropout(dropout)(x)
    logits = Dense(num_classes)(x)  # Second Dense layer for final output
    
    # Create model
    model = Model(
        inputs=[input_ids, attention_mask],
        outputs=logits,
        name="RoBERTa_Sequence_Classifier"
    )
    return model


# Build model
MAX_LEN = 128  # Using your original max length
BATCH_SIZE = 32  # Your original batch size
N_LAYERS = 12
D_MODEL = 768
N_HEADS = 12
D_FF = 3072
NUM_CLASSES = 5
DROPOUT = 0.1

model2 = build_roberta_for_sequence_classification(
    n_layers=N_LAYERS,
    d_model=D_MODEL,
    n_heads=N_HEADS,
    d_ff=D_FF,
    vocab_size=VOCAB_SIZE,
    max_length=MAX_LEN,
    num_classes=NUM_CLASSES,
    dropout=DROPOUT
)

# Print model summary
model2.summary()


In [38]:
import tensorflow as tf
from transformers import BertTokenizer
import numpy as np
from tqdm import trange
import pandas as pd
from sklearn.model_selection import train_test_split

def tokenize_and_prepare_data(tokenizer, texts, labels, max_len):
    """
    Tokenize texts and prepare data for training
    """
    # Ensure texts are strings
    texts = [str(text) for text in texts]
    
    # Tokenize
    encodings = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
        truncation=True,
        return_tensors='tf'
    )
    
    return encodings['input_ids'], encodings['attention_mask'], encodings['token_type_ids'], tf.convert_to_tensor(labels, dtype=tf.int32)

def split_data(input_ids, attention_masks, token_type_ids, labels, test_size=0.1):
    """
    Split the dataset into train and validation sets using TensorFlow operations
    """
    # Calculate split sizes
    total_size = input_ids.shape[0]
    val_size = int(total_size * test_size)
    train_size = total_size - val_size
    
    # Create shuffled indices
    indices = tf.random.shuffle(tf.range(total_size))
    
    # Split indices into train and validation
    train_indices = indices[:train_size]
    val_indices = indices[train_size:]
    
    # Use tf.gather to split the data
    train_inputs = tf.gather(input_ids, train_indices)
    val_inputs = tf.gather(input_ids, val_indices)
    
    train_masks = tf.gather(attention_masks, train_indices)
    val_masks = tf.gather(attention_masks, val_indices)
    
    train_token_types = tf.gather(token_type_ids, train_indices)
    val_token_types = tf.gather(token_type_ids, val_indices)
    
    train_labels = tf.gather(labels, train_indices)
    val_labels = tf.gather(labels, val_indices)
    
    return (train_inputs, val_inputs,
            train_masks, val_masks,
            train_token_types, val_token_types,
            train_labels, val_labels)

def create_tf_dataset(input_ids, attention_masks, token_type_ids, labels, batch_size, is_training=True):
    """
    Create a TensorFlow dataset
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': input_ids,
            'attention_mask': attention_masks,
            'token_type_ids': token_type_ids
        },
        labels
    ))
    
    if is_training:
        dataset = dataset.shuffle(1000)
    
    dataset = dataset.batch(batch_size)
    
    return dataset

def prepare_and_train_model(df, model, tokenizer, max_len=128, batch_size=32, epochs=4):
    """
    Prepare data and train the model
    """
    # Prepare inputs
    input_ids, attention_masks, token_type_ids, labels = tokenize_and_prepare_data(
        tokenizer, 
        df.Discussion.values, 
        df.Category.values, 
        max_len
    )
    
    # Split data using TensorFlow operations
    (train_inputs, val_inputs,
     train_masks, val_masks,
     train_token_types, val_token_types,
     train_labels, val_labels) = split_data(
        input_ids, attention_masks, token_type_ids, labels
    )
    
    # Create datasets
    train_dataset = create_tf_dataset(
        train_inputs, train_masks, train_token_types, train_labels, 
        batch_size=batch_size, is_training=True
    )
    
    val_dataset = create_tf_dataset(
        val_inputs, val_masks, val_token_types, val_labels, 
        batch_size=batch_size, is_training=False
    )
    
    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )
    
    # Train model
    history = model.fit(
        train_dataset,
        epochs=2,
        validation_data=val_dataset,
        verbose=1
    )
    
    return model, history, val_dataset


# Example usage:
"""
# Assuming df is your preprocessed dataframe with 'Discussion' and 'Category' columns
model, history, val_dataset = prepare_and_train_model(df, model, tokenizer)

# Get predictions
val_predictions = model.predict(val_dataset)
val_predictions = np.argmax(val_predictions, axis=1)
"""

"\n# Assuming df is your preprocessed dataframe with 'Discussion' and 'Category' columns\nmodel, history, val_dataset = prepare_and_train_model(df, model, tokenizer)\n\n# Get predictions\nval_predictions = model.predict(val_dataset)\nval_predictions = np.argmax(val_predictions, axis=1)\n"

In [39]:
model, history, val_dataset = prepare_and_train_model(df, model, tokenizer)

Epoch 1/2


I0000 00:00:1735071930.733720     100 service.cc:145] XLA service 0x7d59d003dd50 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1735071930.733785     100 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1735071930.733789     100 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


I0000 00:00:1735072024.917306     100 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m693/694[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 691ms/step - accuracy: 0.2272 - loss: 1.8439






[1m694/694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m723s[0m 866ms/step - accuracy: 0.2274 - loss: 1.8434 - val_accuracy: 0.4237 - val_loss: 1.3134
Epoch 2/2
[1m694/694[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m498s[0m 718ms/step - accuracy: 0.4762 - loss: 1.2627 - val_accuracy: 0.6351 - val_loss: 0.9915


In [40]:
history = model.fit(
        train_dataset,
        epochs=3,
        validation_data=val_dataset,
        verbose=1
    )
    

Epoch 1/3









[1m1540/1541[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 368ms/step - accuracy: 0.6306 - loss: 0.9563




[1m1541/1541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m775s[0m 435ms/step - accuracy: 0.6306 - loss: 0.9563 - val_accuracy: 0.7244 - val_loss: 0.7476
Epoch 2/3
[1m1541/1541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m587s[0m 381ms/step - accuracy: 0.7194 - loss: 0.7521 - val_accuracy: 0.7386 - val_loss: 0.7357
Epoch 3/3
[1m1541/1541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m587s[0m 381ms/step - accuracy: 0.7708 - loss: 0.6293 - val_accuracy: 0.8385 - val_loss: 0.4837


In [41]:
history = model.fit(
        train_dataset,
        epochs=3,
        validation_data=val_dataset,
        verbose=1
    )
    

Epoch 1/3
[1m1541/1541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m586s[0m 381ms/step - accuracy: 0.8122 - loss: 0.5252 - val_accuracy: 0.8636 - val_loss: 0.4085
Epoch 2/3
[1m1541/1541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 380ms/step - accuracy: 0.8418 - loss: 0.4474 - val_accuracy: 0.8669 - val_loss: 0.3816
Epoch 3/3
[1m1541/1541[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m587s[0m 381ms/step - accuracy: 0.8633 - loss: 0.3846 - val_accuracy: 0.9058 - val_loss: 0.2759


In [42]:
print(tokenizer)

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [46]:
import pandas as pd
import tensorflow as tf
import numpy as np

# Assuming the model is trained and tokenizer is loaded

# Load the test data
test_df = pd.read_csv('/kaggle/input/traintest/test.csv')
test_df['Discussion'] = test_df['Discussion'].astype(str)

# Tokenize the test data
test_input_ids, test_attention_masks, test_token_type_ids, _ = tokenize_and_prepare_data(
    tokenizer, 
    test_df.Discussion.values, 
    np.zeros(len(test_df)),  # Placeholder, since we don't have labels
    max_len=128
)

# Create the test dataset
def create_test_dataset(input_ids, attention_masks, token_type_ids, batch_size):
    """
    Create a TensorFlow dataset for the test data
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': input_ids,
            'attention_mask': attention_masks,
            'token_type_ids': token_type_ids
        }
    ))
    
    dataset = dataset.batch(batch_size)
    return dataset

test_dataset = create_test_dataset(test_input_ids, test_attention_masks, test_token_type_ids, batch_size=32)

# Perform inference
predictions = []

for batch in test_dataset:
    # Extract inputs
    input_ids = batch['input_ids']
    attention_masks = batch['attention_mask']
    token_type_ids = batch['token_type_ids']
    
    # Create a dictionary for the model
    inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'token_type_ids': token_type_ids
    }
    
    # Perform inference by passing the entire dictionary
    outputs = model(inputs)  # Pass the inputs as a dictionary
    
    # If the model is producing a tensor, access the logits directly from the output
    logits = outputs.numpy()  # Convert the output tensor to numpy for processing
    batch_predictions = np.argmax(logits, axis=1)  # Get predicted class indices
    predictions.extend(batch_predictions)

# Create a DataFrame for submission
submit = pd.DataFrame({
    "SampleID": test_df["SampleID"],  
    "Category": predictions           
})

# Save the predictions to CSV
submit.to_csv("/kaggle/working/nicetry.csv", index=False)

print(submit.head())


   SampleID  Category
0         1         3
1         2         0
2         3         1
3         4         4
4         5         3
