In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/scicite/test.jsonl
/kaggle/input/scicite/train.jsonl
/kaggle/input/scicite2/synonymized.jsonl
/kaggle/input/scicite2/paraphrased.jsonl


In [2]:
train_df = pd.read_json('/kaggle/input/scicite/train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

test_df = pd.read_json('/kaggle/input/scicite/test.jsonl', lines=True)
X_test = test_df['string']
y_test = test_df['label']

print(train_df.shape, test_df.shape)

(8243, 15) (1861, 14)


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import torch 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils import resample

def augment_data_multiclass(X, y):
    df = pd.concat([X, y], axis=1)
    majority_class_size = df['label'].value_counts().max()
    upsampled_dataframes = []
    for class_label in df['label'].unique():
        class_df = df[df['label'] == class_label]
        if len(class_df) < majority_class_size:
            class_df_upsampled = resample(class_df, replace=True, n_samples=majority_class_size, random_state=10)
            upsampled_dataframes.append(class_df_upsampled)
        else:
            upsampled_dataframes.append(class_df)
    upsampled_df = pd.concat(upsampled_dataframes)
    return upsampled_df['string'], upsampled_df['label']

# train the model for a given number of epochs
def train_model(model, tokenizer, num_epoch, learning_rate, batch_size, X_train, y_train):
    # Encode the training data
    encoded_data_train = tokenizer.batch_encode_plus(
        X_train,
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=512, 
        return_tensors='pt'
    )
    labels_train = torch.tensor(y_train)

    # Create data loader for training
    dataset_train = TensorDataset(encoded_data_train['input_ids'], encoded_data_train['attention_mask'], labels_train)
    dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)

    # Connect to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Define optimizer for training data
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epoch):
        model.train()

        curr_total_loss = 0.
        count = 0
        
        for train_batch in dataloader_train:
            optimizer.zero_grad()

            id, mask, label = train_batch
            id = id.to(device)
            mask = mask.to(device)
            label = label.to(device)

            outputs = model(id, attention_mask=mask, labels=label)

            loss = outputs.loss

            curr_total_loss += loss.item()
            count += 1

            loss.backward()
            
            optimizer.step()

        avg_loss = curr_total_loss / count
        print(epoch, avg_loss)       
    
    return model 

# return f1 macro and accuracy of the model
def eval_model(model, tokenizer, X_test, y_test):
    encoded_data_test = tokenizer.batch_encode_plus(
        X_test,
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=512, 
        return_tensors='pt'
    )
    labels_test = torch.tensor(y_test)

    # Create data loader for test data
    batch_size = 16
    test_dataset = TensorDataset(encoded_data_test['input_ids'], encoded_data_test['attention_mask'], labels_test)
    test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

    # Connect to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Evaluate the model
    model.eval()
    predictions = []
    labels = []
    with torch.no_grad():
        for test_batch in test_dataloader:
            id, mask, label = test_batch
            id = id.to(device)
            mask = mask.to(device)
            label = label.to(device)

            outputs = model(id, attention_mask=mask, labels=label)
            logits = outputs.logits
            _, prediction  = torch.max(logits, dim=1)

            predictions.extend(prediction.tolist())
            labels.extend(label.tolist())
            
    f1 = f1_score(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    print(f"f1 = {f1}, accuracy = {acc}")
    return f1, acc


def save_model(model, save_path):
    torch.save(model.state_dict(), save_path)

In [5]:
X_train, y_train = augment_data_multiclass(X_train, y_train)

In [6]:
from sklearn.preprocessing import LabelEncoder
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and transform string column
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [7]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = train_model(model, tokenizer, 1, 4e-5, 16, X_train.to_list(), y_train)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


0 0.35008041922511773


**Test Data Evaluation**

In [8]:
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.8377816857463104, accuracy = 0.8527673293927995


(0.8377816857463104, 0.8527673293927995)

**1st Category: Short data**

In [12]:
import nltk
short_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) <= 25)]
X_test = short_df['string']
y_test = label_encoder.transform(short_df['label'])
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.8525919659266608, accuracy = 0.8587786259541985


(0.8525919659266608, 0.8587786259541985)

**2nd Category: Long data**

In [13]:
long_df = test_df[test_df['string'].apply(lambda x: len(nltk.word_tokenize(x)) > 25)]
X_test = long_df['string']
y_test = label_encoder.transform(long_df['label'])
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.836066000363923, accuracy = 0.851782363977486


(0.836066000363923, 0.851782363977486)

**3rd Category: Paragraph data**

In [14]:
paragraph_df = test_df[test_df['string'].apply(lambda x: len(nltk.sent_tokenize(x)) > 1)]
X_test = paragraph_df['string']
y_test = label_encoder.transform(paragraph_df['label'])
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.8511515645777811, accuracy = 0.8595641646489104


(0.8511515645777811, 0.8595641646489104)

**4th Category: Typo data**

In [16]:
import random
def rearrange_letter(word):
    word_list = list(word)
    n = len(word_list)
    if n == 1:
        return ''.join(word_list)
    
    idx = random.randint(0, n - 2)
    word_list[idx], word_list[idx + 1] = word_list[idx + 1], word_list[idx]
    return ''.join(word_list)

def rearrange_word(text):
    words = nltk.word_tokenize(text)
    num_words = len(words)

    # rearrange letter for some random word
    for _ in range(5):
        idx = random.randint(0, num_words - 1)
        words[idx] = rearrange_letter(words[idx])
    
    # rearrange word
    for _ in range(min(3, num_words - 1)):
        idx = random.randint(0, num_words - 2)
        words[idx], words[idx + 1] = words[idx + 1], words[idx]

    return ' '.join(words)

typo_series = test_df['string'].apply(rearrange_word)

typo_df = pd.DataFrame({
    'label': test_df.label,
    'string': typo_series
})
X_test = typo_df['string']
y_test = label_encoder.transform(typo_df['label'])
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.8006081768309743, accuracy = 0.8275120902740463


(0.8006081768309743, 0.8275120902740463)

**5th Category: Synonym data**

In [17]:
synonymized_test_df = pd.read_json('/kaggle/input/scicite2/synonymized.jsonl', lines=True)
X_test = synonymized_test_df['string']
y_test = label_encoder.transform(synonymized_test_df['label'])
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.7087132710001441, accuracy = 0.7807630306286942


(0.7087132710001441, 0.7807630306286942)

**6th Category: Paraphrased data**

In [18]:
paraphrased_test_df = pd.read_json('/kaggle/input/scicite2/paraphrased.jsonl', lines=True)
X_test = paraphrased_test_df['string']
y_test = label_encoder.transform(paraphrased_test_df['label'])
eval_model(model, tokenizer, X_test.to_list(), y_test)

f1 = 0.804372514705901, accuracy = 0.8216012896292316


(0.804372514705901, 0.8216012896292316)