In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import random
from nltk.corpus import wordnet
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Uploading the necessary resources for WordNet
nltk.download('wordnet')
nltk.download('omw-1.4')

# Load training and test datasets
train_path = 'train.csv'
test_path = 'test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sofiakriuchkova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sofiakriuchkova/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# 2. Function for text preprocessing 
def preprocess_text_simple(text):   
    # Lowercase text
    text = text.lower()
    # URL deletion
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Removing special characters, numbers and punctuation
    text = re.sub(r'\W+|\d+', ' ', text)
    # Dividing text into words
    tokens = text.split()
    # stopwords
    stop_words = {'and', 'or', 'but', 'so', 'because', 'the', 'a', 'an', 'in', 'on', 'at', 'of', 'to', 'is', 'it', 'this', 'that'}
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [4]:
train_df['processed_text'] = train_df['text'].apply(preprocess_text_simple)
test_df['processed_text'] = test_df['text'].apply(preprocess_text_simple)

In [5]:
# 3. Function for text augmentation
def get_synonyms(word):

    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

def augment_text_with_wordnet(text):
    
    words = text.split()
    augmented_text = words.copy()
    
    # Replacing words with synonyms (30% chance)
    for i, word in enumerate(words):
        if random.random() < 0.3:  
            synonyms = get_synonyms(word)
            if synonyms:
                augmented_text[i] = random.choice(synonyms)
    
    # Removing random words (20% chance)
    augmented_text = [word for word in augmented_text if random.random() > 0.2]
    return ' '.join(augmented_text)

In [6]:
# Application of augmentation using WordNet
train_df['augmented_text'] = train_df['processed_text'].apply(augment_text_with_wordnet)

# 4. Preparing data for training
# Combination of original and augmented text for learning purposes
train_texts = pd.concat([train_df['processed_text'], train_df['augmented_text']])
train_targets = pd.concat([train_df['target'], train_df['target']])

# Division into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_texts, train_targets, test_size=0.2, random_state=42)

# 5. Converting text to signs with TF-IDF
vectorizer = TfidfVectorizer(max_features=5000) 
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# 6. Training of the model 
model = LogisticRegression(random_state=42, max_iter=1000)  # Logistic regression
model.fit(X_train_tfidf, y_train)  

# 7. Evaluation of the model  
y_val_pred = model.predict(X_val_tfidf)  # –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –Ω–∞ –≤–∞–ª–∏–¥–∞—Ü–∏–æ–Ω–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
validation_report = classification_report(y_val, y_val_pred)  # –û—Ç—á–µ—Ç –æ –∫–∞—á–µ—Å—Ç–≤–µ
print(validation_report)

# 8. Applying the model to the test set
X_test_tfidf = vectorizer.transform(test_df['processed_text'])  # Test data conversion

test_df['target'] = model.predict(X_test_tfidf)  # Prediction of target value

test_df.to_csv('test_predictions.csv', index=False)

              precision    recall  f1-score   support

           0       0.80      0.92      0.86      1700
           1       0.87      0.71      0.79      1346

    accuracy                           0.83      3046
   macro avg       0.84      0.82      0.82      3046
weighted avg       0.83      0.83      0.83      3046



In [7]:
# Reading a prediction file
sample_submission = pd.read_csv("test_predictions.csv")

# Using a trained model for predictions
sample_submission["target"] = model.predict(X_test_tfidf)

# Saving to file
sample_submission.to_csv("sofia_submission.csv", index=False)

In [9]:
X_test = test_df['processed_text']

# 5. Preparing data for BERT
model_name = 'bert-base-uncased' 
tokenizer = BertTokenizer.from_pretrained(model_name)

# Function for tokenisation
def encode_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

# tokenisation
train_encodings = encode_texts(X_train, tokenizer)
val_encodings = encode_texts(X_val, tokenizer)
test_encodings = encode_texts(X_test, tokenizer)

# Creating your own Torch datasets
class TextDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.encodings['input_ids'])
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

train_dataset = TextDataset(train_encodings, y_train)
val_dataset = TextDataset(val_encodings, y_val)
test_dataset = TextDataset(test_encodings)  # –±–µ–∑ –º–µ—Ç–æ–∫

# 6. Initialising the BERT model for classification 
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(train_df['target'].unique()))

# 7. Model estimation functions 
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# 8. Training settings
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',  
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 9. Model training 
trainer.train()

# Evaluation on the validation set
val_results = trainer.evaluate()
print("Validation Results:", val_results)

# Obtaining predictions for the validation set
val_preds_output = trainer.predict(val_dataset)
val_preds = np.argmax(val_preds_output.predictions, axis=1)
print("Validation classification report:")
print(classification_report(y_val, val_preds))

# 10.Applying the model to the test set 
test_preds_output = trainer.predict(test_dataset)
test_preds = np.argmax(test_preds_output.predictions, axis=1)
test_df['target'] = test_preds

# Saving results
test_df.to_csv('test_predictions_new.csv', index=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.441,0.428169,0.832896,0.845447,0.832896,0.828674
2,0.2913,0.391311,0.858503,0.859554,0.858503,0.857586
3,0.1623,0.469395,0.864412,0.864471,0.864412,0.863963


Validation Results: {'eval_loss': 0.46939507126808167, 'eval_accuracy': 0.8644123440577807, 'eval_precision': 0.8644712772202431, 'eval_recall': 0.8644123440577807, 'eval_f1': 0.8639626203352668, 'eval_runtime': 54.7788, 'eval_samples_per_second': 55.605, 'eval_steps_per_second': 3.487, 'epoch': 3.0}
Validation classification report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1700
           1       0.87      0.82      0.84      1346

    accuracy                           0.86      3046
   macro avg       0.86      0.86      0.86      3046
weighted avg       0.86      0.86      0.86      3046



In [10]:

sample_submission = pd.read_csv("test_predictions_new.csv")
sample_submission["target"] = test_preds
sample_submission.to_csv("sofia_submission_new.csv", index=False)