In [None]:
import numpy as np 
import pandas as pd
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, ClassLabel
from sklearn.metrics import classification_report

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# Re-read the training CSV and keep only rows with no missing value in any
# column (dropna() with no arguments drops a row if *any* field is NaN).
# NOTE(review): this also discards tweets whose only gap is keyword/location —
# confirm that losing those labelled rows is intended.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv").dropna()
train_df.head()

In [None]:
# Summary statistics for the location and keyword columns, shown as a tuple.
train_df['location'].describe(), train_df['keyword'].describe() 

In [None]:
# Load the tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Build the model input as "<keyword> <location> <text>".
#
# In pandas, NaN + str evaluates to NaN, so a row with a missing keyword or
# location would otherwise get a NaN combined_text and lose its tweet text
# entirely. Fill each field with '' before concatenating so the text is always
# kept, even when the optional metadata is absent.
train_df['combined_text'] = (
    train_df['keyword'].fillna('')
    + ' ' + train_df['location'].fillna('')
    + ' ' + train_df['text'].fillna('')
)

test_df['combined_text'] = (
    test_df['keyword'].fillna('')
    + ' ' + test_df['location'].fillna('')
    + ' ' + test_df['text'].fillna('')
)

In [None]:
# Inspect the concatenated training text.
train_df['combined_text']

In [None]:
# Wrap the columns needed downstream in Hugging Face Datasets: text + label
# for training, text only for the test split.
# test_dataset is rebuilt in a later cell, after missing combined_text values
# are filled, before it is used for prediction.
train_dataset = Dataset.from_pandas(train_df[['combined_text', 'target']])
test_dataset = Dataset.from_pandas(test_df[['combined_text']])

In [None]:
# Show both datasets' summaries.
train_dataset, test_dataset

In [None]:
def tokenize_fn(batch):
    """Tokenize the ``combined_text`` entries of a batch.

    Args:
        batch: Mapping with a ``'combined_text'`` entry holding the texts to
            encode (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        Whatever the module-level ``tokenizer`` returns when asked to truncate
        and pad every text to a fixed ``max_length`` of 128.
    """
    texts = batch['combined_text']
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    return tokenizer(texts, **encoding_options)

In [None]:
# Tokenize the training texts in batches; the tokenizer outputs are added as
# new columns alongside the existing ones.
train_dataset = train_dataset.map(tokenize_fn, batched=True)

In [None]:
# Check the columns present after tokenization.
train_dataset

In [None]:
# Convert `target` to a two-class ClassLabel feature; the later split
# stratifies on this column.
class_label = ClassLabel(num_classes=2, names=['0', '1'])
train_dataset = train_dataset.cast_column('target', class_label)

In [None]:
# Check the dataset after casting `target`.
train_dataset

In [None]:
# Hold out 20% of the labelled rows for validation, stratified on `target` so
# both parts keep the same class balance. The seed is fixed so the split (and
# therefore the reported validation metrics) is the same on every run.
dataset = train_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column='target',
    seed=42,
)

# Rename `target` to `labels` in both parts before they are handed to Trainer.
train_ds = dataset['train'].rename_column('target', 'labels')
test_ds = dataset['test'].rename_column('target', 'labels')

In [None]:
# Load the "bert-base-uncased" checkpoint as a sequence-classification model
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
#!pip install --upgrade transformers

In [None]:
# Fine-tuning configuration: 10 epochs at a per-device batch size of 16, with
# step-based logging every 100 steps, checkpoint saving turned off and no
# external experiment tracker.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    logging_dir="./logs",         # Where to store logs
    logging_strategy="steps",     # Log every n steps
    logging_steps=100,
    save_strategy="no",           # Do not save checkpoints
    report_to="none",             # Avoid external logging (wandb, etc.)
    disable_tqdm=False            # Show progress bars
)

# `Trainer` is already imported in the notebook's import cell, so it is not
# re-imported here.
# NOTE(review): recent transformers releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer
)

trainer.train()

In [None]:
# Run the fine-tuned model on the held-out validation split.
preds = trainer.predict(test_ds)

In [None]:
# Reference labels of the held-out split, and the predicted class for each of
# its rows (index of the largest score along axis 1).
y_true = np.array(test_ds['labels'])
y_pred = np.argmax(preds.predictions, axis=1)

In [None]:
# classification_report takes (y_true, y_pred) in that order. Passing them the
# other way round swaps precision with recall and reports support counts for
# the predictions instead of the ground truth.
print(classification_report(y_true, y_pred))

In [None]:
# Replace any missing combined_text with an empty string and force str dtype,
# then rebuild and tokenize the test dataset in one chain.
# NOTE(review): any row whose combined_text is still NaN at this point is
# scored on an empty string.
test_df['combined_text'] = test_df['combined_text'].fillna('').astype(str)
test_dataset = Dataset.from_pandas(test_df[['combined_text']]).map(
    tokenize_fn, batched=True
)

In [None]:
# Predict on the competition test set. This reuses (and overwrites) the
# `preds` / `y_pred` names that earlier held the validation results.
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)

In [None]:
# Inspect the predicted classes for the test set.
y_pred

In [None]:
# Pair each test id with its predicted class and write the two-column
# submission file (no index column).
submission = test_df[['id']].assign(target=y_pred)
submission.to_csv('/kaggle/working/submission.csv', index=False)