# <font color="#003660">Applied Machine Learning for Text Analysis (M.184.5331)</font>


# <font color="#003660">Week 5: Transformer Architecture</font>

# <font color="#003660">Notebook 3: Text Classification with Transformers for Kaggle</font>

# Import Packages

In [None]:
# Uncomment and run once if any of these packages is missing from the environment.
#!pip install transformers[sentencepiece]
#!pip install datasets
#!pip install accelerate -U
#!pip install pymysql

In [None]:
import getpass
from urllib.parse import quote

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, list_datasets, load_dataset, DatasetDict
from scipy.special import softmax
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.metrics import classification_report
from sqlalchemy import create_engine, text
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

# Create Dataset

In [None]:
# Get credentials interactively so they are never stored in the notebook
user = input("Username: ")
passwd = getpass.getpass("Password: ")
server = input("Server: ")
db = input("Database: ")

# Create an engine instance (SQLAlchemy).
# User name and password are percent-encoded so that reserved URL characters
# such as "@", ":" or "/" inside a credential cannot corrupt the connection URL.
engine = create_engine(
    "mysql+pymysql://{}:{}@{}/{}".format(
        quote(user, safe=""), quote(passwd, safe=""), server, db
    )
)

# Define SQL query
sql_query = "SELECT * FROM WineDataset"

# Query dataset (pandas). The context manager closes the connection once all
# rows have been read instead of leaving it open for the rest of the session.
with engine.connect() as connection:
    corpus = pd.DataFrame(connection.execute(text(sql_query)))

In [None]:
# Labelled pool: every row whose `testset` flag is 0.
labelled_corpus = corpus.loc[corpus["testset"] == 0]
# The first 80,000 labelled rows are used for training, the next 20,000 for validation.
train_corpus = labelled_corpus.iloc[:80000]
val_corpus = labelled_corpus.iloc[80000:100000]
# Rows whose `testset` flag is 1 form the test split.
test_corpus = corpus.loc[corpus["testset"] == 1]

In [None]:
# Raw column name -> name that later cells read ("text" in `tokenize`, "label" at evaluation).
column_map = {"description": "text", "verygood": "label"}


def to_text_label(frame):
    """Reduce `frame` to the review text and its label, renamed to "text" / "label".

    The row index is left untouched. Calling this on a frame that has already
    been converted returns the same two columns again, so re-running the cell
    does not raise a KeyError on the raw column names.
    """
    raw_columns = list(column_map)
    if set(raw_columns).issubset(frame.columns):
        return frame[raw_columns].rename(columns=column_map)
    # Raw columns are gone: the frame was converted on an earlier run of this cell.
    return frame[list(column_map.values())]


train_corpus = to_text_label(train_corpus)
val_corpus = to_text_label(val_corpus)
test_corpus = to_text_label(test_corpus)

In [None]:
# Convert each split to a Hugging Face Dataset.
# preserve_index=True always stores the DataFrame index as the
# "__index_level_0__" column; the submission cell reads that column from the
# test split, so it must not depend on how pandas happens to type the index.
train_ds = Dataset.from_pandas(train_corpus, preserve_index=True)
val_ds = Dataset.from_pandas(val_corpus, preserve_index=True)
test_ds = Dataset.from_pandas(test_corpus, preserve_index=True)

# Bundle the three splits under the keys used by the training and evaluation cells.
wine_ds = DatasetDict()

wine_ds['train'] = train_ds
wine_ds['validation'] = val_ds
wine_ds['test'] = test_ds

In [None]:
wine_ds

# Fine-tune Model

In [None]:
# Checkpoint name passed to both the tokenizer and the classification model below.
model_name = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of `batch` with the notebook's tokenizer.

    Padding and truncation are both switched on, so every sequence in the
    batch comes back with the same length.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Tokenize every split. With batch_size=None each split is handed to `tokenize`
# as one single batch (per the datasets `map` documentation), so padding=True
# pads all texts of a split to that split's longest sequence.
wine_ds_encoded = wine_ds.map(tokenize, batched=True, batch_size=None)

In [None]:
# Load the checkpoint configured for two labels, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    `pred` must expose `label_ids` (true labels) and `predictions` (per-class
    scores); the predicted class is the index of the highest score along the
    last axis. Returns a dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

In [None]:
batch_size = 64
# Log about once per epoch (on a single device): one logging event every
# `train rows // batch size` steps. Clamp to at least 1 because a training
# split smaller than one batch would otherwise yield 0, which is not a valid
# logging interval.
logging_steps = max(1, len(wine_ds_encoded["train"]) // batch_size)

# Evaluate and checkpoint after every epoch, then reload the checkpoint with
# the best validation F1 (the "f1" key returned by `compute_metrics`).
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=2,
                                  learning_rate=3e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps)

In [None]:
# Wire model, arguments, metric function and the train/validation splits
# into a Trainer, then start fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=wine_ds_encoded["train"],
    eval_dataset=wine_ds_encoded["validation"],
    compute_metrics=compute_metrics,
)
trainer.train()

## Evaluate Model

In [None]:
# Run the trained model on the test split; `.predictions` of the result is used below.
preds_output = trainer.predict(wine_ds_encoded["test"])

In [None]:
# Predicted class per row = index of the largest score along axis 1.
y_preds = np.argmax(preds_output.predictions, axis=1)

In [None]:
# True labels of the *test* split (despite the name, not the validation split).
y_valid = wine_ds_encoded["test"]["label"]

In [None]:
# Compare the test-split labels with the predicted classes.
print(classification_report(y_valid, y_preds))

In [None]:
# Pair each test row's "__index_level_0__" value (the index carried over from
# the pandas frame) with its predicted class and write the submission file.
submission_columns = {
    'index': wine_ds_encoded["test"]["__index_level_0__"],
    'verygood': y_preds,
}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv("my_submission.csv", index=False)