In [None]:
from kaggle_secrets import UserSecretsClient
# Handle to the secrets store exposed by `kaggle_secrets`; the API keys used below are read through it.
user_secrets = UserSecretsClient()

class Userdata():
    """Small Colab-style wrapper (`userdata.get(name)`) around a secrets client.

    Secrets are fetched lazily, on first request, and then cached in
    `self.secrets`. A secret that is not configured therefore only causes an
    error when it is actually asked for, and any secret name can be requested,
    not just a fixed list.
    """

    def __init__(self, client=None):
        """Create the wrapper.

        Args:
            client: Object exposing `get_secret(name)`. Defaults to the
                notebook-level `user_secrets` client.
        """
        self._client = user_secrets if client is None else client
        # Cache of secrets fetched so far: {secret_name: value}.
        self.secrets = {}

    def get(self, secret_name):
        """Return the value of `secret_name`, fetching it on first use.

        Any error raised by the underlying client's `get_secret` (for example
        for an unknown secret) propagates to the caller.
        """
        if secret_name not in self.secrets:
            self.secrets[secret_name] = self._client.get_secret(secret_name)
        return self.secrets[secret_name]
# Notebook-wide instance; later cells read credentials with userdata.get("<SECRET_NAME>").
userdata = Userdata()

init = True

if init:
#     from google.colab import userdata
    
    !rm -rf /kaggle/working/*    
    
    !rm -rf /root/.kaggle/

    !mkdir ~/.kaggle
    !touch ~/.kaggle/kaggle.json

    api_token = {
        "username":userdata.get("KAGGLE_USER_NAME"),
        "key":userdata.get("KAGGLE_API_KEY"),
    }

    import json

    with open('/root/.kaggle/kaggle.json', 'w') as file:
        json.dump(api_token, file)

    !chmod 600 ~/.kaggle/kaggle.json

    !rm -rf /kaggle/working/*
    !kaggle datasets download -d michaelbryantds/78k-music-album-reviews
    !unzip /kaggle/working/78k-music-album-reviews.zip


    !pip install datasets transformers tqdm transformers[torch]
    !pip install accelerate -U
    import subprocess
    !pip install wandb
    !wandb login 5b2ceb8edc2e4f40870207591750b6a38db675fc
    subprocess.run(["wandb","login", userdata.get("WandB_API_KEY")]) 


In [None]:
import pandas as pd
import numpy as np
import torch as tr
import matplotlib.pyplot as plt
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoConfig,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import (
    Dataset,
    load_metric,
)
from huggingface_hub import login

In [None]:
# Authenticate with the Hugging Face Hub using the token read through `userdata`.
login(token=userdata.get("HUGG_API_TOKEN"))

In [None]:
# Path of the reviews CSV (presumably extracted from the archive unzipped in the setup cell — confirm).
DATA_FILE = "/kaggle/working/music_album_reviews.csv"

In [None]:
# Load the raw reviews table.
df = pd.read_csv(DATA_FILE)

In [None]:
# Preview the first rows of the raw table.
df.head()
# Other ad-hoc checks used while exploring the data (uncomment to run):
# df.value_counts()
# df.columns
# df[:1]
# df["Rating"].isna().sum()

In [None]:
# Keep only rows where every column is present.
df = df.dropna()

# Map each distinct rating to a class id in ASCENDING rating order
# (lowest rating -> 0). The ratings must be sorted explicitly: iterating a
# bare set yields an arbitrary order, while the id2label table and the
# per-class loss weights defined later in this notebook assume that class id i
# is the i-th smallest rating.
rating_value_mapping = {
    rating: ix for ix, rating in enumerate(sorted(set(df["Rating"])))
}

# Two-column frame in the layout the training cells use:
# "label" (int32 class id) and "text" (the review).
sdf = pd.concat([df["Rating"].map(rating_value_mapping), df["Review"]], axis=1)
sdf["Rating"] = sdf["Rating"].astype("int32")
sdf = sdf.rename(columns={"Review": "text", "Rating": "label"})
sdf.head()

In [None]:
# Number of rows per class id.
sdf["label"].value_counts()

In [None]:
# Bar chart of rows per class id. The x positions and the heights are taken
# from the SAME Series so that each bar shows the count of its own label
# (`unique()` and `value_counts()` order their results differently and must not
# be paired positionally).
label_counts = sdf["label"].value_counts().sort_index()
plt.bar(label_counts.index, label_counts.values)
plt.xlabel("label")
plt.ylabel("count");

In [None]:
# Tokenizer for "distilbert-base-uncased" (the same checkpoint name assigned to `model_ckpt` in the training section).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Convert the pandas frame to a `datasets.Dataset` without carrying over the pandas index,
# then display the first entry of its shape.
ds = Dataset.from_pandas(sdf, preserve_index=False)
ds.shape[0]

In [None]:
# Share of rows, in percent, that goes to the training split.
train_percentage = 95


def train_test_split(ds, train_part):
    """Split `ds` into a leading train part and a trailing test part.

    Args:
        ds: Dataset to split; must expose `.shape[0]` and slicing.
        train_part: Fraction of rows (between 0 and 1) assigned to train.

    Returns:
        `(train, test)` as two new `Dataset` objects.

    Raises:
        ValueError: If `train_part` is not within [0, 1], e.g. when a
            percentage such as 80 is passed instead of 0.8.
    """
    if not 0 <= train_part <= 1:
        raise ValueError(
            f"train_part must be a fraction between 0 and 1, got {train_part!r}"
        )
    # NOTE(review): rows are split in their existing order, without shuffling —
    # confirm the CSV is not ordered in a way that biases the test split.
    split_ix = int(ds.shape[0]*train_part)
    train = Dataset.from_dict(ds[:split_ix])
    test = Dataset.from_dict(ds[split_ix:])
    return train, test


train, test = train_test_split(ds, train_percentage / 100)
len(ds), len(train), len(test)

In [None]:
def prep(ex):
    """Tokenize the "text" field of `ex` with truncation enabled.

    Used as the mapping function for `Dataset.map(..., batched=True)` below;
    relies on the notebook-level `tokenizer`.
    """
    texts = ex["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both splits (train first, then test).
tokenized_train = train.map(function=prep, batched=True)
tokenized_test = test.map(function=prep, batched=True)

In [None]:
# Batch collator built from the tokenizer; handed to the Trainer below to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training

In [None]:
# Class id -> star rating: 0 -> 0.5, 1 -> 1.0, ..., 9 -> 5.0.
# NOTE(review): the values are kept as floats to preserve the labels the model
# currently reports — confirm whether string labels would be preferable.
id2label = {ix: (ix + 1) * 0.5 for ix in range(10)}

model_ckpt = "distilbert-base-uncased"
# model_ckpt = "/kaggle/working/lab_11_distilbert_sentiment"

# Only model-related settings belong in the config. Logging destinations such
# as `report_to` are options of TrainingArguments; passed here they would only
# be stored as a stray attribute on the config.
config = AutoConfig.from_pretrained(
    model_ckpt,
    num_labels=len(id2label),
    id2label=id2label,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    config=config,
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    The metrics are computed directly with NumPy, so nothing has to be loaded
    or downloaded on each evaluation.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the argmax
            of `logits` over the last axis.

    Returns:
        dict with float entries "accuracy" and "f1". "f1" is the unweighted
        mean of the per-class F1 scores over every class that occurs in the
        labels or in the predictions. Both are 0.0 for an empty input.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    if labels.size == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    accuracy = float(np.mean(predictions == labels))

    per_class_f1 = []
    for cls in np.union1d(labels, predictions):
        predicted_cls = predictions == cls
        actual_cls = labels == cls
        true_pos = np.sum(predicted_cls & actual_cls)
        # Samples where exactly one of prediction/label is `cls`:
        # false positives + false negatives.
        mismatches = np.sum(predicted_cls ^ actual_cls)
        # The denominator is never zero because `cls` occurs at least once
        # among the labels or the predictions.
        per_class_f1.append(2 * true_pos / (2 * true_pos + mismatches))
    f1 = float(np.mean(per_class_f1))

    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Output directory for checkpoints; with push_to_hub=True it is also used as the Hub repository name.
repo_name = "lab_11_distilbert_sentiment"

training_args = TrainingArguments(
    output_dir=repo_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
    # Send training logs to Weights & Biases explicitly rather than relying on
    # the library's default choice of integrations.
    report_to="wandb",
)

In [None]:
class WeightedTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy."""

    # One weight per class id (0..9, i.e. ratings 0.5, 1, 1.5, ..., 5), from
    # weight_for_class_i = total_samples / (num_samples_in_class_i * num_classes)
    CLASS_WEIGHTS = (19.68, 12.21, 5.59, 3.53, 0.55, 0.43, 0.26, 1.76, 1.11, 14.88)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the weighted cross-entropy loss.

        Args:
            model: Model to run.
            inputs: Batch dict; must contain "labels".
            return_outputs: When true, return `(loss, outputs)` instead of
                just the loss.
            num_items_in_batch: Accepted so the override also works with
                Trainer versions that pass this argument; not used here.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weights on the logits' own device instead of probing for CUDA.
        class_weights = tr.tensor(self.CLASS_WEIGHTS, device=logits.device)
        loss_fct = tr.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Wire the model, tokenized splits, collator and metric function into the weighted trainer.
# `tokenized_test` serves as the evaluation set.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the settings in `training_args`.
trainer.train()

In [None]:
# Evaluate on the trainer's eval_dataset (`tokenized_test`).
trainer.evaluate()

In [None]:
# Upload the trained model to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Build an inference pipeline from the Hub repository "Malecc/lab_11_distilbert_sentiment".
# No task is passed explicitly; only the model id is given.
sentiment_model = pipeline(model="Malecc/lab_11_distilbert_sentiment")

In [None]:
# Quick check on a short review.
sentiment_model("as i can get it, listening worth it")

In [None]:
# Quick check on a second short review.
sentiment_model("this album bad as fuck")

In [None]:
# Longer, review-style inputs for a qualitative look at the pipeline's predictions.
examples = [
    "This enchanting album is a tapestry of emotions, weaving tales of love, loss, and resilience. Swift's ethereal vocals and poetic lyrics create a hauntingly beautiful landscape that resonates with listeners on a deeply personal level. 'Evermore' is a testament to Swift's evolution as an artist, showcasing her ability to craft intricate narratives that tug at the heartstrings and linger long after the music fades",
    "A tour de force of introspection and raw honesty, 'DAMN.' delves into the complexities of the human experience with unflinching precision. Lamar's lyrical prowess and innovative production create a sonic journey that challenges societal norms and sparks introspection. This album is a bold statement that demands attention and reflection, solidifying Lamar's place as a visionary in the world of hip-hop",
    "Radiating warmth and authenticity, 'BE' is a testament to BTS's unwavering bond with their fans and each other. The album's intimate lyrics and infectious melodies invite listeners into a world of hope, resilience, and unity. 'BE' is a beacon of light in challenging times, reminding us of the power of music to connect and uplift spirits",
    "Eilish's sophomore album is a raw and vulnerable exploration of love, heartbreak, and self-discovery. Her haunting vocals and minimalist production create a hauntingly beautiful soundscape that lingers in the mind long after the last note fades. 'Happier Than Ever' is a testament to Eilish's growth as an artist, showcasing her ability to push boundaries and bare her soul with fearless honesty.",
    "A soul-stirring journey through heartache, healing, and self-discovery, '30' is a testament to Adele's unmatched vocal prowess and emotional depth. Each track is a poignant reflection of love, loss, and resilience, drawing listeners into Adele's world of raw emotion and vulnerability. '30' is a cathartic experience that resonates with anyone who has ever loved and lost, solidifying Adele's status as a powerhouse in the world of music",
]

banner = "*"*50
for ex in examples:
    # Header and review text first, then the prediction for that review.
    print(banner, "EXAMPLE", ex, sep="\n")
    print(sentiment_model(ex))