In [None]:
!pip install transformers datasets evaluate
!pip install tqdm

In [None]:
# Cloud Storage
# from google.cloud import storage
# storage_client = storage.Client(project='YOUR PROJECT ID')

In [None]:
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from datasets import Dataset, load_dataset, load_from_disk
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer
import torch 
from pynvml import *


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


def print_gpu_utilization():
    """Print the memory in use on GPU index 0 as reported by NVML.

    The value printed is ``info.used`` integer-divided by ``1024**2``.
    NVML is initialised for the duration of the query and shut down again
    afterwards, so repeated calls do not leave an initialised handle behind.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Always pair nvmlInit() with nvmlShutdown(), even if the query fails.
        nvmlShutdown()


def print_summary(result):
    """Print runtime and throughput of a training run, then the GPU memory use.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    metrics = result.metrics
    print("Time: {:.2f}".format(metrics['train_runtime']))
    print("Samples/second: {:.2f}".format(metrics['train_samples_per_second']))
    print_gpu_utilization()


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Pipeline switches read by the cells below.
Refresh_Topic = False  # True: rebuild df_topics with refresh_topic(); False: read processed_topic_file
Refresh_Train_Data = False  # True: rebuild final_train_data with load_train_data(); False: read train_data_file
Refresh_Tokenize_DB = False  # True: tokenize and save the dataset; False: load full_tokenized_db_file from disk
Train_model = False  # True: call train_model(); False: skip training
dver = 201  # version number embedded in the cached file names and the model name

In [None]:
# Raw competition tables, read from the Kaggle input directory.
DATA_PATH = "/kaggle/input/learning-equality-curriculum-recommendations/"
topics = pd.read_csv(DATA_PATH + "topics.csv")
content = pd.read_csv(DATA_PATH + "content.csv")
correlations = pd.read_csv(DATA_PATH + "correlations.csv")
# Placeholder only; df_topics is assigned in the Refresh_Topic cell further down.
df_topics = None

In [None]:
# Locations of the cached artifacts; every name carries the data version `dver`.
processed_topic_file = f"./data/df_topics_v{dver}.pkl"
train_data_file= f"./data/df_train_v{dver}.pkl"
full_tokenized_db_file=f"./data/full_tokenized_db_v{dver}.hf"
model_name = f"lecr-text-classification-v{dver}"
# Used as the Trainer output_dir (checkpoints are loaded from below this path).
model_file = f"./model/kaggle/working/{model_name}"
# Prefix for the per-shard saved models; note that dver appears twice in it
# (once via model_name, once as the trailing suffix).
trained_model_file = f"./model/kaggle/working/{model_name}_{dver}"

In [None]:
# Prefix every column with its table name so topic and content columns stay
# distinguishable after merging. Each frame is guarded on its own first column,
# so re-running the cell (or reloading only one of the frames) can neither
# double-prefix a frame nor skip the rename it still needs.
if not topics.columns[0].startswith("topic_"):
    topics.rename(columns=lambda x: "topic_" + x, inplace=True)
if not content.columns[0].startswith("content_"):
    content.rename(columns=lambda x: "content_" + x, inplace=True)

In [None]:
def get_parents(df, row):
    """Return a topic's title prefixed with the title of its direct parent.

    Only one level of the hierarchy is resolved: the parent row is looked up
    by ``topic_id == row["topic_parent"]`` and, if several rows match, the
    first one is used and a warning is printed.

    Args:
        df: DataFrame of topics with ``topic_id`` and ``topic_title`` columns.
        row: one topic row providing ``topic_id``, ``topic_title`` and
            ``topic_parent``.

    Returns:
        ``"<parent title>.<topic title>"`` when a parent row exists and its
        title is not missing, otherwise ``str(row["topic_title"])``.
    """
    topic_id = row["topic_id"]
    topic_title = str(row["topic_title"])
    topic_parent = row["topic_parent"]

    # assume we only have one parent
    parents = df.loc[df["topic_id"] == topic_parent]
    if len(parents) > 1:
        print(f"We found multiple parents for topic: {topic_id} parent_id: {topic_parent}")
    if len(parents) == 0:
        return topic_title

    parent_title = parents.iloc[0]["topic_title"]
    if pd.isna(parent_title):
        # A parent without a title contributes no prefix.
        return topic_title
    return str(parent_title) + "." + topic_title

In [None]:
def refresh_topic(topics):
    """Add a ``topic_title_full`` column to ``topics`` and cache the frame.

    ``topic_title_full`` is ``"<parent title>.<own title>"`` when the row's
    ``topic_parent`` matches a ``topic_id`` whose title is not missing, and
    ``str(topic_title)`` otherwise. Parent titles are resolved with a single
    lookup table instead of one full-frame scan per row, so the cost grows
    linearly with the number of topics.

    Args:
        topics: DataFrame with ``topic_id``, ``topic_title`` and
            ``topic_parent`` columns. It is modified in place.

    Returns:
        The same DataFrame object, with the new column added. The frame is
        also pickled to the module-level ``processed_topic_file``.
    """
    df_topics = topics  # same object: the new column lands on the caller's frame

    print(df_topics.head())

    # Lookup of topic_id -> title, keeping the first row when an id repeats.
    # Rows without an id can never be matched as a parent, so they are dropped.
    known = df_topics.dropna(subset=["topic_id"])
    duplicated_ids = set(known.loc[known["topic_id"].duplicated(), "topic_id"])
    title_by_id = known.drop_duplicates("topic_id").set_index("topic_id")["topic_title"]

    if duplicated_ids:
        # Same diagnostic as before: one line per topic whose parent id is ambiguous.
        for topic_id, topic_parent in zip(df_topics["topic_id"], df_topics["topic_parent"]):
            if topic_parent in duplicated_ids:
                print(f"We found multiple parents for topic: {topic_id} parent_id: {topic_parent}")

    own_titles = df_topics["topic_title"].map(str)
    parent_titles = df_topics["topic_parent"].map(title_by_id)
    has_parent_title = parent_titles.notna()
    prefixed = parent_titles.map(str) + "." + own_titles

    # Assign positionally (as a list) so the frame's index labels play no role.
    df_topics['topic_title_full'] = own_titles.where(~has_parent_title, prefixed).tolist()

    print(df_topics.head())

    # Make sure the cache directory exists before writing into it.
    out_dir = os.path.dirname(processed_topic_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_topics.to_pickle(processed_topic_file)
    return df_topics

In [None]:
# Either rebuild df_topics from the raw topics table (which also rewrites the
# cache file) or read the previously pickled frame from processed_topic_file.
if (Refresh_Topic):
    print(f"Refresh_Topic >>> ")
    df_topics = refresh_topic(topics)
else:
    print(f"load df_topics from processed_topic_file")
    df_topics = pd.read_pickle(processed_topic_file)

In [None]:
df_topics

In [None]:
# import os

# mf = processed_topic_file
# mfz = f'{processed_topic_file}.tar.gz'
# !tar -czf {mfz} {mf}

# from IPython.display import FileLink

# FileLink(mfz)

In [None]:
def load_train_data(topics):
    """Build the ``(text_label, text, label)`` training frame and cache it.

    Unless ``topics`` already has a ``content_id`` column, the module-level
    ``correlations`` table is expanded to one row per (topic, content) pair
    and left-joined with the module-level ``df_topics`` and ``content``
    frames. ``text`` is the content title, description and text joined with
    ``"_"``; ``text_label`` is ``topic_title_full``; ``label`` is the integer
    category code of ``text_label``.

    Args:
        topics: DataFrame that is only inspected for a ``content_id`` column.
            The join itself uses the module-level ``df_topics``.

    Returns:
        DataFrame with columns ``text_label``, ``text`` and ``label``. It is
        also pickled to the module-level ``train_data_file``.
    """
    train_df_columns = ["topic_title", "content_title", "topic_title_full", "topic_id","content_id", "content_description", "content_text" ]

    if "content_id" in topics.columns:
        # NOTE(review): this case previously fell through with `corr` unbound
        # and raised a NameError. Presumably a frame that already carries
        # content_id is the joined table itself -- confirm with the author.
        corr = topics
    else:
        # Work on a copy so the module-level `correlations` frame is not modified.
        corr = (
            correlations
            .assign(content_id=correlations["content_ids"].str.split(" "))
            .explode("content_id")
            .drop(columns=["content_ids"])
            .merge(df_topics, how="left", on="topic_id")
            .merge(content, how="left", on="content_id")
        )

    train_df = corr[train_df_columns].copy()
    cols = ['content_title', 'content_description', "content_text"]
    # Missing values are stringified as well, exactly as before.
    train_df['content_full'] = train_df[cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
    train_df = train_df.rename(columns={"topic_title_full": "text_label", "content_full": "text"})

    final_train_data = train_df[["text_label", "text"]].copy()
    final_train_data["label"] = final_train_data['text_label'].astype('category').cat.codes

    # Make sure the cache directory exists before writing into it.
    out_dir = os.path.dirname(train_data_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    final_train_data.to_pickle(train_data_file)

    return final_train_data

In [None]:
# Either rebuild the training frame (which also rewrites the cache file) or
# read the previously pickled frame from train_data_file.
if Refresh_Train_Data:
    print(f"Refresh_Train_Data ==>>>")
    final_train_data = load_train_data(topics)
else:
    print(f"load final_train_data from {train_data_file}")
    final_train_data = pd.read_pickle(train_data_file)

In [None]:
# import os
# # os.chdir(r'/kaggle/working')
# resf = train_data_file
# reszipf = f"{resf}.tar.gz"
# !tar -czf {reszipf} {resf}

# from IPython.display import FileLink

# FileLink(reszipf)

In [None]:
# Sanity check: the number of distinct label strings and of distinct label codes.
print(f"text_label count: {len(final_train_data.text_label.unique())}  label count: {len(final_train_data.label.unique())}")

In [None]:
# datasets

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Tokenizer matching the base checkpoint that train_model() fine-tunes.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded to the maximum length, and PyTorch
    tensors are requested from the tokenizer.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [None]:
# Either tokenize the training frame and save the result, or load the dataset
# that an earlier run saved under full_tokenized_db_file.
if (Refresh_Tokenize_DB):
    print(f"Refresh_Tokenize_DB ===>>")
    # Fixed seed so the train/test partition is identical on every rebuild.
    datasets = Dataset.from_pandas(final_train_data).train_test_split(test_size=0.2, seed=42)
    full_tokenized_db = datasets.map(preprocess_function, batched=True)
    full_tokenized_db = full_tokenized_db.remove_columns(["text"])
    # The Trainer expects the target column to be called "labels".
    full_tokenized_db = full_tokenized_db.rename_column("label", "labels")
    full_tokenized_db.set_format("torch")
    full_tokenized_db.save_to_disk(full_tokenized_db_file)

else:
    print(f"load from disk {full_tokenized_db_file}")
    full_tokenized_db = load_from_disk(full_tokenized_db_file)
    # full_tokenized_db = load_dataset("", data_dir=full_tokenized_db_file, streaming=True)
print(final_train_data)
print(full_tokenized_db)

from transformers import DataCollatorWithPadding

# Collator handed to the Trainer in train_model().
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Label-string <-> label-code maps. They are built by zipping the full columns,
# so a pair that occurs on many rows simply overwrites the same key.
label2id = dict(zip(final_train_data.text_label, final_train_data.label))
id2label = dict(zip(final_train_data.label, final_train_data.text_label))
# Number of distinct label codes; passed to the model as num_labels.
num_label = len(id2label)
print(f"label2id length: {len(label2id)}")
print(f"id2label length: {len(id2label)}")

# # train_label2id = dict(zip(full_tokenized_db["train"]["text_label"], full_tokenized_db["train"]["labels"]))
# train_label2id = {}
# train_id2label = {}
# idx = 0
# for val in full_tokenized_db["train"]:
#     idx += 1

#     train_label2id[val["text_label"]] = val["labels"]
#     train_id2label[val["labels"]]=val["text_label"]
#     if (idx % 1000 == 0):
#         print(idx, val["text_label"], val["labels"], len(train_label2id), len(train_id2label) )
# print(len(train_label2id),  len(train_id2label))
# # train_id2label = dict(zip(full_tokenized_db["train"]["labels"], full_tokenized_db["train"]["text_label"]))

# test_label2id = {}
# test_id2label = {}
# idx = 0
# for val in full_tokenized_db["test"]:
#     idx += 1
#     test_label2id[val["text_label"]] = val["labels"]
#     test_id2label[val["labels"]]=val["text_label"]
#     if (idx % 1000 == 0):
#         print(idx, val["text_label"], val["labels"] , len(test_label2id), len(test_id2label) )

# print(len(test_label2id),  len(test_id2label))

# test_label2id = dict(zip(full_tokenized_db["test"]["text_label"], full_tokenized_db["test"]["labels"]))
# test_id2label = dict(zip(full_tokenized_db["test"]["labels"], full_tokenized_db["test"]["text_label"]))
# label2id = {**train_label2id, **test_label2id}
# id2label = {**train_id2label, **test_id2label}
# num_label = len(id2label)
# print(f"label2id length: {len(label2id)} train_label2id {len(train_label2id)}, test_label2id {len(test_label2id)}")
# print(f"id2label length: {len(id2label)} train_id2label {len(train_id2label)}, test_id2label {len(test_id2label)}")

import evaluate

# Metric object used by compute_metrics().
accuracy = evaluate.load("accuracy")

In [None]:
full_tokenized_db

In [None]:

# Presumably frees cached GPU memory before training starts -- see the
# torch.cuda.empty_cache documentation for the exact semantics.
torch.cuda.empty_cache()
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` evaluation pair.

    The index of the highest score along axis 1 is taken as the predicted
    class and compared with the labels using the module-level ``accuracy``
    metric.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


def train_model(full_tokenized_db, tokenizer, id2label, label2id, num_label):
    """Fine-tune ``distilbert-base-uncased`` shard by shard, saving after each shard.

    The train and test splits are each cut into three contiguous shards.
    For every shard a new ``Trainer`` continues training the current model
    on the train shard and evaluates on the matching test shard. The result
    is saved under ``trained_model_file + "_<shard index>"`` and reloaded
    from there before the next shard starts.

    Args:
        full_tokenized_db: mapping with ``"train"`` and ``"test"`` datasets.
        tokenizer: tokenizer handed to the ``Trainer``.
        id2label: mapping from label id to label string for the model config.
        label2id: mapping from label string to label id for the model config.
        num_label: number of output classes.

    Returns:
        Tuple ``(model, tokenizer)`` as reloaded from the last shard's saved
        directory. Earlier versions returned nothing, so callers that ignore
        the result are unaffected.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=num_label, id2label=id2label, label2id=label2id
    )

    # Mixed-precision training is only available on CUDA devices; requesting
    # it unconditionally makes TrainingArguments fail on a CPU-only machine.
    use_fp16 = torch.cuda.is_available()

    num_shards = 3
    for shard_idx in range(num_shards):
        shard_train = full_tokenized_db["train"].shard(num_shards=num_shards, index=shard_idx, contiguous=True)
        shard_test = full_tokenized_db["test"].shard(num_shards=num_shards, index=shard_idx, contiguous=True)

        # Every shard writes its checkpoints below the same output_dir (model_file).
        training_args = TrainingArguments(
            output_dir=model_file,
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=2,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            gradient_accumulation_steps=8,
            # gradient_checkpointing=True,
            fp16=use_fp16,
            optim="adafactor",
            push_to_hub=False,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=shard_train,
            eval_dataset=shard_test,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        trainer.train()

        trained_file = trained_model_file + f"_{shard_idx}"
        print(f"save model to : {trained_file}")
        trainer.save_model(trained_file)
        print(f"load model from : {trained_file}")

        # Continue the next shard from what was just written to disk.
        model = AutoModelForSequenceClassification.from_pretrained( trained_file, num_labels=num_label, id2label=id2label, label2id=label2id )
        tokenizer = AutoTokenizer.from_pretrained(trained_file)

    return model, tokenizer

# text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
# from transformers import pipeline

# classifier = pipeline("sentiment-analysis", model=model_file)
# classifier(text)

# Training only runs when the Train_model flag is set; otherwise the cells
# below rely on a checkpoint that already exists on disk.
if (Train_model):
    train_model(full_tokenized_db, tokenizer, id2label, label2id, num_label)

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
# Load tokenizer and model from a hard-coded checkpoint directory below model_file.
# NOTE(review): "checkpoint-2332" is presumably a checkpoint written by an
# earlier training run -- verify that it exists before running this cell.
tokenizer = AutoTokenizer.from_pretrained(model_file + "/checkpoint-2332")
model = AutoModelForSequenceClassification.from_pretrained(model_file + "/checkpoint-2332")

# This split is created without a seed and independently of full_tokenized_db.
# NOTE(review): rows in this "test" part may therefore have been used for
# training, which would inflate the apparent accuracy -- confirm.
test_datasets = Dataset.from_pandas(final_train_data).train_test_split(test_size=0.2)

def get_predict_label(text):
    """Classify ``text`` with the module-level ``tokenizer`` and ``model``.

    Returns:
        Tuple ``(label, class_id)`` where ``class_id`` is the argmax over the
        model's logits and ``label`` is ``model.config.id2label[class_id]``.
    """
    encoded = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**encoded)

    class_id = output.logits.argmax().item()
    label = model.config.id2label[class_id]
    return label, class_id

# Print the expected and the predicted label for the last 100 rows of the
# freshly created test split.
idx = 0  # overwritten by the enumerate() loop below
# Slicing the dataset yields a dict of column name -> list of values.
total = test_datasets["test"][-100:]
# print(total)
size = 100  # not used by the code below; the slice above hard-codes 100
# test_set =  test_datasets["test"].select(range(total-size, total))

for idx, val in enumerate(total["text"]):
    label, class_id = get_predict_label(val)
    print(f'original>>\t{total["text_label"][idx]}\t\t<predicted>>>>\t{label}\t{class_id}')