In [None]:
# Log this notebook session in to the Hugging Face Hub.
# NOTE(review): presumably required because the training cell sets
# push_to_hub=True — confirm. This cell runs before the pip-install cell, so
# huggingface_hub must already be available in the environment at this point.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install evaluate -q
!pip install huggingface_hub -q

In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
import evaluate
import json
import datasets
from datasets import load_dataset

In [None]:
def dataset(path="drive/MyDrive/NLP/reddit_dataset.json"):
    """Load the Reddit dump and return parallel lists of texts and topics.

    Args:
        path: Location of a JSON file holding a list of records. Each record
            must provide the keys ``'parent_body'``, ``'body'`` and ``'topic'``.
            Defaults to the path used throughout this notebook.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the parent
        comment and the reply joined by a single space; ``labels[i]`` is the
        record's ``'topic'`` value.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    sentences = [x['parent_body'] + " " + x['body'] for x in data]
    labels = [x['topic'] for x in data]
    return sentences, labels

In [None]:
# Integer class id for each topic string; the id is the topic's position in
# this tuple.
topic_to_label_map = {
    topic: class_id
    for class_id, topic in enumerate(
        ('Education', 'Politics', 'Healthcare', 'Environment', 'Technology', 'unknown')
    )
}

# Raw texts and their topic strings, plus the tokenizer used by the cells below.
sentences, labels = dataset()
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def make_dataset(test_per_class=1000, val_size=0.2, seed=None):
  """Split the module-level ``sentences``/``labels`` into train/val/test lists.

  For every topic in ``topic_to_label_map`` a fixed number of rows is held out
  for the test set, so the test set is balanced across topics. The remaining
  rows are shuffled and divided into training and validation parts. Topic
  strings are converted to integer ids via ``topic_to_label_map``; rows whose
  topic is not a key of that map are dropped.

  Args:
    test_per_class: Number of rows of each topic reserved for the test set.
    val_size: Passed to ``train_test_split`` as ``test_size``; the share of
      the non-test rows that becomes the validation set.
    seed: Optional integer. When given, every random draw in this function is
      driven by one generator seeded with it, so the split is reproducible.
      ``None`` (default) keeps the draws unseeded.

  Returns:
    ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the ``X_*``
    are lists of texts and the ``y_*`` are lists of integer class ids.

  Raises:
    ValueError: If some topic has fewer than ``test_per_class`` rows.
  """
  # A single generator is threaded through all sampling calls; passing the
  # same integer to each call would correlate the individual draws.
  rng = np.random.RandomState(seed) if seed is not None else None

  ds = pd.DataFrame(list(zip(sentences, labels)), columns=['sentence', 'label'])

  train_parts, test_parts = [], []
  for topic in topic_to_label_map:
    ds_topic = ds[ds['label'] == topic]
    if len(ds_topic) < test_per_class:
      raise ValueError(
          f"topic {topic!r} has only {len(ds_topic)} rows; "
          f"cannot hold out {test_per_class} for the test set"
      )
    ds_topic_test = ds_topic.sample(test_per_class, random_state=rng)
    test_parts.append(ds_topic_test)
    # Everything that was not drawn for the test set stays available for training.
    train_parts.append(ds_topic[~ds_topic.index.isin(ds_topic_test.index)])

  ds_train = pd.concat(train_parts, ignore_index=True)
  ds_test = pd.concat(test_parts, ignore_index=True)

  ds_train['label'] = ds_train['label'].map(topic_to_label_map)
  ds_test['label'] = ds_test['label'].map(topic_to_label_map)

  # The concatenated frames are grouped by topic, so shuffle the rows.
  ds_train = ds_train.sample(frac=1, random_state=rng)
  ds_test = ds_test.sample(frac=1, random_state=rng)

  X_train, y_train = ds_train['sentence'].tolist(), ds_train['label'].tolist()
  X_test, y_test = ds_test['sentence'].tolist(), ds_test['label'].tolist()

  X_train, X_val, y_train, y_val = train_test_split(
      X_train, y_train, test_size=val_size, random_state=rng
  )

  return X_train, X_val, X_test, y_train, y_val, y_test

# Build the train/validation/test text and label lists used by the cells below.
X_train, X_val, X_test, y_train, y_val, y_test = make_dataset()
# train_encodings, val_encodings, test_encodings, y_train, y_val, y_test = make_dataset()



class TopicsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each text with its label.

    Indexing returns a dict with the keys ``'text'`` and ``'labels'``; the
    length is taken from ``labels``.
    """

    def __init__(self, text, labels):
        super().__init__()
        self.text, self.labels = text, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Look up both fields first, then pack them into one sample dict.
        sample_text = self.text[index]
        sample_label = self.labels[index]
        return {'text': sample_text, 'labels': sample_label}

# class TopicsDataset(torch.utils.data.Dataset):
#     def __init__(self, text, labels):
#         self.text = text
#         self.labels = labels
    
#     def __getitem__(self, index):
#         item = {key: torch.tensor(val[index]) for key, val in self.text.items()}
#         item['labels'] = torch.tensor(self.labels[index])
#         return item

#     def __len__(self):
#         return len(self.labels)                    

# train_dataset = TopicsDataset(train_encodings[:10], y_train[:10])
# val_dataset = TopicsDataset(val_encodings[:10], y_val[:10])
# test_dataset = TopicsDataset(test_encodings[:10], y_test[:10])

    


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def tokenization(batched_text):
    """Tokenize the ``'text'`` entry of a batch with the module-level ``tokenizer``.

    The texts are padded to the tokenizer's maximum length, truncated when
    longer, and the tokenizer is asked for PyTorch tensors.
    """
    texts = batched_text['text']
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

In [None]:
# Wrap each (texts, labels) pair in a Hugging Face Dataset with the columns
# 'text' and 'labels'.
train_dataset, val_dataset, test_dataset = [
    datasets.Dataset.from_dict({'text': texts, 'labels': targets})
    for texts, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
]

In [None]:
# Run the tokenizer over every split, 16 examples per call.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenization, batched=True, batch_size=16)
    for split in (train_dataset, val_dataset, test_dataset)
]

Map:   0%|          | 0/71894 [00:00<?, ? examples/s]

Map:   0%|          | 0/17974 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# Six-way sequence classifier loaded from the Hub checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "sentientconch/topic_classifier",
    num_labels=6,
)

# Metric objects used by compute_metrics.
acc, prec, rec = [
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall")
]

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        A flat dict mapping ``'accuracy'``, ``'precision'`` and ``'recall'``
        to scalar values.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Each metric's compute() returns a one-entry dict keyed by the metric
    # name. Unwrap it so the Trainer receives scalars; nested dicts cannot be
    # logged as scalars and are dropped by the logger.
    return {
        'accuracy': acc.compute(
            predictions=predictions, references=labels
        )['accuracy'],
        'precision': prec.compute(
            predictions=predictions, references=labels, average='weighted'
        )['precision'],
        'recall': rec.compute(
            predictions=predictions, references=labels, average='weighted'
        )['recall'],
    }

Downloading (…)lve/main/config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [None]:
# Two epochs with batches of 16; evaluation, logging and checkpointing all
# happen once per epoch, and results are pushed to the Hub.
training_args = TrainingArguments(
    output_dir='topic_classifier',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    push_to_hub=True,
)

# Train on the training split and evaluate on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Cloning https://huggingface.co/sentientconch/topic_classifier into local empty directory.


Download file pytorch_model.bin:   0%|          | 15.4k/413M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Clean file training_args.bin:  29%|##8       | 1.00k/3.50k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/413M [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.2878,0.207276,{'accuracy': 0.9365750528541226},{'precision': 0.9368429467954259},{'recall': 0.9365750528541226}
2,0.1205,0.238539,{'accuracy': 0.9472015132969845},{'precision': 0.9473603583409909},{'recall': 0.9472015132969845}


Trainer is attempting to log a value of "{'accuracy': 0.9365750528541226}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9368429467954259}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.9365750528541226}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9472015132969845}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.9473603583409909}" of

TrainOutput(global_step=8988, training_loss=0.20413876216465704, metrics={'train_runtime': 14095.8674, 'train_samples_per_second': 10.201, 'train_steps_per_second': 0.638, 'total_flos': 3.783357114878362e+16, 'train_loss': 0.20413876216465704, 'epoch': 2.0})