In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import json
import re
import os
from datasets import Dataset
import evaluate
import nltk

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
os.getcwd()

'/content'

In [5]:
#Test and set gpu for use in colab
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

Found GPU at: /device:GPU:0
Mon Nov  4 08:23:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0              52W / 400W |    423MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                        

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
train = "/content/drive/My Drive/Colab Notebooks/redditComments_train.jsonlist"
test = "/content/drive/My Drive/Colab Notebooks/redditComments_test_notGraded.jsonlist"

Convert to Dataframes

In [32]:
#Tokenizer has to go over strings as a list. Int label will cause errors if left as df
#Replace characters and process text
def file_to_dataframe(file):
  data_set = []
  with open(file,  encoding='utf-8') as json_file:
    for f in json_file.readlines():
      f = json.loads(f)
      line = dict()
      line['labels'] = f['subreddit'].lower()
      line['text'] = f['body'].lower()
      data_set.append(line)
  data_set = pd.DataFrame.from_dict(data_set)

  url_pattern = r'\((http[s]?://[^\s]+)\)'
  #Removing stop words not necessary with Distillbert tokenizer-may reduce perf.
  stop_words = set(stopwords.words('english'))
  #Create dictionary for lables and text.

  #Process as usual, ommitting URLs
  replacement_rules = {'“': '"', '”': '"', '’': "'", '--': ','}
  replacements = lambda text: ''.join(replacement_rules.get(char, char) for char in text)
  #data_set['text'] = list(map(replacements, data_set['text']))
  data_set['text'] = data_set['text'].apply(replacements)
  #data_set['labels'] = data_set['labels'].astype(int)

  #Change labels to int. Needs to be cast as int for training

  le = LabelEncoder()
  if file == 'test':
    data_set['labels'] = le.transform(data_set['labels'])
  else:
    data_set['labels'] = le.fit_transform(data_set['labels'])

  def process_text(text):
    combined_tokens = []
    urls = re.findall(url_pattern, text)
    for url in urls:
        text = re.sub(r'https?:\/\/(www\.)?', ' ', text)
        #text = re.sub(r'http\S+', '' , text)
        sub_domain = url.split('/')
        domain_name = sub_domain[0]
        path_words = sub_domain[1:] if len(sub_domain) > 1 else []
        #Split domain and rest of url. Domain will likely be class specific
        split_domain = domain_name.split('.')
        path_tokens = []
        for i in path_words:
            path_tokens.extend(re.split(r'[\/\-_+\)\(\.]', i))
        split_url = split_domain + [token for token in path_tokens if token.strip()]
        #Take care of edge cases
        for j in split_url:
            if ' ' in j:
                combined_tokens.extend(j.lower().split())
            else:
                combined_tokens.append(j.lower())
        for word in combined_tokens:
            word = re.sub(r'\W', '', word.lower())
    #Remove processed urls or else duplicates
    text = re.sub(url_pattern, '', text)
    words = re.findall(r'\b\w+(?:\'\w+)?\b', text)
    cleaned_words = []
    for word in words:
        if word not in combined_tokens:
            if ' ' in word:
                cleaned_words.extend(word.lower().split())
            else:
                cleaned_words.append(word.lower())
    combined_tokens.extend(cleaned_words)
    #May not be needed - tokenizer performs many of these steps
    lemmatizer = WordNetLemmatizer()
    processed_words = [lemmatizer.lemmatize(word) for word in combined_tokens]
    return ' '.join(processed_words)
    #return text

  data_set['text'] = [process_text(text) for text in data_set['text']]
  #Convert to tokenized format. Needed for tokenization step
  return data_set
  #return Dataset.from_pandas(data_set)

In [33]:
train_set = file_to_dataframe(train)
test_set = file_to_dataframe(test)

In [10]:
train_set['text'][:10]

Unnamed: 0,text
0,i found that my eat performed substantially be...
1,n strike era and elite era blasters have radic...
2,really that's actually really interesting so a...
3,pretty much less dead volume and more spring e...
4,huh wild thanks for explaining
5,well that's wacky fun and might be a tiny nigh...
6,why doesn't hasbro just look to this community...
7,i want to see this turned into a real thing
8,hasbro needs to come on and make an elite perf...
9,this looks cool but internals


In [34]:
train_set, val_set = train_test_split(train_set, train_size=0.8, random_state=42)
train_set = train_set.reset_index(drop=True)
val_set = val_set.reset_index(drop=True)

In [12]:
len(val_set)

3920

In [13]:
#Change setting to show entire string in column
pd.set_option('display.max_colwidth', None)
#print(train_set['text'][:1].to_string(index=False))

In [35]:
#train_set = train_set.shuffle()
train_set.head()
#Confirmed files loaded

Unnamed: 0,labels,text
0,3,sorry to bother you but i have another question will any melody work over the chord progression a long a it in the key of dm
1,2,just go active and walk about with the gi bill you'll have veteran's preference point for any job and the fbi ha veteran only position you can then slide into the reserve and have a fed job which give you 15 paid work day to drill and other opportunity of longer assignment if you do 4 active you also get to buy back that 4 year towards your fed pension and also keep that 4 year toward a navy pension if you go reserve after active duty
2,3,from noodling around for a few year
3,1,i wish i think i might have used the term safety incorrectly i meant to refer the switch that rev up the flywheel motor so we gotta make it semi automatic but i love the speed at which the dart are shooting at i think i will be seriously considering using a solenoid i've never used one thank you for the direction i'm gonna start looking for resource now but if you know of a good tutorial please let me know
4,3,haha yea when i tried google it gave me a bunch of chordify link that didnt sound right at all i know it wa kinda a stupid question but i figured hey if there is a better way then you guy would know


In [15]:
from collections import Counter
counts = Counter(train_set['labels'])
print(counts)

Counter({0: 3945, 2: 3933, 1: 3931, 3: 3871})


In [16]:
test_set.tail()

Unnamed: 0,labels,text
195,2,been asked countless times search away x200b https www reddit com r newtothenavy search q hm 20atf restrict _sr 1
196,2,oh good god are you just being downvoted because of your username search this question kiddos
197,2,yes ask your unit about adt adsw and or additional drill opportunities also at the beginning of the year you don't just have the two weeks at starts at 29 days with what's called exceptional or e at so you can do more if you get signed up early
198,2,funeral honors
199,2,sure go ia


In [36]:
#Convert to array so that tokenizer can parse labels
train_set = Dataset.from_pandas(train_set)
val_set = Dataset.from_pandas(val_set)
test_set = Dataset.from_pandas(test_set)

In [18]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Tokenize function
Replace with datasets

In [19]:
#Map tokenizer
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

In [37]:
tokenized_dataset_train = train_set.map(tokenize_function)

Map:   0%|          | 0/15680 [00:00<?, ? examples/s]

In [38]:
tokenized_dataset_val = val_set.map(tokenize_function, batched = True)

Map:   0%|          | 0/3920 [00:00<?, ? examples/s]

In [39]:
tokenized_dataset_test = test_set.map(tokenize_function, batched = True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [23]:
type(tokenized_dataset_train)

In [24]:
#Option to use smaller set from tokenized set - cut down training time from 1m examples
small_train_dataset = tokenized_dataset_train.shuffle(seed=42).select(range(7500))

In [25]:
tokenized_dataset_val[:1]

{'labels': [0],
 'text': ['your level is great'],
 'input_ids': [[101,
   2115,
   2504,
   2003,
   2307,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [40]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
training_args = TrainingArguments(output_dir="/content/drive/My Drive/Colab Notebooks", \
                                  num_train_epochs=5, learning_rate = 0.00001,\
                                  #per_device_train_batch_size=16, \
                                  #per_device_eval_batch_size=16,
                                  eval_strategy="epoch", save_strategy="epoch")

In [52]:
metric = evaluate.load("accuracy")

In [53]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [54]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    compute_metrics=compute_metrics,
)

In [55]:
trainer.train()#resume_from_checkpoint = True)

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1813,0.77179,0.835204
2,0.1288,0.816929,0.840561
3,0.0886,0.9578,0.8375
4,0.1005,0.974773,0.839031
5,0.0845,1.001253,0.833673


TrainOutput(global_step=9800, training_loss=0.11711844677827796, metrics={'train_runtime': 1124.2119, 'train_samples_per_second': 69.738, 'train_steps_per_second': 8.717, 'total_flos': 1.03858144739328e+16, 'train_loss': 0.11711844677827796, 'epoch': 5.0})

In [58]:
trainer.save_model('model')

In [56]:
trainer.evaluate()

{'eval_loss': 1.0012530088424683,
 'eval_accuracy': 0.8336734693877551,
 'eval_runtime': 17.4198,
 'eval_samples_per_second': 225.031,
 'eval_steps_per_second': 28.129,
 'epoch': 5.0}

In [57]:
test_labels = tokenized_dataset_test['labels']
