In [1]:
# %pip install transformers

In [31]:
import json

from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

# Pretrained DistilBERT tokenizer and encoder (uncased base checkpoint).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Example sentence to embed.
input_text = "US won men's basketball in 2024 Paris Olympic."

# Turn the sentence into PyTorch tensors the model can consume.
encoded_input = tokenizer(
    input_text,
    return_tensors='pt',
    clean_up_tokenization_spaces=True,
)

# Forward pass with gradient tracking switched off (inference only).
with torch.no_grad():
    output = model(**encoded_input)

# Average the per-token hidden states along the sequence axis (dim=1) so the
# whole input is represented by a single vector.
embeddings = torch.mean(output.last_hidden_state, dim=1)

print(embeddings)

# These pooled embeddings are the input features for the topic classifier
# trained further down in this notebook.




tensor([[-1.3464e-01, -3.9202e-02,  1.0596e-01,  1.3805e-01, -1.2659e-01,
         -2.1238e-01,  1.2263e-01,  5.2369e-01, -5.6094e-01, -2.6391e-01,
          1.6788e-01, -4.3357e-01, -1.4720e-01,  3.6785e-01,  1.1393e-01,
          1.7509e-02,  2.4391e-01,  1.3413e-01,  2.4302e-01,  2.4873e-01,
          1.3223e-01,  1.0524e-01,  3.0135e-01,  3.9591e-01,  1.7112e-01,
          2.5108e-01, -3.8376e-01,  5.4299e-02, -3.2488e-03, -1.4319e-01,
          1.8547e-01, -3.3984e-01, -1.0163e-01, -4.6134e-02, -1.8497e-02,
         -2.4148e-01,  1.2116e-01, -5.2486e-02, -1.4436e-01,  8.7720e-02,
         -6.7611e-01, -2.1929e-01,  4.6112e-02,  2.1698e-01,  8.4103e-02,
          1.1627e-01,  1.2164e-01,  2.3831e-01, -2.2935e-02, -1.6549e-05,
         -1.8954e-01,  1.9479e-01, -1.1715e-01,  2.5638e-01,  1.7379e-03,
          8.2271e-01, -1.4014e-01, -5.4046e-01, -2.5901e-01, -3.9249e-01,
          1.6775e-01,  4.1612e-01, -3.5271e-02, -7.3634e-02,  2.5108e-01,
          1.6944e-01,  1.2793e-01,  1.

In [3]:
embeddings.shape

torch.Size([1, 768])

In [4]:
import os

data_dir = "../data/training"

# Every sub-directory of data_dir is a topic, except the catch-all "other" folder.
# The list keeps os.listdir() order, and that order defines the numeric class
# labels used later (labels are positions in this list).
topics = [
    entry
    for entry in os.listdir(data_dir)
    if entry != "other" and os.path.isdir(os.path.join(data_dir, entry))
]
print(topics)

['technologie', 'entertainment', 'business', 'sport', 'politics', 'historical', 'medical', 'food', 'space', 'graphics']


In [5]:
# Build one record per training document:
#   {"topic": <topic folder name>, "content": <full text of the .txt file>}
dataset = []
for topic in topics:
    topic_dir = os.path.join(data_dir, topic)
    text_files = [f for f in os.listdir(topic_dir) if f.endswith(".txt")]
    for file in text_files:
        # Explicit encoding so decoding does not depend on the platform's
        # locale default (which differs between macOS/Linux and Windows).
        with open(os.path.join(topic_dir, file), "r", encoding="utf-8") as fd:
            dataset.append({
                "topic": topic,
                "content": fd.read(),
            })

# Print a short summary instead of dumping the full text of every document.
print(f"Loaded {len(dataset)} documents across {len(topics)} topics")



In [6]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
# Load pre-trained model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Embed every document and store the result on its row.
# Loop-local names (row_encoded / row_output) are used on purpose: rebinding
# `encoded_input`, `output` or `embeddings` here would overwrite the
# sample-sentence results computed in the first cell, and a later cell passes
# `embeddings` to the classifier expecting the sample sentence.
for row in dataset:
    # Documents longer than 512 tokens are truncated to the model's input limit.
    row_encoded = tokenizer(row["content"], return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        row_output = model(**row_encoded)
    # Mean over the token axis; the batch axis is kept, so each stored
    # embedding is a tensor with a leading dimension of 1.
    row["embedding"] = row_output.last_hidden_state.mean(dim=1)

print(dataset[0])


{'topic': 'technologie', 'content': 'Games firms \'face tough future\'\n\nUK video game firms face a testing time as they prepare for the next round of games consoles, the industry warns.\n\nFred Hasson, head of Tiga, which represents independent developers, said that more UK firms would go under due to greater risks in making new titles. Three leading UK video game companies also predicted that more firms would close as they struggled to adapt. Microsoft, Sony and Nintendo are expected to release new consoles in the next 18 months. Microsoft has said repeatedly that it wants to be first to the market and some analysts predict that Xbox 2 will be released in the US before the end of 2005.\n\nThe new machines will all have much greater processing and graphical power which will have a huge impact on development of next generation games. Mr Hasson said: "In the last four years we have probably lost a third of independent developers." He said there were about 150 independent developers lef

In [11]:
# Convert each stored embedding from a torch tensor to a nested Python list so
# the dataset can be serialised to JSON.
# The isinstance guard makes this cell safe to re-run: rows that were already
# converted are left untouched instead of failing on `list.numpy()`.
for row in dataset:
    embedding = row["embedding"]
    if isinstance(embedding, torch.Tensor):
        row["embedding"] = embedding.detach().cpu().numpy().tolist()

In [13]:
import json

# Persist topics, texts and embeddings as JSON. Embeddings must already be
# plain lists at this point; tensors are not JSON-serialisable.
embedding_file = "../data/experiment/embeddings.json"
# Create the output directory first: open(..., "w") raises FileNotFoundError
# when the parent directory is missing.
os.makedirs(os.path.dirname(embedding_file), exist_ok=True)
with open(embedding_file, "w") as fp:
    json.dump(dataset, fp, indent=4)

In [17]:
# One output class per topic folder (the "other" folder was filtered out of `topics`).
num_classes = len(topics)

In [18]:
# Each stored embedding is a nested list with a leading batch axis of length 1
# (mean pooling keeps that axis), so [0] selects the vector whose length is the
# feature count.
num_features = len(dataset[0]["embedding"][0])

In [21]:
from classifier.NNClassifier import NNClassifier

# Project-local classifier sized to the embedding width and the number of topics.
# NOTE(review): hidden_dims=[256, 256] presumably means two hidden layers of
# 256 units each — confirm against classifier/NNClassifier.
classifier = NNClassifier(num_features=num_features, num_classes=num_classes, hidden_dims=[256, 256])


In [22]:
# Assemble the design matrix and the integer label vector in a single pass.
# A label is the position of the row's topic in `topics`.
feature_rows = []
label_ids = []
for row in dataset:
    feature_rows.append(row["embedding"][0])  # drop the leading batch axis
    label_ids.append(topics.index(row["topic"]))

features = np.array(feature_rows)
labels = np.array(label_ids)


In [23]:
features.shape

(1000, 768)

In [24]:
labels.shape

(1000,)

In [28]:
%pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp39-cp39-macosx_12_0_arm64.whl (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.19.5
  Downloading numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl (13.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.9/13.9 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, numpy, joblib, scikit-learn
  Attempting uninstall: threadpoolctl
    Found existing installation: threadpoolctl 2.2.0
    Uninstalling threadpoolctl-2.2.0:
    

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation; the fixed random_state keeps
# the split repeatable for a given row order.
split_parts = train_test_split(features, labels, test_size=0.2, random_state=42)
features_train, features_test, labels_train, labels_test = split_parts

In [30]:
# Fit the classifier on the training split, passing the held-out split as validation data.
# NOTE(review): how train_with uses the validation set (early stopping, metric
# logging, ...) is not visible here — confirm in NNClassifier.
classifier.train_with(features=features_train, labels=labels_train, validate_features=features_test, validate_labels=labels_test)

In [33]:
type(embeddings)

torch.Tensor

In [34]:
# Classify the pooled embedding of the sample sentence (torch tensor -> NumPy array).
# `embeddings` must still hold the sample sentence's vector from the first
# cell; make sure no cell in between has rebound that name before running this.
results = classifier.infer(embeddings.numpy())

In [35]:
results

array([[3.        , 0.99985313]])

In [36]:
# Map the predicted class index back to its topic name instead of hard-coding
# the index copied from the previous output. Per the displayed `results`, the
# first column of each row holds the class index (stored as a float).
topics[int(results[0][0])]

'sport'

In [53]:
# Embed the first two documents as one batch: inputs are truncated to 512
# tokens and padded to a common length.
sample_batch = [row["content"] for row in dataset[:2]]
encoded2 = tokenizer(
    sample_batch,
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)
with torch.no_grad():
    output = model(**encoded2)

# Per-token hidden states for the whole batch (no pooling applied here).
embeddings2 = output.last_hidden_state


RuntimeError: The size of tensor a (571) must match the size of tensor b (512) at non-singleton dimension 1

In [50]:
embeddings2

tensor([[-0.1732, -0.0015,  0.3509,  ..., -0.1563,  0.1381, -0.0815],
        [-0.0730,  0.0125,  0.2264,  ..., -0.0409, -0.0903,  0.1571]])

In [52]:
embeddings2.shape

torch.Size([2, 500, 768])