<a href="https://colab.research.google.com/github/mertcan-basut/llm-applications/blob/main/llm_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q openai
!pip install -q tiktoken
!pip install -q python-dotenv

In [3]:
# Writes a .env file containing a placeholder value for OPENAI_API_KEY; replace
# `editme` with a real key. The import cell below loads this file via python-dotenv.
# NOTE: `>` overwrites any .env file that already exists in the working directory.
!echo "OPENAI_API_KEY=editme" > .env

In [315]:
from openai import OpenAI as OpenAIClient

import tiktoken

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(), override=True) # read local .env file

## Prepare dataset

The contextual and semantic understanding of LLMs is used to classify BBC news articles into five distinct categories: `tech`, `business`, `sport`, `entertainment`, and `politics`.

In [5]:
# Kaggle API Token is downloaded from https://www.kaggle.com/settings
# and uploaded to the file system's working directory

!mkdir .kaggle
!mv kaggle.json .kaggle/kaggle.json
!chmod 600 .kaggle/kaggle.json

!kaggle datasets download yufengdev/bbc-fulltext-and-category
!unzip bbc-fulltext-and-category.zip -d data/
!rm bbc-fulltext-and-category.zip

Dataset URL: https://www.kaggle.com/datasets/yufengdev/bbc-fulltext-and-category
License(s): CC0-1.0
Downloading bbc-fulltext-and-category.zip to /content
  0% 0.00/1.83M [00:00<?, ?B/s]
100% 1.83M/1.83M [00:00<00:00, 147MB/s]
Archive:  bbc-fulltext-and-category.zip
  inflating: data/bbc-text.csv       


In [289]:
def get_samples(data: pd.DataFrame, categories_col_name: str, samples_len: int, random_state: int = 42) -> pd.DataFrame:
  """
  Get samples from the dataset while keeping the balance of the samples with respect to the classification categories.

  When fewer samples than categories are requested, `samples_len` distinct
  categories are drawn at random and one row is taken from each of them.
  Otherwise a single stratified shuffle split on `categories_col_name`
  selects `samples_len` rows.

  Args:
    data (pd.DataFrame): The dataset.
    categories_col_name (str): The name of the column containing the classification categories.
    samples_len (int): The number of samples to get.
    random_state (int): Seed used for every random draw, so repeated calls
      with the same arguments return the same rows.

  Returns:
    pd.DataFrame: The selected rows of `data`, keeping their original index labels.

  Raises:
    ValueError: If `samples_len` is negative.
  """
  if samples_len < 0:
    raise ValueError(f"samples_len must be non-negative, got {samples_len}")
  if samples_len == 0:
    # Nothing requested: return an empty frame with the same columns
    # instead of failing inside pd.concat on an empty list.
    return data.iloc[0:0]

  categories = data[categories_col_name].unique()
  if samples_len < len(categories):
    # A seeded generator keeps the choice of categories reproducible,
    # like the per-category row sampling below.
    rng = np.random.default_rng(random_state)
    chosen_categories = rng.choice(categories, size=samples_len, replace=False)
    return pd.concat([
      data[data[categories_col_name] == category].sample(n=1, random_state=random_state)
      for category in chosen_categories
    ])

  splitter = StratifiedShuffleSplit(n_splits=1, test_size=samples_len, random_state=random_state)
  # Stratify on the caller-supplied column (not a hard-coded 'category');
  # with n_splits=1 the generator yields exactly one (train, test) pair.
  _, sample_positions = next(splitter.split(data, data[categories_col_name]))
  return data.iloc[sample_positions]

In [290]:
# Load the BBC articles and normalise the category labels to lower case.
data = pd.read_csv("data/bbc-text.csv")
data['category'] = data['category'].str.lower()
categories = data['category'].unique()

# Draw the few-shot examples first and drop them before sampling the test set,
# so no article appears in both.
few_shot_samples = get_samples(data, 'category', 5)
# .copy(): get_samples returns rows selected from its input frame; an explicit
# copy lets later cells add a `prediction` column without pandas raising a
# SettingWithCopyWarning.
test_data = get_samples(data.drop(few_shot_samples.index), 'category', 100).copy()

## Classification using *function calling*

In [340]:
# NOTE: category_descriptions, sys_prompt, usr_prompt, client and llm_model_name
# are assigned again in the log-probabilities section further down. Re-run this
# cell before re-running the function-calling classifier after that section.

# One tool per category: the tool name is the label, the description tells the
# model when to pick it.
category_descriptions = {
  'tech': "News articles or a piece of text about `technology` related topics.",
  'business': "News articles or a piece of text about `business` related topics.",
  'sport': "News articles or a piece of text about `sports` related topics.",
  'entertainment': "News articles or a piece of text about `entertainment` related topics.",
  'politics': "News articles or a piece of text about `politics` related topics."
}

tools = [
  {
    'type': 'function',
    'function': {
      'name': key,
      'description': value
    }
  } for key, value in category_descriptions.items()
]

# System prompt: task statement followed by the few-shot examples.
sys_prompt = """You are a news article classifier. \
Your task is to classify the category of the given article into \
tech, business, sport, entertainment, or politics.

{few_shot_examples}""".format(
  few_shot_examples="\n".join([f"Text: ```{row['text']}```\nCategory: {row['category']}" for _, row in few_shot_samples.iterrows()])
)

# User prompt template; `{text}` is filled in per article.
usr_prompt = """Text: ```{text}```"""

client = OpenAIClient()
llm_model_name = "gpt-4-turbo"

In [327]:
def classification(text: str):
  """
  Classify one article through a chat completion that uses the category tools.

  The request is sent with `tool_choice='required'` and the module-level
  `tools`, `sys_prompt`, `usr_prompt`, `client` and `llm_model_name`.
  Note that a later cell in this notebook defines another function with the
  same name.

  Args:
    text (str): The article text to classify.

  Returns:
    The function name of the first tool call in the first returned choice.
  """
  conversation = [
    {'role': 'system', 'content': sys_prompt},
    {'role': 'user', 'content': usr_prompt.format(text=text)}
  ]
  completion = client.chat.completions.create(
    model=llm_model_name,
    messages=conversation,
    tools=tools,
    tool_choice='required', # 'auto'|'required'
    parallel_tool_calls=False,
    temperature=0.0
  )

  first_choice = completion.choices[0]
  return first_choice.message.tool_calls[0].function.name

In [None]:
# Classify every test article, then score the predictions against the true labels.
test_data['prediction'] = test_data['text'].apply(classification)
accuracy_score(y_true=test_data['category'], y_pred=test_data['prediction'])

## Classification using *log probabilities*

In [307]:
# Short per-category descriptions that are listed in the system prompt.
category_descriptions = {
  'tech': "News articles about technology.",
  'business': "News articles about business.",
  'sport': "News articles about sports.",
  'entertainment': "News articles about entertainment.",
  'politics': "News articles about politics."
}

# System prompt: task statement, the category list, then the few-shot examples.
sys_prompt = """You are a news article classifier. \
You are given a news article as text and you need to classify it into \
one of the following categories:

{category_descriptions}

{few_shot_examples}""".format(
  category_descriptions="\n".join(
    f"- `{name}`: {description}" for name, description in category_descriptions.items()
  ),
  few_shot_examples="\n".join(
    f"Text: ```{example['text']}```\nCategory: {example['category']}" for _, example in few_shot_samples.iterrows()
  )
)

# User prompt template; `{text}` is filled in per article.
usr_prompt = """Classify the text into tech, business, sport, entertainment, or politics. \
Text: ```{text}```
Category: """

client = OpenAIClient()
llm_model_name = "gpt-4"

tokenizer = tiktoken.encoding_for_model(llm_model_name)
# Every token id that occurs in any category name, plus the end-of-text token.
categories_token = [
  token_id
  for label in categories
  for token_id in tokenizer.encode(label)
]
categories_token.append(tokenizer.eot_token)
# Longest category name measured in tokens, plus one extra token.
max_tokens = 1 + max(len(tokenizer.encode(label)) for label in categories)

In [311]:
def classification(text: str):
  """
  Classify one article from a logit-biased chat completion and score the answer.

  Every token id in the module-level `categories_token` receives a logit bias
  of 100, and the completion is limited to `max_tokens` tokens. Note that an
  earlier cell in this notebook defines another function with the same name.

  Args:
    text (str): The article text to classify.

  Returns:
    tuple: `(category, confidence)`, where `category` is the message content of
    the first choice and `confidence` is the product of `exp(logprob)` over the
    tokens reported for that choice.
  """
  token_bias = {token: 100 for token in categories_token}
  conversation = [
    {'role': 'system', 'content': sys_prompt},
    {'role': 'user', 'content': usr_prompt.format(text=text)}
  ]
  completion = client.chat.completions.create(
    model=llm_model_name,
    messages=conversation,
    logprobs=True,
    logit_bias=token_bias,
    max_tokens=max_tokens,
    temperature=0.0
  )

  answer = completion.choices[0]
  category = answer.message.content
  token_probabilities = [np.exp(entry.logprob) for entry in answer.logprobs.content]
  return category, np.prod(token_probabilities)

In [312]:
# Keep only the predicted label (first element of the returned tuple), then
# score the predictions against the true labels.
test_data['prediction'] = test_data['text'].apply(lambda article: classification(article)[0])
accuracy_score(y_true=test_data['category'], y_pred=test_data['prediction'])

0.94