#### Goal of this project
- Classify the news into ten categories

<b>`Steps`</b>
- Install the `cohere` package in order to connect to the Cohere API
- `pip install cohere`
- Generate an API key for Cohere

In [28]:
import pandas as pd
import cohere
from sklearn.model_selection import train_test_split
# from scripts.utils import read_from_dvc
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import sys
sys.path.append('../')

In [29]:
# Read the Cohere API key from the project-local config module and build the
# client object used by the classify/embed calls later in the notebook.
from config import api_cohere

api_key = api_cohere["api_key"]
coh = cohere.Client(api_key)

In [26]:
# repo="https://github.com/niyotham/in-context-learning-LLMs"
# test=util.read_from_dvc("data/test_news.csv",repo,"test-news-v2",low_memory=False)
# train=util.read_from_dvc("data/trainer_news.csv",repo,"train-news-v2",low_memory=False)

In [30]:
# Load the news dataset from the Excel workbook stored under ../data
data = pd.read_excel('../data/news.xlsx')
# Show (rows, columns) as the cell output
data.shape

(10, 9)

In [17]:
# ! pip install openpyxl

In [31]:
# Binarize the analyst rank: values below 4 become 0, everything else becomes 1.
# NOTE(review): the column is overwritten in place, so re-running this cell
# after its first run maps every row to 0 (both 0 and 1 are below 4).
data['Analyst_Rank'] = data['Analyst_Rank'].lt(4).map({True: 0, False: 1})
# Class distribution after binarization
data['Analyst_Rank'].value_counts()

1    7
0    3
Name: Analyst_Rank, dtype: int64

In [32]:
# Work on a copy so that later steps do not modify the loaded `data` frame
df=data.copy()

`Split the dataset`
- Split the data into training and test datasets
- Training set: used for training the model
- Test set: used for evaluating the classifier's performance

In [33]:

# Features are the news titles; targets are the binarized ranks cast to strings
X = df["Title"]
y = df["Analyst_Rank"].astype(str)

# Hold out 10% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=21)

### Few-shot method
e.g.

In [36]:
# Intended number of few-shot examples per category.
# NOTE(review): currently unused - per-class sampling is disabled below, so
# every training row of every class is collected.
EX_PER_CAT = 4

# Collect example texts and labels class by class from the training split
ex_texts, ex_labels = [], []
train_classes = y_train.unique().tolist()
for class_label in train_classes:
    class_rows = y_train[y_train == class_label].index
    ex_texts.extend(X_train[class_rows].tolist())
    ex_labels.extend(y_train[class_rows].tolist())

print(f'Number of classes: {len(train_classes)}')
print(f'Total number of examples: {len(ex_texts)}')

Number of classes: 2
Total number of examples: 9


 Get classifications 

In [37]:
# Wrap every (title, label) pair of the training split in a Cohere Example
from cohere.classify import Example

examples = [Example(title, label) for title, label in zip(X_train, y_train)]

In [38]:
def classify_text(text, examples, model='medium'):
    """Classify a single text with the Cohere classify endpoint.

    Parameters
    ----------
    text : str
        The text to classify; it is sent as a one-element ``inputs`` list.
    examples : list
        Labelled few-shot examples, passed through unchanged to
        ``coh.classify``.
    model : str, optional
        Name of the Cohere model to use. Defaults to ``'medium'``, the value
        that used to be hard-coded (noted in the original as version
        medium-22020720).

    Returns
    -------
    The ``prediction`` attribute of the first (and only) classification
    returned for ``text``.
    """
    # `coh` is the module-level Cohere client created earlier in the notebook.
    response = coh.classify(
        model=model,
        inputs=[text],
        examples=examples,
    )
    # Exactly one input was sent, so the answer is the first classification.
    return response.classifications[0].prediction

In [39]:
# Classify each held-out title in turn, one API request per title
# (this will take a few minutes)
y_pred = [classify_text(title, examples) for title in X_test]

In [40]:

# Compute accuracy and weighted F1 on the test dataset.
# NOTE(review): the dataset has 10 rows and test_size=0.10, so the test split
# presumably holds a single title - each score can then only be 0 or 100.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

# Report both metrics as percentages with two decimals
print(f'Accuracy: {100*accuracy:.2f}')
print(f'F1-score: {100*f1:.2f}')

Accuracy: 100.00
F1-score: 100.00


`Embedding of the news title`

In [41]:
# Embed the training titles first, then the test titles, with identical
# settings (model "large", truncating over-long inputs from the left).
train_emb, test_emb = (
    coh.embed(texts=titles.tolist(), model="large", truncate="LEFT").embeddings
    for titles in (X_train, X_test)
)

In [None]:
# Show the first training title next to the start of its embedding vector.
# X_train keeps the original row labels after train_test_split, so the first
# row must be taken by position with .iloc[0]: the label lookup X_train[0]
# raises KeyError when row 0 landed in the test split, and otherwise may not be
# the row that train_emb[0] was computed from.
print(f"Review text: {X_train.iloc[0]}")
print(f"Embedding vector: {train_emb[0][:10]}")

Train a classifier using the training set

In [42]:
# import SVM classifier code
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Build a pipeline that standardizes the embedding features and then fits a
# support vector machine. class_weight='balanced' is used because the two
# Analyst_Rank classes are not equally represented (7 vs 3 rows in the full
# dataset); it weights classes inversely to their frequency in y_train.
svm_classifier = make_pipeline(StandardScaler(), SVC(class_weight='balanced'))

# fit the support vector machine on the training embeddings and labels
svm_classifier.fit(train_emb, y_train)

In [43]:
# Score the fitted pipeline on the held-out test embeddings and print it as a
# percentage. The message says "validation", but the data is the test split.
score = svm_classifier.score(test_emb, y_test)
print(f"Validation accuracy on Large is {100*score}%!")

Validation accuracy on Large is 100.0%!


In [44]:
# Rebuild the list of Cohere Examples from the training split.
# NOTE(review): this repeats the earlier cell that already creates `examples`.
from cohere.classify import Example

examples = [Example(title, label) for title, label in zip(X_train, y_train)]