In [None]:
#installing datasets
!pip install -q datasets
#import
import string
from datasets import load_dataset
import pandas as pd
# Load the AG News dataset by name through `datasets.load_dataset`.
ag_news = load_dataset("ag_news")

# Show which splits the returned object exposes.
print(ag_news.keys())

# Build one DataFrame per split with a `text` and a `label` column.
df_train = pd.DataFrame({'text': ag_news['train']['text'], 'label': ag_news['train']['label']})
df_test = pd.DataFrame({'text': ag_news['test']['text'], 'label': ag_news['test']['label']})

# Class balance of the training split. A bare expression in the middle of a
# cell is evaluated and thrown away, so the counts are printed explicitly.
print(df_train['label'].value_counts())
# Text standardisation
def standardize_text(text):
    """Lower-case a Series of strings and strip escaped entities and punctuation.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (anything exposing the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        Same index as ``text``; each value lower-cased, with numeric character
        references (``#39;`` / ``&#39;``) removed and every character that is
        neither a word character nor whitespace removed.
    """
    lowered = text.str.lower()
    # Remove numeric character references *before* the punctuation pass.
    # Otherwise only the "#" and ";" are stripped and the digits survive as a
    # junk token: "google #39;s" would become "google 39s". The frequent
    # `39s` token in this notebook's top-word output looks like that artifact.
    without_entities = lowered.str.replace(r'&?#\d+;', '', regex=True)
    # Drop everything that is not a word character or whitespace.
    return without_entities.str.replace(r'[^\w\s]', '', regex=True)
# Clean both splits with the same routine so train and test text are
# normalised identically. Note: this overwrites the raw `text` column.
df_train['text'] = standardize_text(df_train['text'])
df_test['text'] = standardize_text(df_test['text'])

# Preview the cleaned training text. A bare mid-cell expression would be
# discarded, so the preview is printed explicitly.
print(df_train.head(3))
# Bag-of-words counts: English stop words are removed, unigrams and bigrams
# are extracted, and the vocabulary is capped at 5000 features.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)

# The vocabulary is learned from the training text only; the test text is
# projected onto that same vocabulary.
x_train_counts = vectorizer.fit_transform(df_train['text'])
x_test_counts = vectorizer.transform(df_test['text'])

print(x_train_counts.shape, x_test_counts.shape)
# Fit multinomial Naive Bayes on the count features, then label the test split.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

clf = MultinomialNB()
train_labels = df_train['label']
clf.fit(x_train_counts, train_labels)

# Predicted class ids for every test document.
y_pred = clf.predict(x_test_counts)
# Evaluate the count-based model on the test split.
# Human-readable class names come from the dataset's label feature.
target_names = ag_news['train'].features['label'].names
y_true = df_test['label']

print("Accuracy Score :\n ", accuracy_score(y_true, y_pred))
print("Classification Report : \n", classification_report(y_true, y_pred, target_names=target_names))
# Rows are true classes, columns are predicted classes.
print("Confusion matrix :\n", confusion_matrix(y_true, y_pred))

# For each class, list the 10 features with the highest log P(feature | class).
# This ranks by likelihood under the class, so a term that is common
# everywhere can appear in the list of several classes.
feature_names = vectorizer.get_feature_names_out()
log_probs = clf.feature_log_prob_

print("Top 10 words for each class:")
for class_name, class_log_probs in zip(target_names, log_probs):
    # argsort is ascending: reverse it and keep the first ten indices.
    top_idx = class_log_probs.argsort()[::-1][:10]
    words = []
    for j in top_idx:
        words.append(feature_names[j])
    print(f"Top words for {class_name}: {', '.join(words)}")


# Persist the fitted classifier and its vectorizer with joblib.
# Both files are needed together: the vectorizer turns raw text into the
# feature matrix the classifier was trained on.
import joblib

nb_model_path = "ag_news_nb_model.pkl"
count_vectorizer_path = "ag_news_vectorizer.pkl"

joblib.dump(clf, nb_model_path)
# Kept as the last expression so the cell still displays the written file name.
joblib.dump(vectorizer, count_vectorizer_path)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

dict_keys(['train', 'test'])
(120000, 5000) (7600, 5000)
Accuracy Score :
  0.8881578947368421
Classification Report : 
               precision    recall  f1-score   support

       World       0.90      0.89      0.89      1900
      Sports       0.94      0.97      0.95      1900
    Business       0.86      0.83      0.85      1900
    Sci/Tech       0.85      0.86      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600

Confusion matrixcon :
 [[1693   68   93   46]
 [  30 1844   12   14]
 [  82   17 1578  223]
 [  85   36  144 1635]]
Top 10 words for each class:
Negative influential words:
said 39s ap iraq reuters president new afp minister killed
Positive influential words:
39s ap new game win team season ap ap night world


['ag_news_vectorizer.pkl']

# **AG News classification using TfidfVectorizer**

In [None]:
#import
import string
from datasets import load_dataset
import pandas as pd
import numpy as np # Import numpy with the alias np
# Load the AG News dataset by name through `datasets.load_dataset`.
# This repeats the load from the first cell, which keeps this cell self-contained.
ag_news = load_dataset("ag_news")

# Show which splits the returned object exposes.
print(ag_news.keys())

# Build one DataFrame per split with a `text` and a `label` column.
df_train = pd.DataFrame({'text': ag_news['train']['text'], 'label': ag_news['train']['label']})
df_test = pd.DataFrame({'text': ag_news['test']['text'], 'label': ag_news['test']['label']})

# Class balance of the training split. A bare expression in the middle of a
# cell is evaluated and thrown away, so the counts are printed explicitly.
print(df_train['label'].value_counts())
# Text standardisation
def standardize_text(text):
    """Lower-case a Series of strings and strip escaped entities and punctuation.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (anything exposing the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        Same index as ``text``; each value lower-cased, with numeric character
        references (``#39;`` / ``&#39;``) removed and every character that is
        neither a word character nor whitespace removed.
    """
    lowered = text.str.lower()
    # Remove numeric character references *before* the punctuation pass.
    # Otherwise only the "#" and ";" are stripped and the digits survive as a
    # junk token: "google #39;s" would become "google 39s". The frequent
    # `39s` token in this notebook's top-word output looks like that artifact.
    without_entities = lowered.str.replace(r'&?#\d+;', '', regex=True)
    # Drop everything that is not a word character or whitespace.
    return without_entities.str.replace(r'[^\w\s]', '', regex=True)
# Clean both splits with the same routine so train and test text are
# normalised identically. Note: this overwrites the raw `text` column.
df_train['text'] = standardize_text(df_train['text'])
df_test['text'] = standardize_text(df_test['text'])

# Preview the cleaned training text. A bare mid-cell expression would be
# discarded, so the preview is printed explicitly.
print(df_train.head(3))
# TF-IDF features: English stop words are removed, unigrams and bigrams are
# extracted, and the vocabulary is capped at 5000 features.
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebinds `vectorizer`: from here on the name refers to the TF-IDF vectorizer,
# not the CountVectorizer fitted in the first cell.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)

# Vocabulary and IDF weights are learned from the training text only; the
# test text is transformed with those fitted values.
x_train_tfidf = vectorizer.fit_transform(df_train['text'])
x_test_tfidf = vectorizer.transform(df_test['text'])

print(x_train_tfidf.shape, x_test_tfidf.shape)
# Fit multinomial Naive Bayes on the TF-IDF features, then label the test split.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

clf1 = MultinomialNB()
train_labels = df_train['label']
clf1.fit(x_train_tfidf, train_labels)

# Predicted class ids for every test document.
y_pred_tfidf = clf1.predict(x_test_tfidf)
# Evaluate the TF-IDF model on the test split.
# Human-readable class names come from the dataset's label feature.
target_names = ag_news['train'].features['label'].names
y_true = df_test['label']

print("Accuracy Score :\n ", accuracy_score(y_true, y_pred_tfidf))
print("Classification Report : \n", classification_report(y_true, y_pred_tfidf, target_names=target_names))
# Rows are true classes, columns are predicted classes.
print("Confusion matrix :\n", confusion_matrix(y_true, y_pred_tfidf))

# For each class, list the 10 features with the highest log P(feature | class).
# This ranks by likelihood under the class, so a term that is common
# everywhere can appear in the list of several classes.
feature_names = vectorizer.get_feature_names_out()
log_probs = clf1.feature_log_prob_

print("Top 10 words for each class:")
for class_name, class_log_probs in zip(target_names, log_probs):
    # argsort is ascending: reverse it and keep the first ten indices.
    top_idx = class_log_probs.argsort()[::-1][:10]
    words = []
    for j in top_idx:
        words.append(feature_names[j])
    print(f"Top words for {class_name}: {', '.join(words)}")





# Record the library versions this run used.
import sklearn, pandas, numpy

# pandas and numpy are reported through the `pd` / `np` aliases imported at the
# top of the cell.
for package_label, module in (("scikit-learn", sklearn), ("pandas", pd), ("numpy", np)):
    print(f"{package_label}:", module.__version__)

# Persist the fitted TF-IDF classifier and its vectorizer with joblib.
# Both files are needed together: the vectorizer turns raw text into the
# feature matrix the classifier was trained on.
import joblib

tfidf_model_path = "ag_news_tfidf_model.pkl"
tfidf_vectorizer_path = "ag_news_tfidf_vectorizer.pkl"

joblib.dump(clf1, tfidf_model_path)
# Kept as the last expression so the cell still displays the written file name.
joblib.dump(vectorizer, tfidf_vectorizer_path)

dict_keys(['train', 'test'])
(120000, 5000) (7600, 5000)
Accuracy Score :
  0.89
Classification Report : 
               precision    recall  f1-score   support

       World       0.90      0.89      0.90      1900
      Sports       0.94      0.97      0.95      1900
    Business       0.86      0.84      0.85      1900
    Sci/Tech       0.86      0.86      0.86      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600

Confusion matrixcon :
 [[1693   65   93   49]
 [  29 1843   13   15]
 [  83   18 1591  208]
 [  78   35  150 1637]]
Top 10 words for each class:
Top words for World: iraq, ap, said, reuters, president, 39s, afp, killed, minister, iraqi
Top words for Sports: 39s, ap, game, win, team, season, night, cup, victory, league
Top words for Business: oil, 39s, reuters, said, new, prices, company, percent, profit, stocks
Top words for Sci/Tech: microsoft, new, sof

['ag_news_tfidf_vectorizer.pkl']