In [None]:
import torch

# Report whether a CUDA device is visible to PyTorch and, if so, which one.
# torch.cuda.get_device_name() raises when no GPU is attached (e.g. a
# CPU-only runtime), so the name is only queried after the availability check.
cuda_available = torch.cuda.is_available()
print("CUDA available?", cuda_available)
if cuda_available:
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("Device name: none (running on CPU)")


CUDA available? True
Device name: NVIDIA A100-SXM4-40GB
NVIDIA A100-SXM4-40GB


In [None]:
# 1) Install kaggle, prompt for API token
!pip install --quiet kaggle

import os
if not os.path.exists('/root/.kaggle/kaggle.json'):
    from google.colab import files
    print("Please upload your kaggle.json API token:")
    files.upload()  # upload your kaggle.json here

# 2) Move token into place
os.makedirs('/root/.kaggle', exist_ok=True)
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

# 3) Download only train.csv and test.csv from the AG‑News dataset
!kaggle datasets download -d amananandrai/ag-news-classification-dataset --quiet
!unzip -o ag-news-classification-dataset.zip


Dataset URL: https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset
License(s): unknown
Archive:  ag-news-classification-dataset.zip
  inflating: test.csv                
  inflating: train.csv               
-rw-r--r-- 1 root root 1.8M Apr 20  2020 test.csv
-rw-r--r-- 1 root root  28M Apr 20  2020 train.csv


In [None]:
!pip install --upgrade pip --quiet
!pip install --quiet transformers torch scikit-learn huggingface_hub[hf_ext] ipywidgets
!jupyter nbextension enable --py widgetsnbextension --sys-prefix


[0mEnabling notebook extension jupyter-js-widgets/extension...
Paths used for configuration of notebook: 
    	/usr/etc/jupyter/nbconfig/notebook.json
Paths used for configuration of notebook: 
    	
      - Validating: [32mOK[0m
Paths used for configuration of notebook: 
    	/usr/etc/jupyter/nbconfig/notebook.json


In [None]:
# Shared imports for the feature-extraction cells below.
import os
# Set before `transformers` is imported.
# NOTE(review): presumably meant to silence progress bars — confirm that the
# installed transformers version actually reads this variable.
os.environ['TRANSFORMERS_NO_TQDM'] = '1'

import pickle  # used to cache the fitted TF-IDF vectorizer on disk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline


In [None]:
# — USER PARAMETERS & IMPORTS —
import os, pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

input_csv = 'test.csv'           # or 'train.csv'
output_csv = 'test_features.csv' # final output
n_tfidf = 5                      # number of top TF-IDF keywords kept per row
tfidf_vectorizer_path = 'tfidf_vectorizer.pkl'  # on-disk cache of the fitted vectorizer
MAX_VOCAB_SIZE = 20000           # max_features passed to TfidfVectorizer
ZS_BATCH_SIZE = 16               # batch size of the zero-shot pipeline
candidate_labels = ["World", "Sports", "Business", "Sci/Tech"]

# — 1) LOAD & RENAME —
# rename() returns a new frame, so re-running this cell always starts from the
# CSV on disk instead of mutating a frame left over from a previous run.
df = pd.read_csv(input_csv).rename(columns={
    'Class Index': 'label',
    'Title'      : 'title',
    'Description': 'description'
})

# Missing titles/descriptions are treated as empty strings for feature
# extraction; the original columns are left untouched.
title_text = df['title'].fillna('')
description_text = df['description'].fillna('')
df['text'] = title_text + ' ' + description_text

# — 2) “Easy” feature: uppercase count —
# Counted on the NaN-free titles: iterating over a raw NaN (a float) would
# raise TypeError, whereas a missing title now simply counts as 0.
df['uppercase_char_count'] = title_text.apply(
    lambda s: sum(1 for c in s if c.isupper())
)

print("Step 1 complete — df has columns:", df.columns.tolist())
# You can peek at df.head() or even save an intermediate if you like:
# df.to_pickle('step1_df.pkl')



Step 1 complete — df has columns: ['label', 'title', 'description', 'text', 'uppercase_char_count']


In [None]:
# — 3) TF‑IDF top‑N keywords —
# Reuse a previously fitted vectorizer when one is cached on disk; otherwise
# fit on the current frame and cache the result.
# NOTE(review): the cache is not tied to `input_csv`, so whichever file is
# processed first determines the vocabulary/IDF weights for every later run.
if os.path.exists(tfidf_vectorizer_path):
    with open(tfidf_vectorizer_path, 'rb') as f:
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only load a vectorizer pickle that this notebook wrote itself.
        vectorizer = pickle.load(f)
    tfidf_matrix = vectorizer.transform(df['text'])
else:
    vectorizer = TfidfVectorizer(max_features=MAX_VOCAB_SIZE)
    tfidf_matrix = vectorizer.fit_transform(df['text'])
    with open(tfidf_vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)

feature_names = vectorizer.get_feature_names_out()

def top_n_keywords(idx):
    """Return up to `n_tfidf` highest-weighted TF-IDF terms of row `idx`.

    Only terms that actually occur in the row are considered, so a text with
    fewer than `n_tfidf` distinct in-vocabulary terms yields a shorter list
    instead of being padded with arbitrary zero-weight vocabulary words.

    Args:
        idx: Positional row index into `tfidf_matrix`.

    Returns:
        A list of feature names ordered by descending TF-IDF weight.
    """
    # Read the stored (non-zero) entries of the CSR row directly instead of
    # materialising a dense vector of vocabulary size for every row.
    start, end = tfidf_matrix.indptr[idx], tfidf_matrix.indptr[idx + 1]
    weights = tfidf_matrix.data[start:end]
    columns = tfidf_matrix.indices[start:end]
    best = weights.argsort()[::-1][:n_tfidf]
    return [feature_names[col] for col in columns[best]]

df['tfidf_top_keywords'] = [top_n_keywords(i) for i in range(len(df))]

print("Step 2 complete — added tfidf_top_keywords")
# Optional: save again if you want to checkpoint
# df.to_pickle('step2_df.pkl')


Step 2 complete — added tfidf_top_keywords


In [None]:
import torch
from transformers import pipeline

# — 4) Batched zero‑shot topic scoring (GPU when available) —
# Use the first CUDA device when present and fall back to CPU (-1) otherwise,
# so the cell no longer crashes on a runtime without a GPU.
zs_device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    framework='pt',
    device=zs_device,
    batch_size=ZS_BATCH_SIZE
)

outs = classifier(df['text'].tolist(), candidate_labels)
if isinstance(outs, dict):
    # Defensive: normalise a bare single-result dict to a list.
    outs = [outs]

# Each result lists its labels sorted by score, so the label order differs from
# row to row. Pinning the columns to `candidate_labels` keeps the output layout
# stable, and reusing df's index keeps rows aligned in the concat below.
topic_df = pd.DataFrame(
    [dict(zip(o['labels'], o['scores'])) for o in outs],
    columns=candidate_labels,
    index=df.index,
)
# Drop score columns left over from a previous run of this cell so that
# re-running it does not produce duplicate columns.
df = pd.concat(
    [df.drop(columns=candidate_labels, errors='ignore'), topic_df],
    axis=1,
)

# — 5) SAVE —
df.to_csv(output_csv, index=False)
print(f"All done — saved features to {output_csv}")


Device set to use cuda:0


✅ All done — saved features to test_features.csv


In [None]:
from google.colab import files

# Download the file written by the save step. Using `output_csv` (set in the
# parameters cell) instead of a hard-coded name keeps this cell correct when
# the output path is changed.
files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>