In [1]:
!pip install transformers[sentencepiece]
import pandas as pd
from transformers import pipeline
from google.colab import files



In [2]:
# Upload CSV file
uploaded = files.upload()
filename = list(uploaded.keys())[0]
print(f"File {filename} uploaded successfully.")

# Load dataset
df = pd.read_csv(filename, delimiter=',')  # Adjust delimiter if needed
print("Original dataset preview:")
print(df.head())

# Check for empty 'name' fields -> something with preprocessing went wrong, the conversation continues after the missing values so the order is still the same
df= df[~(df['name'].isna() | (df['name'].str.strip() == ''))]

print(df.head())

Saving classified_output.csv to classified_output.csv
File classified_output.csv uploaded successfully.
Original dataset preview:
         name  deltas                                               body  \
0     t3_test     0.0   This question is a philosophical quagmire, an...   
1  t1_cd3i9qu     0.0   Would you submit this CMV if you had to attac...   
2  t1_cd395uz     0.0   Identity theft is already a problem that can ...   
3  t1_cd3jgud     1.0   While anonymity would have its benefits, I be...   
4  t1_cd39ev8     0.0   If we surrender anonymity is will allow for e...   

   story  agency  event_sequencing  world_making  
0      0       0                 0             0  
1      0       0                 0             0  
2      0       0                 0             0  
3      0       0                 0             1  
4      0       0                 0             0  
         name  deltas                                               body  \
0     t3_test     0.0   This qu

In [3]:
# Keep only rows with all required columns present
required_columns = ['name', 'agency', 'event_sequencing', 'world_making', 'story', 'body']
df_clean = df[required_columns].dropna()
print(f"\nFiltered dataset preview ({len(df_clean)} rows retained):")
print(df_clean.head())


Filtered dataset preview (47809 rows retained):
         name  agency  event_sequencing  world_making  story  \
0     t3_test       0                 0             0      0   
1  t1_cd3i9qu       0                 0             0      0   
2  t1_cd395uz       0                 0             0      0   
3  t1_cd3jgud       0                 0             1      0   
4  t1_cd39ev8       0                 0             0      0   

                                                body  
0   This question is a philosophical quagmire, an...  
1   Would you submit this CMV if you had to attac...  
2   Identity theft is already a problem that can ...  
3   While anonymity would have its benefits, I be...  
4   If we surrender anonymity is will allow for e...  


In [4]:
# Set up zero-shot classification
hypothesis_template = "This example is about {}"
classes_verbalized = ["politics", "economy", "entertainment", "sports"]

zeroshot_classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:
from tqdm import tqdm

# Helper function for topic prediction
def get_topic_label(result, threshold=0.5):
    if result["scores"][0] < threshold:
        return "other"
    return result["labels"][0]

topics = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Classifying"):
    body = row["body"]
    result = zeroshot_classifier(
        body,
        classes_verbalized,
        hypothesis_template=hypothesis_template,
        multi_label=False
    )
    topic = get_topic_label(result)
    topics.append(topic)

df["topic"] = topics

# Save result
df.to_csv("classified_output_topics.csv", index=False)
print("Saved to classified_output_topics.csv")

Classifying:   0%|          | 0/47809 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Classifying:   0%|          | 10/47809 [00:03<2:03:17,  6.46it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Classifying: 100%|██████████| 47809/47809 [1:30:09<00:00,  8.84it/s]


Saved to classified_output_topics.csv
