# Classification

In [2]:
import pandas as pd
from transformers import pipeline
from collections import defaultdict

In [4]:
# Load the classifiers
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", truncation=True)
topic_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", truncation=True)

# Define possible categories for topic classification
categories = ["money", "politics", "crime", "technology", "health", "education", "environment", "sports", "entertainment"]

# Load dataset (adjust the file path and delimiter as needed)
df = pd.read_csv("../../dataset/processed/topics_formatted.csv", sep=",")  # Adjust file path and delimiter as needed

# Extract document numbers and lemmas
doc_numbers = df['Dataset']  # Assuming the first column is named 'Doc'
lemmas = df['Sentence']  # Assuming the column is named 'Lemma'

# Initialize dictionary for storing merged results
merged_results = defaultdict(lambda: {"Emotions": set(), "Topics": set()})

# Process each  
print("Processing lemmas...")
for i, (doc_number, lemma) in enumerate(zip(doc_numbers, lemmas), 1):
    # Split the lemma column into individual lemmas (assume comma-separated)
    individual_lemmas = lemma.split(" ")

    for single_lemma in individual_lemmas:
        # Classify emotion
        emotion_predictions = emotion_classifier(single_lemma.strip())
        dominant_emotion = emotion_predictions[0]['label']  # Most probable emotion

        # Classify topic
        topic_predictions = topic_classifier(single_lemma.strip(), candidate_labels=categories)
        dominant_topic = topic_predictions['labels'][0]  # Most probable topic

        # Merge results for the same document number
        merged_results[doc_number]["Emotions"].add(dominant_emotion)
        merged_results[doc_number]["Topics"].add(dominant_topic)

    # Print progress every 10 lemmas (adjust frequency as needed)
    if i % 10 == 0 or i == len(lemmas):
        print(f"Processed {i}/{len(lemmas)} lemmas.", end="\r")

# Prepare the final results DataFrame
final_results = []
for doc_number, data in merged_results.items():
    final_results.append({
        "Doc": doc_number,
        "Emotions": " | ".join(data["Emotions"]),
        "Topics": " | ".join(data["Topics"])
    })

final_results_df = pd.DataFrame(final_results)

# Save results to a CSV file
final_results_df.to_csv("../../dataset/processed/classified_lemmas_merged.csv", index=False)
print("Classified lemmas saved to 'classified_lemmas_merged.csv'.")

# Display the first few rows of the results DataFrame
print(final_results_df.head())


Device set to use cpu
Device set to use cpu


Processing lemmas...
Classified lemmas saved to 'classified_lemmas_merged.csv'.
   Doc                          Emotions  \
0    0                 neutral | sadness   
1    1                   neutral | anger   
2    2  anger | fear | neutral | sadness   
3    3                   neutral | anger   
4    4            fear | neutral | anger   

                                              Topics  
0  crime | environment | health | entertainment |...  
1  technology | crime | environment | entertainme...  
2  technology | crime | environment | health | ed...  
3        money | crime | entertainment | environment  
4  crime | environment | health | education | ent...  
