**Accessing via Huggingface API**

In [None]:
pip install huggingface_hub



In [None]:
pip install pandas requests



In [None]:
import pandas as pd
import requests
import time

# -------------------------------------------------
# CONFIGURATION
# -------------------------------------------------
import os  # imported here because this cell's import block does not bring in `os`

# The Hugging Face token is read from the HF_API_KEY environment variable so
# that no secret is stored in the notebook. Export it before starting the kernel.
HF_API_KEY = os.environ.get("HF_API_KEY", "")
if not HF_API_KEY:
    print("Warning: HF_API_KEY is not set; requests to the Inference API will not be authenticated.")
MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"

# CSV splits to process
csv_files = ["/content/VAST_train.csv"]

# Dataset description (Turkish) that is interpolated into every prompt
# sent to the model.
DATASET_DESCRIPTION = "VAST isimli, 2020 yılında oluşturulmuş bu dataset, The New York Times’in ‘Room for Debate’ bölümündeki yorumların etiketlenmesiyle hazırlanmış olup, özellikle tutum (stance) tespiti görevleri için tasarlanmıştır. Politika (örneğin, 'Filistin devleti'), eğitim (örneğin, 'imtiyazlı okullar') ve halk sağlığı (örneğin, 'çocukluk aşıları') gibi geniş temaları kapsayan çeşitli konuları kapsar. Ayrıca, ‘kampüste silahlar’ ile ‘kampüste ateşli silahlar’ gibi benzer ifadelerin bir arada bulunduğu örnekleri de içerir."

# -------------------------------------------------
# HELPER: Generate 'Context' text from model
# -------------------------------------------------
def generate_context(text, target, timeout=120):
    """
    Ask the Hugging Face Inference API to analyse `text` with respect to `target`.

    Args:
        text: Passage to analyse; interpolated into the prompt as-is.
        target: Stance target; interpolated into the prompt as-is.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the whole
            row-by-row loop indefinitely.

    Returns:
        The "generated_text" field of the API response, or an empty string
        when the request fails, the status code is not 200, the body is not
        valid JSON, or the payload has no "generated_text" field.
    """
    # Build prompt
    prompt = f"""Sosyal bir uzman olduğunu varsayarak, aşağıda {DATASET_DESCRIPTION} datasetinin kısa bir pasajı verilmiştir, lütfen adım adım düşün, metindeki anahtar kelimeleri çıkar, yazarın ima ettiği duyguları, retorik araçları vb. analiz et, son olarak yazarın Target hakkındaki duruşunu kısaca analiz et, sonuca varmadan analiz sürecini vermeye dikkat et. Ayrıca açıklama 100 kelimeyi geçmesin ve hiçbir şekilde link içermesin.
Passage: {text}
Target: {target}
"""

    # Prepare JSON payload for the HF Inference API
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 500,
        }
    }

    headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "application/json"
    }
    HF_API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

    # A network failure on one row must not abort the whole run, so it is
    # reported and mapped to the same empty result as an HTTP error.
    try:
        response = requests.post(HF_API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed, {exc}")
        return ""

    # If no success, handle errors
    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return ""

    # A 200 response with a non-JSON body raises a ValueError subclass.
    try:
        data = response.json()
    except ValueError as exc:
        print(f"Error: invalid JSON in response, {exc}")
        return ""

    # data is typically [{"generated_text": "..."}]
    if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
        return data[0]["generated_text"]
    elif isinstance(data, dict) and "generated_text" in data:
        return data["generated_text"]

    return ""

# -------------------------------------------------
# MAIN: For each CSV, add a "Context" column
# -------------------------------------------------
for csv_file in csv_files:
    print(f"Processing: {csv_file}")
    df = pd.read_csv(csv_file, encoding="utf-8")

    # One model call per row; the responses become the new "Context" column.
    # For large datasets a short pause between calls (e.g. time.sleep(1))
    # can be added here to stay under rate limits.
    context_list = [
        generate_context(target=row["Target"], text=row["Text"])
        for _, row in df.iterrows()
    ]
    df["Context"] = context_list

    # Write the augmented frame next to the input file.
    output_name = csv_file.replace(".csv", "_with_context.csv")
    df.to_csv(output_name, index=False, encoding="utf-8-sig")
    print(f"Saved: {output_name}")


**Accessing Locally**

In [1]:
pip install torch transformers accelerate sentencepiece

Collecting torchNote: you may need to restart the kernel to use updated packages.

  Using cached torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Using cached transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting setuptools (from torch)
  Using cached set

In [2]:
# Initialise Git LFS, then clone the model repository from the Hugging Face Hub
# into ./Hermes-3-Llama-3.1-8B (the download is several GiB).
!git lfs install
!git clone https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-8B

Updated Git hooks.
Git LFS initialized.


Cloning into 'Hermes-3-Llama-3.1-8B'...
Filtering content:  50% (2/4)
Filtering content:  50% (2/4), 1.66 GiB | 1.44 MiB/s
Filtering content:  75% (3/4), 1.66 GiB | 1.44 MiB/s
Filtering content:  75% (3/4), 2.32 GiB | 2.98 MiB/s
Filtering content: 100% (4/4), 2.32 GiB | 2.98 MiB/s
Filtering content: 100% (4/4), 2.95 GiB | 4.38 MiB/s
Filtering content: 100% (4/4), 2.95 GiB | 4.90 MiB/s, done.


In [4]:
pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2024.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM

############################
# CONFIGURATION
############################
MODEL_PATH = "C:/Users/nuref/Desktop/Stance_Detection_Datasets/LLM_Context/Hermes-3-Llama-3.1-8B"  # local folder with the model
INPUT_CSV = "C:/Users/nuref/Desktop/Stance_Detection_Datasets/VAST/VAST_dev.csv"                   # input CSV with 'Text' and 'Target' columns
OUTPUT_CSV = "C:/Users/nuref/Desktop/Stance_Detection_Datasets/VAST/VAST_dev_with_context.csv"     # output CSV with new 'Context' column

# Dataset description (Turkish) interpolated into every prompt. The pieces are
# joined by implicit string concatenation, so each piece must carry its own
# trailing space; the second piece previously lacked one, which produced
# "...'Filistin devleti'),eğitim..." in the prompt.
DATASET_DESCRIPTION = (
    "VAST isimli, 2020 yılında oluşturulmuş bu dataset, The New York Times’in ‘Room for Debate’ bölümündeki yorumların "
    "etiketlenmesiyle hazırlanmış olup, özellikle tutum (stance) tespiti görevleri için tasarlanmıştır. Politika (örneğin, 'Filistin devleti'), "
    "eğitim (örneğin, 'imtiyazlı okullar') ve halk sağlığı (örneğin, 'çocukluk aşıları') gibi geniş temaları kapsayan çeşitli konuları kapsar. "
    "Ayrıca, ‘kampüste silahlar’ ile ‘kampüste ateşli silahlar’ gibi benzer ifadelerin bir arada bulunduğu örnekleri de içerir."
)

############################
# LOAD MODEL LOCALLY
############################
# Load the tokenizer first, then the model weights, both from MODEL_PATH.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

print("Loading model...")
_model_load_options = {
    "torch_dtype": torch.float16,  # or bfloat16, etc. adjust as needed
    "device_map": "auto",          # automatically map to GPU or CPU
}
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **_model_load_options)

############################
# GENERATION FUNCTION
############################
def generate_context(text, target):
    """
    Generate a short analysis of `text` with respect to `target` using the
    locally loaded model.

    Args:
        text: Passage to analyse; interpolated into the prompt as-is.
        target: Stance target; interpolated into the prompt as-is.

    Returns:
        The text generated after the prompt, truncated to at most 100
        whitespace-separated words and stripped of surrounding whitespace.
        Sampling is enabled, so repeated calls may return different text.
    """
    # Prompt: no links, under 100 words
    prompt = f"""
Sosyal bir uzman olduğunu varsayarak, aşağıda {DATASET_DESCRIPTION} datasetinden
kısa bir pasaj verilmiştir. Lütfen adım adım düşün, metindeki anahtar kelimeleri çıkar;
yazarın ima ettiği duyguları, retorik araçları vb. analiz et;
ve son olarak yazarın Target hakkındaki duruşuna dair kısa bir analiz yap, sonuca varmadan analiz sürecini vermeye dikkat et.
Ayrıca açıklama 100 kelimeyi geçmesin ve hiçbir şekilde link içermesin.

Passage: {text}
Target: {target}
"""

    # Encode prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Number of prompt tokens actually fed to the model. Taken from the
    # encoded inputs instead of tokenizing the prompt a second time.
    prompt_len = inputs["input_ids"].shape[-1]

    # Generate text
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,        # limit generation
            do_sample=True
        )

    # Decode only the tokens that follow the prompt; the full sequence was
    # previously decoded as well but never used.
    response = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

    # The model may ignore the 100-word instruction, so enforce it here.
    response_words = response.split()
    if len(response_words) > 100:
        response = " ".join(response_words[:100])

    return response.strip()

############################
# MAIN FLOW
############################
def main():
    """
    Read INPUT_CSV, generate a 'Context' value for every row via
    generate_context, and write the augmented table to OUTPUT_CSV.

    Raises:
        ValueError: if the input CSV lacks a 'Text' or 'Target' column.
    """
    print(f"Reading input CSV: {INPUT_CSV}")
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")

    # Both columns feed the prompt; rename here if the file uses other names.
    required_columns = ("Text", "Target")
    if any(column not in df.columns for column in required_columns):
        raise ValueError("Input CSV must have 'Text' and 'Target' columns.")

    # One generation per row, with a progress line every tenth row.
    contexts = []
    for idx, record in df.iterrows():
        passage = str(record["Text"])
        stance_target = str(record["Target"])
        contexts.append(generate_context(passage, stance_target))
        processed = idx + 1
        if processed % 10 == 0:
            print(f"Processed {processed} rows...")

    df["Context"] = contexts

    # Persist the result
    print(f"Saving to {OUTPUT_CSV}")
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    print("Done!")

if __name__ == "__main__":
    main()


In [4]:
import os
import requests
import pandas as pd

############################
# CONFIGURATION
############################
# Instead of local model paths, we have an endpoint URL:
INFERENCE_ENDPOINT_URL = "https://m8zx5wkcgrqynj2p.us-east-1.aws.endpoints.huggingface.cloud"  # <-- your HF Inference Endpoint URL

# The token is read from the HF_API_TOKEN environment variable so that no
# secret is stored in the notebook. Export it before starting the kernel.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
if not HF_API_TOKEN:
    print("Warning: HF_API_TOKEN is not set; requests to the endpoint will not be authenticated.")

INPUT_CSV = "C:/Users/nuref/Desktop/Stance_Detection_Datasets/VAST/VAST_dev.csv"
OUTPUT_CSV = "C:/Users/nuref/Desktop/Stance_Detection_Datasets/VAST/VAST_dev_with_context.csv"

# Dataset description (Turkish) interpolated into every prompt.
DATASET_DESCRIPTION = (
    "VAST isimli, 2020 yılında oluşturulmuş bu dataset, The New York Times’in ‘Room for Debate’ bölümündeki "
    "yorumların etiketlenmesiyle hazırlanmış olup, özellikle tutum (stance) tespiti görevleri için tasarlanmıştır. "
    "Politika (örneğin, 'Filistin devleti'), eğitim (örneğin, 'imtiyazlı okullar') ve halk sağlığı (örneğin, 'çocukluk aşıları') "
    "gibi geniş temaları kapsayan çeşitli konuları kapsar. Ayrıca, ‘kampüste silahlar’ ile ‘kampüste ateşli silahlar’ "
    "gibi benzer ifadelerin bir arada bulunduğu örnekleri de içerir."
)

############################
# GENERATION FUNCTION
############################
def generate_context(text, target, timeout=120):
    """
    Ask the remote Inference Endpoint to analyse `text` with respect to `target`.

    Args:
        text: Passage to analyse; interpolated into the prompt as-is.
        target: Stance target; interpolated into the prompt as-is.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the whole
            row-by-row loop indefinitely.

    Returns:
        The "generated_text" field of the response, truncated to at most 100
        whitespace-separated words and stripped. An empty string is returned
        when the request fails, the status code is not 200, the body is not
        valid JSON, or the payload has no "generated_text" field.
    """
    prompt = f"""
Sosyal bir uzman olduğunu varsayarak, aşağıda {DATASET_DESCRIPTION} datasetinden
kısa bir pasaj verilmiştir. Lütfen adım adım düşün, metindeki anahtar kelimeleri çıkar;
yazarın ima ettiği duyguları, retorik araçları vb. analiz et;
ve son olarak yazarın Target hakkındaki duruşuna dair kısa bir analiz yap, sonuca varmadan analiz sürecini vermeye dikkat et.
Ayrıca açıklama 100 kelimeyi geçmesin ve hiçbir şekilde link içermesin.

Passage: {text}
Target: {target}
"""

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True
        }
    }

    headers = {
        "Authorization": f"Bearer {HF_API_TOKEN}",
        "Content-Type": "application/json"
    }

    # POST request to your HF Endpoint. A network failure on one row must not
    # abort the whole run, so it is reported and mapped to an empty result.
    try:
        response = requests.post(INFERENCE_ENDPOINT_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        print("Error: request failed,", exc)
        return ""  # return empty on error

    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        return ""  # return empty on error

    # A 200 response with a non-JSON body raises a ValueError subclass.
    try:
        data = response.json()
    except ValueError as exc:
        print("Error: invalid JSON in response,", exc)
        return ""

    # Typically, data might be a list of dicts with "generated_text" or something similar
    # e.g. [{"generated_text": "..."}]
    if isinstance(data, list) and len(data) > 0 and "generated_text" in data[0]:
        generated_text = data[0]["generated_text"]
    elif isinstance(data, dict) and "generated_text" in data:
        generated_text = data["generated_text"]
    else:
        generated_text = ""

    # The model may ignore the 100-word instruction, so enforce it here.
    words = generated_text.split()
    if len(words) > 100:
        generated_text = " ".join(words[:100])
    return generated_text.strip()

############################
# MAIN FLOW
############################
def main():
    """
    Read INPUT_CSV, request a 'Context' value for every row via
    generate_context, and write the augmented table to OUTPUT_CSV.

    Raises:
        ValueError: if the input CSV lacks a 'Text' or 'Target' column.
    """
    print(f"Reading input CSV: {INPUT_CSV}")
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")

    # Both columns are needed to build the prompt.
    missing_columns = {"Text", "Target"}.difference(df.columns)
    if missing_columns:
        raise ValueError("Input CSV must have 'Text' and 'Target' columns.")

    # One endpoint call per row, with a progress line every tenth row.
    contexts = []
    for idx, record in df.iterrows():
        passage = str(record["Text"])
        stance_target = str(record["Target"])
        contexts.append(generate_context(passage, stance_target))
        if not (idx + 1) % 10:
            print(f"Processed {idx+1} rows...")

    df["Context"] = contexts

    print(f"Saving to {OUTPUT_CSV}")
    df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    print("Done!")

if __name__ == "__main__":
    main()


Reading input CSV: C:/Users/nuref/Desktop/Stance_Detection_Datasets/VAST/VAST_dev.csv
Processed 10 rows...


KeyboardInterrupt: 