In [2]:
!pip install ollama -q

In [3]:
import ollama
import pandas as pd
import re
import os

In [4]:
# Relative paths of the CSV files used by the pipeline: the input posts,
# the final labelled output, and the intermediate checkpoint.
FILEPATH_ORIGIN = r"../data/cl_posts_date_isr.csv"
OUTPUT_PATH = r"../data/post_isr_result.csv"
CHECKPOINT_PATH = r"../data/checkpoint.csv"
# URL of the Ollama server (loopback address); the shared `client` below is
# the object `classify` uses to send chat requests.
ollama_host = "http://127.0.0.1:11434" 
client = ollama.Client(host=ollama_host)

In [5]:
def messages_for(user_message):
    """Build the chat messages asking the model to classify one post.

    Args:
        user_message: The post to classify; it is interpolated at the end of
            the user prompt.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: a system message
        describing the classifier and a user message holding the labelling
        instructions, the keyword hints and the post itself.
    """
    keywords = [
        "israel", "gaza", "hamas", "palestine", "lebanon",
        "hisbollah", "hezbollah", "iran", "netanyahu", "rafah", "karim khan",
        "netanyahu arrest", "international criminal court", "netanyahu warrant",
        "hostage", "october 7", "ceasefire", "al-sinwar", "genocide",
        "war crime", "the hague", "nasrallah", "gaza strip", "united nations",
        "the bibi files", "qatar", "moral responsibility",
        "yoav gallant", "donald trump", "israel release", "bibas", "israel bus", "israel hostage deal",
        "israel gaza ceasefire", "the times of israel", "israel deal", "israeli hostages released"
    ]
    keyword_hint = ", ".join(keywords)

    system_prompt = "".join([
        "You are a highly accurate text classifier specializing in detecting references to Israel ",
        "or the Middle East conflict in German-language social media posts, particularly on Telegram. ",
        "Base your classification strictly on the content of the post, avoiding assumptions beyond what is explicitly stated.",
    ])

    # The pieces are concatenated in order; the post always comes last.
    user_prompt = "".join([
        "Classify the following Telegram post strictly based on its content:\n\n",
        "Label it with '1' if the post references Israel, Palestine, Hamas, or the broader Middle East conflict, ",
        "including political, military, or humanitarian aspects about Israel and Palestine.\n",
        "Label it with '0' if it is about a different topic.\n",
        "If the classification is unclear, respond ONLY with 'Uncertain'.\n\n",
        "Consider the following keywords as strong indicators that the post is about the Israel-Middle East conflict:\n",
        keyword_hint,
        "\n\n",
        "However, do not rely solely on keywords—evaluate the full context of the message.\n\n",
        f"Post:\n{user_message}",
    ])

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]


In [6]:
def classify(messages, model="llama3.2:latest"):
    """Send ``messages`` to the chat model and return the text of its reply.

    Args:
        messages: Chat messages, e.g. the list built by ``messages_for``.
        model: Name of the model passed to the module-level ``client``.

    Returns:
        The value stored under ``["message"]["content"]`` in the chat response.
    """
    reply = client.chat(model, messages)
    return reply["message"]["content"]

In [7]:
def parse_output(response):
    """Reduce a raw model reply to one of the labels used by the pipeline.

    Args:
        response: Text returned by the model.

    Returns:
        ``"Uncertain"`` if the reply contains the word "uncertain" in any
        casing, otherwise ``"0"`` or ``"1"`` for the first standalone 0/1 in
        the reply, otherwise ``"invalid"`` (also for non-string input).
    """
    if not isinstance(response, str):
        return "invalid"

    # "Uncertain" takes precedence over any digit and is returned in one
    # canonical spelling so the label column has a single value for it.
    if re.search(r"uncertain", response, flags=re.IGNORECASE):
        return "Uncertain"

    # Only a standalone 0 or 1 is a valid label; digits that are part of a
    # longer number ("2024", "10") or other digits ("October 7") are ignored.
    label = re.search(r"\b[01]\b", response)
    if label:
        return label.group()

    return "invalid"

In [8]:
def read_data(filepath):
    """Load the CSV at ``filepath`` into a DataFrame.

    ``index_col=None`` is passed to ``pd.read_csv`` so that no column of the
    file is used as the index.
    """
    return pd.read_csv(filepath, index_col=None)

In [9]:
def add_labels(data, labels):
    """Attach ``labels`` to ``data`` as the column ``"About Israel"``.

    Labels are matched to rows by position: the first label goes to the first
    row regardless of the frame's index. If there are fewer labels than rows
    the remaining rows get NaN; surplus labels are dropped.

    Args:
        data: DataFrame to annotate. It is modified in place.
        labels: Iterable of labels, in row order.

    Returns:
        The same ``data`` object, with the new column added.
    """
    # Pad/truncate to the frame's length, then adopt the frame's own index so
    # the assignment cannot misalign rows when the index is not 0..n-1.
    labels_col = pd.Series(list(labels)).reindex(range(len(data)))
    labels_col.index = data.index
    data["About Israel"] = labels_col
    return data

In [10]:
def save_data(outpath, data):
    """Write ``data`` to ``outpath`` as CSV without the index column.

    Saving is best-effort: a failure is reported on stdout together with the
    underlying error instead of being raised.

    Args:
        outpath: Destination path of the CSV file.
        data: DataFrame to write.
    """
    try:
        data.to_csv(outpath, index=False)
    except Exception as error:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate, and show the cause.
        print("There was a problem. You might want to check your data or filepath")
        print(f"Details: {type(error).__name__}: {error}")
    else:
        print("Data saved successfully")

In [11]:
def _load_checkpoint_labels(checkpoint_path, raw_messages):
    """Return the labels stored in a checkpoint that matches ``raw_messages``.

    The checkpoint is only accepted when its posts are exactly the first rows
    of ``raw_messages``; otherwise (missing, unreadable, wrong columns, or
    different posts) an empty list is returned so the run starts from scratch.
    """
    if not checkpoint_path or not os.path.exists(checkpoint_path):
        return []

    try:
        # Read everything as text so labels like "0"/"1" stay strings.
        checkpoint = pd.read_csv(checkpoint_path, dtype=str, keep_default_na=False)
    except (pd.errors.EmptyDataError, pd.errors.ParserError):
        print("Checkpoint could not be read and will be ignored")
        return []

    has_columns = {"Post Text", "Label"}.issubset(checkpoint.columns)
    if not has_columns or len(checkpoint) > len(raw_messages):
        print("Checkpoint does not match the input data and will be ignored")
        return []

    def as_text(value):
        # Missing posts are written to CSV as empty fields; compare them as "".
        return "" if pd.isna(value) else str(value)

    saved_posts = checkpoint["Post Text"].to_list()
    if any(as_text(saved) != as_text(raw) for saved, raw in zip(saved_posts, raw_messages)):
        print("Checkpoint does not match the input data and will be ignored")
        return []

    return checkpoint["Label"].to_list()


def _write_checkpoint(checkpoint_path, raw_messages, parsed_labels, start):
    """Persist the rows ``start .. len(parsed_labels)`` to the checkpoint file."""
    # The first write of a run that starts from zero replaces any stale file;
    # later writes append only the rows that are not in the file yet.
    first_write = start == 0
    rows = pd.DataFrame({
        "Post Text": raw_messages[start:len(parsed_labels)],
        "Label": parsed_labels[start:],
    })
    rows.to_csv(checkpoint_path, mode="w" if first_write else "a", index=False, header=first_write)


def classification_pipe(inpath, outpath, checkpoint_path=None, checkpoint_every=10):
    """Classify every post of the input CSV and save the labelled data.

    Posts are read from the ``"Post Text"`` column of ``inpath``, classified
    one by one, and the labels are written to ``outpath`` in the column added
    by ``add_labels``. If ``checkpoint_path`` is given, progress is stored
    there and a later run continues after the last stored post instead of
    classifying everything again.

    Args:
        inpath: Path of the CSV file with the posts.
        outpath: Path of the CSV file to write the labelled data to.
        checkpoint_path: Optional path of the checkpoint CSV.
        checkpoint_every: Number of newly classified posts between two
            checkpoint writes.

    Raises:
        ValueError: If ``checkpoint_every`` is smaller than 1.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be at least 1")

    # Always load the original data: it is needed for the final output even
    # when part of the work is taken from a checkpoint.
    data_origin = read_data(inpath)
    raw_messages = data_origin["Post Text"].to_list()

    parsed_labels = _load_checkpoint_labels(checkpoint_path, raw_messages)
    saved_count = len(parsed_labels)  # rows already present in the checkpoint file
    if saved_count:
        print(f"Resuming from checkpoint: {saved_count} posts already classified")

    for post in raw_messages[saved_count:]:
        print(f"Processing post: {post}")
        messages = messages_for(post)
        model_response = classify(messages)
        parsed_response = parse_output(model_response)
        print(f"Model response: {model_response} was recognized as {parsed_response}")

        parsed_labels.append(parsed_response)

        if checkpoint_path and len(parsed_labels) - saved_count >= checkpoint_every:
            _write_checkpoint(checkpoint_path, raw_messages, parsed_labels, saved_count)
            saved_count = len(parsed_labels)

    # Store the final partial batch as well so the checkpoint is complete.
    if checkpoint_path and saved_count < len(parsed_labels):
        _write_checkpoint(checkpoint_path, raw_messages, parsed_labels, saved_count)

    data = add_labels(data_origin, parsed_labels)
    save_data(outpath, data)


In [12]:
# Run the classification with the input, output and checkpoint paths defined above.
classification_pipe(FILEPATH_ORIGIN, OUTPUT_PATH, CHECKPOINT_PATH)

Processing post: BREAKING NEWS

Reiner Fuellmich in U-Haft

Alle Informationen um 20 Uhr - LIVE

Dlive
https://dlive.tv/bitteltv

VK
https://vk.com/bitteltv

GETTR
https://gettr.com/user/bitteltv

KICK
https://kick.com/bittel-tv

Twitter
https://twitter.com/bitteltv

Twitch
https://www.twitch.tv/bitteltv

TEILEN - mehr Infos bei:
@BITTELTV - EINFACH ANDERS
Model response: 0 was recognized as 0
Processing post: Pfizer 💥 Vertrag-Leak: Der größte Skandal der Medizingeschichte

Universitätsdozent Dr. Hannes Strasser war Impfarzt! Er sitzt in der österr. Ärztekammer und hat sich den ungeschwärzten Vertrag genau angesehen:

▪️ Dass die vorliegenden Verträge von der EU-Kommission und den Regierungen der EU-Mitgliedsstaaten unterschrieben wurden, ist für mich als Arzt unfassbar. Das ist meiner Meinung nach der größte Skandal in der Medizingeschichte. 

▪️ Man muss sich das alles vorstellen: ein Impfstoff, dessen Wirksamkeit und Sicherheit nicht bekannt ist, dessen Produktion mit großen Risiken