### WebScraping with LLMs

In [79]:
%pip install -qU langchain-groq

In [80]:
import requests
import json
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq
from google.colab import userdata
import getpass
import os
if "GROQ_API_KEY" not in os.environ:
    try:
        os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')
    except Exception as e:
        os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")


In [81]:
# Configure the Groq client
llm_Groq = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

def fetch_page(url):
    """Performs a GET request on the page and returns its content."""
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        return None

def clean_html(html):
    """Removes HTML tags and returns only the raw text."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n").strip()

def extract_info(content, task):
    """Uses Groq LLaMA 3.1 to extract specific information."""
    prompt = f"""
    I will provide you with the content of a news article. Extract only the {task} without any additional text.

    Article:
    {content[:4000]}  # Limit the length to avoid errors

    Expected result:
    """
    response = llm_Groq.invoke(prompt)
    return response.content.strip()

def scrape_news(url):
    """Scrapes the page and uses agents to extract information."""
    content = fetch_page(url)
    if not content:
        return {"error": "Unable to retrieve content"}

    clean_content = clean_html(content)
    title = extract_info(clean_content, "article title")
    text = extract_info(clean_content, "full article text")
    date_author = extract_info(clean_content, "publication date and author, british format - MM/DD/YYYY")

    return {
        "title": title,
        "text": text,
        "date_author": date_author
    }

# Esempio di utilizzo
url = "https://www.ilmessaggero.it/italia/terremoto_campi_flegrei_oggi_napoli_sciame_sismico_scuole_evacuate_fuorigrotta-8637855.html"
data = scrape_news(url)
print(json.dumps(data, indent=2, ensure_ascii=False))


{
  "title": "Terremoto Napoli Campi Flegrei, registrate oltre 40 scosse: la più forte da 3.1. Scuole evacuate a Fuorigrotta",
  "text": "Terremoto Napoli Campi Flegrei, registrate oltre 40 scosse: la più forte da 3.1. Scuole evacuate a Fuorigrotta\n\n Il sisma è stato avvertito in tantissime zone della città (Pianura, Fuorigrotta) e province (Quarto, Arco Felice)\n\n3 Minuti di Lettura\n\nMercoledì  5 Febbraio 2025, 09:19                            \n - Ultimo aggiornamento: 11:41\n\nOre 18\n - \nNewsletter\n\nIl punto serale sulle notizie del giorno\n\n\nIscriviti\n e ricevi le notizie via email\n\nTerremoto\n ai \nCampi Flegrei\n. Un nuovo sciame sismico ha fatto tremare ancora una volta la zona di Napoli. La scossa più forte fino ad ora registrata è stata di 3.1 alle ore 8.52 alla profondità di 2.7 km con epicentro zona Pisciarelli ad Agnano. Il sisma è stato sentito in tantissime zone della città (Pianura, Fuorigrotta) e province (Quarto, Arco Felice).  #",
  "date_author": "02/05

## Dataset Augementation with LLMs

In [82]:
!git clone https://github.com/usc-isi-i2/logical-fallacy-identification.git




Cloning into 'logical-fallacy-identification'...
remote: Enumerating objects: 5208, done.[K
remote: Counting objects: 100% (1045/1045), done.[K
remote: Compressing objects: 100% (395/395), done.[K
remote: Total 5208 (delta 704), reused 978 (delta 641), pack-reused 4163 (from 1)[K
Receiving objects: 100% (5208/5208), 228.95 MiB | 11.21 MiB/s, done.
Resolving deltas: 100% (1611/1611), done.


In [83]:
import pandas as pd
import random
from langchain_groq import ChatGroq
max_length = 100
df = pd.read_csv("logical-fallacy-identification/data/fallacy_train.csv")[["sentence", "fine_class"]]
# Filter out sentences that are too long
df_filtered = df[df["sentence"].str.len() <= max_length]
df_filtered.head()

Unnamed: 0,sentence,fine_class
1,All writing teachers must be awesome because M...,faulty generalization
2,"If we teach anton how to drive the car, he’ll ...",faulty generalization
3,Labor unions in America are just as bad as Rus...,fallacy of relevance
4,America: love it or leave it. Either you’re f...,false dilemma
5,claiming that an idea or belief is true simply...,ad populum


In [84]:
# Select N random rows
N = 5  # Change this value to select more or fewer rows
df_sample = df_filtered.sample(N)

In [85]:
llm_Groq = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

In [86]:
# Function to generate a news-like text incorporating the fallacious sentence
def generate_news(sentence, fine_class):
    prompt = f"""Write a short news article (3-4 sentences) that naturally incorporates the following fallacious statement.
    The statement must be included exactly as it is, without quotation marks. Ensure it is well integrated into the news context.
    \nStatement: {sentence}\nFallacy type: {fine_class}\n\nNews article:"""
    return llm_Groq.invoke(prompt).content

# Generate news articles for the selected sentences
results = []
for _, row in df_sample.iterrows():
    news = generate_news(row["sentence"], row["fine_class"])
    results.append({"sentence": row["sentence"], "fine_class": row["fine_class"], "generated_news": news})

# Create a DataFrame with the results
df_results = pd.DataFrame(results)

# Print the results
for _, row in df_results.iterrows():
    print(f"Fallacy: {row['fine_class']}")
    print(f"Sentence: {row['sentence']}")
    print(f"Generated News: {row['generated_news']}\n")

Fallacy: fallacy of logic
Sentence: All judges are fair-minded individuals;  therefore, Judge Ito is fair in his decisions.
Generated News: A recent study has found that the number of appeals in the US court system has decreased significantly since the implementation of a new judicial training program aimed at improving judges' decision-making skills. The program, which focuses on empathy and critical thinking, has been credited with reducing the number of disputed verdicts. All judges are fair-minded individuals; therefore, Judge Ito is fair in his decisions. As a result, many are hailing the program as a major success in promoting fairness and justice in the American judicial system.

Fallacy: fallacy of logic
Sentence: Good people don't lie. You told a lie. Therefore, you are not a good person.
Generated News: A local politician has come under fire after being accused of misrepresenting their voting record in a recent campaign speech. Critics argue that the politician's actions demo

In [87]:
df_results.head()

Unnamed: 0,sentence,fine_class,generated_news
0,All judges are fair-minded individuals; there...,fallacy of logic,A recent study has found that the number of ap...
1,Good people don't lie. You told a lie. Therefo...,fallacy of logic,A local politician has come under fire after b...
2,We should stop using hairspray because it is s...,fallacy of logic,A severe winter storm has hit the northeastern...
3,“Live with me or live on the streets”,false dilemma,A new homeless shelter in downtown Los Angeles...
4,Using a double standard or arguing for an unju...,intentional,A recent proposal to increase funding for loca...


### Simple Hallucination Detection Framework

In [90]:
from difflib import SequenceMatcher

In [91]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

def check_hallucination(text, k=2, similarity_threshold=0.1):
    llm_Groq = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

    # Get original response
    original_response = llm_Groq.invoke(text).content
    print("Original Response:", original_response)

    # Generate additional responses with high temperature
    llm_Groq_varied = ChatGroq(model="llama-3.1-8b-instant", temperature=1)
    responses = []
    for i in range(k):
        response = llm_Groq_varied.invoke(text).content
        responses.append(response)
        print(f"Response {i+1} with high temperature:", response)

    # Check similarity
    similarity_inconsistency = any(similar(original_response, resp) < similarity_threshold for resp in responses)

    # Model consistency check
    inconsistency_check_prompt = f"""Given the following prompt and responses, are there any inconsistencies?
    Prompt: '{text}'
    Original response: '{original_response}'
    Additional responses:
    {' '.join([f'Response {i+1}: {resp}' for i, resp in enumerate(responses)])}

    Respond with only 'True' if there are inconsistencies, or 'False' if there are no inconsistencies."""

    model_inconsistency = llm_Groq.invoke(inconsistency_check_prompt).content.strip().lower() == 'true'

    print("Model inconsistency check result:", "True" if model_inconsistency else "False")

    return "Potential hallucination detected." if (similarity_inconsistency or model_inconsistency) else "No hallucination detected."

In [92]:
prompts = [
        "Provide academic paper citations about the impact of moonlight on deep-sea coral growth",
        "10+15*2"
    ]
for i, prompt in enumerate(prompts, 1):
    print(f"\\nChecking prompt {i}: {prompt}")
    result = check_hallucination(prompt)
    print(f"Final result of hallucination check for prompt {i}: {result}\\n")

\nChecking prompt 1: Provide academic paper citations about the impact of moonlight on deep-sea coral growth
Original Response: Unfortunately, I couldn't find any specific academic papers that directly address the impact of moonlight on deep-sea coral growth. However, I can provide some relevant studies that discuss the effects of light on deep-sea corals and the importance of lunar cycles in marine ecosystems.

1. **Hochberg, E. J., & Butman, C. A. (1985).** "Deep-sea coral reefs: Upwelling centers and associated food webs on the Pacific outer shelf." Journal of Marine Research, 43(2), 251-275. (This study discusses the importance of upwelling and associated food webs on deep-sea coral reefs, but does not specifically address the impact of moonlight.)

2. **Roberts, J. M., et al. (2009).** "Ecological characteristics of deep-water coral reefs." Current Biology, 19(16), R129-R134. (This review article discusses the ecological characteristics of deep-water coral reefs, including the imp