In [None]:
import re
import os
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# ---------- CLEANING & EXTRACTION FUNCTIONS ----------

def extract_para(url, timeout=30):
    """Download a page and return every element whose CSS class is ``tat``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The result of ``soup.find_all(class_="tat")`` for the parsed page
        (empty when the page has no such element).

    Raises:
        requests.RequestException: On a connection failure, a timeout, or a
            4xx/5xx response.
    """
    response = requests.get(url, timeout=timeout)
    # Pause after every request (presumably to throttle load on the site);
    # kept ahead of the status check so failed requests are throttled too.
    time.sleep(3)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find_all(class_="tat")

def clean_paragraph(p):
    """Turn a paragraph tag into plain text.

    The opening ``<p ...>`` and closing ``</p>`` markers are replaced by
    spaces, newlines and tabs are replaced by spaces, and the result is
    trimmed. Any other markup inside the paragraph is left untouched.

    Args:
        p: A tag object or string; it is converted with ``str()`` first.

    Returns:
        The cleaned paragraph text.
    """
    p = str(p)
    # ``[^>]*`` stops at the first '>' so only the opening tag is removed.
    # A greedy ``.*`` would run to the LAST '>' on the line and wipe out the
    # whole paragraph whenever the closing tag sits on the same line.
    p = re.sub(r'<p(?:\s[^>]*)?>', " ", p)
    p = re.sub('</p>', " ", p)
    p = re.sub(r"\n|\t", " ", p)
    return p.strip()

def find_sloke_number(s):
    """Return the comma-separated items found after the first ``[`` in ``s``.

    Only the 24 characters following the bracket are examined. Reading stops
    at the first ``]`` in that window; if there is none, the whole window is
    used. The collected text is split on commas without trimming.

    Args:
        s: String to search.

    Returns:
        A list of the comma-separated pieces, or an empty list when ``s``
        has no ``[`` or nothing was collected after it.
    """
    open_at = s.find('[')
    if open_at < 0:
        return []
    # Slicing clamps at the end of the string, so no explicit bound is needed.
    window = s[open_at + 1:open_at + 25]
    inside, _, _ = window.partition(']')
    return inside.split(',') if inside else []

# ---------- MAIN SCRAPE FUNCTION ----------

def compile(url, folder_path, sarga_index):
    """Scrape one page into a DataFrame and save it as CSV when large enough.

    Each paragraph returned by ``extract_para`` is cleaned with
    ``clean_paragraph``. For paragraphs longer than 25 characters the last
    25 characters are passed to ``find_sloke_number``; shorter paragraphs
    get an empty list.

    Note: this function shadows Python's built-in ``compile`` in this
    namespace.

    Args:
        url: Page to scrape.
        folder_path: Directory the CSV is written to (created if missing).
        sarga_index: Number used in the output file name
            ``sarga<sarga_index>.csv``.

    Returns:
        A DataFrame with columns ``url``, ``slokenumber`` and ``text``. It
        is returned whether or not it was saved.
    """
    print(f"🔍 Scraping: {url}")

    rows = []
    for tag in extract_para(url):
        text = clean_paragraph(tag)
        numbers = find_sloke_number(text[-25:]) if len(text) > 25 else []
        rows.append((url, numbers, text))
    df = pd.DataFrame(rows, columns=["url", "slokenumber", "text"])

    # Pages with three rows or fewer are not written to disk.
    if len(df) <= 3:
        print(f"⚠️ Skipped {url} (less than 4 rows)")
        return df

    os.makedirs(folder_path, exist_ok=True)
    target = os.path.join(folder_path, f"sarga{sarga_index}.csv")
    df.to_csv(target, index=False)
    print(f"✅ Saved: {target}")
    return df

# ---------- SCRAPER PER KANDA ----------

def scrape_kanda(kanda,updatedkhanda=''):
    """Scrape the sarga pages of one kanda into ``scraped_sargas/<kanda>``.

    The kanda's index page is fetched to count the links that start with
    ``sarga``; page URLs are then built from the running number rather than
    taken from the links themselves.

    Args:
        kanda: Directory name of the kanda in the site URL; also used as the
            output sub-folder.
        updatedkhanda: Prefix of the page file name
            (``<updatedkhanda>sans<i>.htm``). Defaults to ``kanda`` when
            left empty.

    Raises:
        requests.RequestException: If the index page cannot be fetched.
            Failures on individual sarga pages are printed and skipped.
    """
    if updatedkhanda=='':
        updatedkhanda=kanda
    base_url = f"https://www.valmikiramayan.net/utf8/{kanda}"
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(base_url, timeout=30)
    # Without this an HTTP error page would just look like "0 sargas found".
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    # href=True skips anchors that have no href attribute; indexing a['href']
    # on those raises KeyError.
    links = [a['href'] for a in soup.find_all('a', href=True)]
    sargas = [x for x in links if x.startswith('sarga')]
    if kanda=='yuddha':
        # The first matching link is dropped for this kanda.
        # NOTE(review): presumably an extra non-chapter link — confirm.
        sargas=sargas[1:]
    folder_path = os.path.join("scraped_sargas", kanda)
    print(f"\n📖 Kanda: {kanda} | Sargas found: {len(sargas)}")

    # NOTE(review): this visits 1 .. len(sargas)-1, which is only complete if
    # the link list holds one more entry than there are sargas — confirm
    # against the index page before changing the bound.
    for i in range(1, len(sargas)):
        sarga_url = f"{base_url}/sarga{i}/{updatedkhanda}sans{i}.htm"
        try:
            compile(sarga_url, folder_path, i)
            time.sleep(1)
        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print(f"❌ Error scraping {sarga_url}: {e}")

# ----------- Run the scraper -----------
# One kanda is scraped per run; swap the active call to fetch another.
# The optional second argument is the page file-name prefix, needed when it
# differs from the kanda's directory name in the URL.
# scrape_kanda('baala','bala')
# scrape_kanda('kish','kishkindha')
# scrape_kanda('sundara')
# scrape_kanda('yuddha')
scrape_kanda('ayodhya')

In [None]:
import pandas as pd
import glob
import re
from sentence_transformers import SentenceTransformer, util


def load_ramayana_corpus(base_folder):
    """Collect the ``text`` column of every CSV file under ``base_folder``.

    Args:
        base_folder: Directory searched recursively for ``*.csv`` files.

    Returns:
        A list of strings: all non-missing ``text`` values, in sorted file
        path order and row order within each file.

    Raises:
        ValueError: If no CSV file is found under ``base_folder``.
        KeyError: If none of the files has a ``text`` column.
    """
    # glob returns paths in arbitrary order; sorting keeps the corpus order
    # (and therefore every index into it) reproducible between runs.
    files = sorted(glob.glob(f"{base_folder}/**/*.csv", recursive=True))
    if not files:
        # pd.concat would raise an unhelpful "No objects to concatenate".
        raise ValueError(f"No CSV files found under {base_folder!r}")
    df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    # read_csv may infer a numeric dtype for number-only cells; cast so that
    # callers always receive strings.
    return df["text"].dropna().astype(str).tolist()

# Sentence-embedding model shared by the corpus encoding and by fact_check(),
# so that statements and paragraphs are embedded in the same vector space.
model = SentenceTransformer('all-roberta-large-v1')


def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as well.
    """
    pieces = re.split(r'\s+', text)
    return ' '.join(pieces).strip()
# Load every scraped paragraph, normalise its whitespace and embed the corpus
# ONCE: encoding is the expensive step and fact_check() only needs a single
# copy of the embeddings.
# To restrict the corpus to one kanda, point the loader at its sub-folder,
# e.g. "scraped_sargas/baala".
corpus = [clean_text(t) for t in load_ramayana_corpus("scraped_sargas")]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# Step 4: Fact checking function
def fact_check(statement, threshold=0.64, top_k=10):
    """Label a statement by its best cosine similarity against the corpus.

    The statement is embedded with the module-level ``model`` and compared
    with ``corpus_embeddings``; the best score and the matching ``corpus``
    entry are printed.

    Args:
        statement: Sentence to check.
        threshold: The best score must be strictly greater than this for a
            "TRUE" verdict.
        top_k: Accepted but not used by the current implementation.

    Returns:
        The string "TRUE" or "FALSE" (upper case).
    """
    statement_vec = model.encode(statement, convert_to_tensor=True)
    similarities = util.cos_sim(statement_vec, corpus_embeddings)[0]

    top_score = similarities.max().item()
    top_index = similarities.argmax().item()

    print(f"\n🔍 Fact: {statement}")
    print(f"🔗 Top match score: {top_score:.4f}")
    print(f"📘 Closest match: {corpus[top_index]}")

    if top_score > threshold:
        return "TRUE"
    return "FALSE"

In [None]:
# Statements expected to be judged TRUE by fact_check().
# Every entry needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single statement.
true_statment = [
    "Rama is the eldest son of King Dasharatha.",
    "Sita was discovered by King Janaka in a furrow during ploughing and was later adopted by him.",
    "Lakshmana, Rama’s devoted younger brother, accompanied him into exile.",
    "Bharata, another brother of Rama, revered him and ruled as regent in his absence.",
    "Hanuman was the best minister of Sugreeva.",
    "Ravana, the demon king of Lanka, abducted Sita, setting in motion the events of the epic.",
    "On seeing Hanuman, Sita lost her consciousness for a long time.",
    "Nila, a general of the monkey army, was rendered unconscious by a mystic arrow shot by Ravana.",
    "Nala constructed a bridge for Rama to cross over to Lanka.",
    "The sage Vishwamitra mentored Rama during his early life, imparting knowledge of warfare and ritual.",
]


In [None]:
# Statements expected to be judged FALSE by fact_check().
false_statement = [
    "Sita herself built the bridge to Lanka with the help of the monkey army.",
    "Lakshmana was a disciple of Ravana and learned warfare techniques from him.",
    "Bharata was originally from Lanka and was later adopted by King Dasharatha.",
    "After meeting Sita, Hanuman lost his ability to speak entirely.",
    "Ravana is depicted solely as a peaceful poet who never engaged in any battles.",
    "Rama’s exile lasted only one year rather than the traditionally recounted fourteen years.",
    "Sita and Rama were married in Lanka with Ravana acting as the priest for their wedding.",
    "Lakshmana is not mentioned at all in the Valmiki Ramayana.",
    "Bharata built a grand palace in Lanka following Rama’s victory over Ravana.",
    "Hanuman was the son of Ravana.",
]

# Every statement in this list is false, so a "TRUE" verdict is a false
# positive; those statements are printed.
for statement in false_statement:
    # fact_check() returns the upper-case strings "TRUE"/"FALSE"; comparing
    # against 'True' could never match.
    if fact_check(statement) == "TRUE":
        print(statement)

