In [68]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [69]:
import os, re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

In [70]:
# Load text files
def load_text_files(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    print(f"Scanning folder: {folder_path}")
    for filename in os.listdir(folder_path):
        if filename.lower() == "queries.txt":
            continue
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
                data[doc_id] = content
                doc_id_to_filename[doc_id] = filename
                print(f"Loaded doc_id {doc_id} -> {filename}")
                doc_id += 1

    print(f"Total documents loaded: {len(data)}")
    return data, doc_id_to_filename

In [71]:
# Text cleaning
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text)

    cleaned_tokens = [
        LEMMATIZER.lemmatize(word)
        for word in tokens
        if word not in STOPWORDS and len(word) > 1
    ]
    return " ".join(cleaned_tokens)

In [72]:
# Generate queries file
def generate_queries_file(output_file):

    queries = [
    "impact of chemical contaminants on marine mammals biomagnification",
    "eutrophication causes and effects on oceanic dead zones",
    "microplastics ingestion and tissue necrosis in coral reefs",
    "long-term effects of oil spills on seabird health",
    "ghost nets entanglement mortality sea turtles macroplastic debris",
    "POPs endocrine disrupting chemicals marine life reproduction",
    "nitrogen and phosphorus runoff sources coastal hypoxia",
    "microplastic contamination as pathogen and toxin vectors in marine organisms",
    "PAHs chronic toxicity marine mammals respiratory inflammation",
    "macroplastic debris ingestion starvation and false satiation in sea turtles",
    "global statistics marine plastic pollution entanglement and ingestion",
    "solutions to microplastic pollution in coastal waters",
    "biomarkers for heavy metal contamination in deep-sea fish",
    "marine debris policy and international agreements effectiveness",
    "role of zooplankton in microplastic transfer food web",
    "coastal area wastewater discharge effects on coral reef pathogens"
]

    with open(output_file, "w", encoding="utf-8") as f:
        for q in queries:
            f.write(q + "\n")

    print(f"\nGenerated queries.txt at: {output_file}")
    return queries

In [73]:
# Load Queries From File
def load_queries(query_file):
    queries = []
    print(f"\nLoading queries from: {query_file}")

    with open(query_file, "r", encoding="utf-8") as f:
        for line in f:
            q = line.strip()
            if q:
                queries.append(q)
                print(f"Loaded query: {q}")

    return queries

In [74]:
# Compute TF-IDF + Cosine Similarity
def compute_similarity(doc_texts, queries, doc_id_to_filename, output_file="similarity_results.txt"):
    print("\nComputing TF-IDF vectors...")

    vectorizer = TfidfVectorizer()

    doc_vectors = vectorizer.fit_transform(doc_texts)

    with open(output_file, "w", encoding="utf-8") as result_file:

        for query in queries:
            cleaned_query = clean_text(query)
            query_vector = vectorizer.transform([cleaned_query])
            scores = cosine_similarity(query_vector, doc_vectors)[0]
            ranking = sorted(
                list(enumerate(scores)),
                key=lambda x: x[1],
                reverse=True
            )
            print(f"\nQuery: {query}")
            result_file.write(f"\nQuery: {query}\n")
            result_file.write("="*60 + "\n")

            for rank, (doc_id, score) in enumerate(ranking, start=1):
                filename = doc_id_to_filename[doc_id]
                output_line = f"Rank {rank}: {filename}  |  Similarity = {score:.4f}"
                print(output_line)
                result_file.write(output_line + "\n")

            result_file.write("\n")

    print(f"\nSimilarity results saved to: {output_file}")

In [75]:
# Main Program
def main():

    folder_path = "/content/"
    query_file = os.path.join(folder_path, "queries.txt")

    generate_queries_file(query_file)

    data, doc_id_to_filename = load_text_files(folder_path)

    cleaned_docs = []
    for doc_id, content in data.items():
        cleaned = clean_text(content)
        cleaned_docs.append(cleaned)
        print(f"Cleaned Doc {doc_id}: {cleaned[:80]}...")

    queries = load_queries(query_file)

    compute_similarity(cleaned_docs, queries, doc_id_to_filename)

if __name__ == "__main__":
    main()


Generated queries.txt at: /content/queries.txt
Scanning folder: /content/
Loaded doc_id 0 -> Article 5.txt
Loaded doc_id 1 -> Article 4.txt
Loaded doc_id 2 -> Article 3.txt
Loaded doc_id 3 -> Article 2.txt
Loaded doc_id 4 -> Article 1.txt
Total documents loaded: 5
Cleaned Doc 0: article physical lethal harm macroplastic debris macroplastic debrislarge item l...
Cleaned Doc 1: article oil spill longterm effect bird mammal immediate mortality major oil spil...
Cleaned Doc 2: article microplastics impact coral reef ecosystem microplastics mp plastic fragm...
Cleaned Doc 3: article eutrophication formation oceanic dead zone eutrophication excessive enri...
Cleaned Doc 4: article chemical contaminant biomagnification apex predator exposure marine mamm...

Loading queries from: /content/queries.txt
Loaded query: impact of chemical contaminants on marine mammals biomagnification
Loaded query: eutrophication causes and effects on oceanic dead zones
Loaded query: microplastics ingestion and ti