In [None]:
import os
import json

def parse_json(data):
    parsed_data = {}

    # Extracting common fields
    if 'paper_id' in data:
        parsed_data['paper_id'] = data['paper_id']
    if 'title' in data:
        parsed_data['title'] = data['title']
    if 'year' in data:
        parsed_data['year'] = data['year']
    if 'abstract' in data:
        parsed_data['abstract'] = data['abstract']

    if 'authors' in data:
        authors = []
        for author in data['authors']:
            author_info = {
                'first_name': author.get('first', ''),
                'middle_name': author.get('middle', ''),
                'last_name': author.get('last', ''),
                'affiliation': author.get('affiliation', {}).get('institution', ''),
                'location': author.get('affiliation', {}).get('location', {}).get('settlement', ''),
                'region': author.get('affiliation', {}).get('location', {}).get('region', ''),
                'email': author.get('email', '')
            }
            authors.append(author_info)
        parsed_data['authors'] = authors

    if 'pdf_parse' in data and 'keywords' in data['pdf_parse']:
        parsed_data['keywords'] = data['pdf_parse']['keywords']

    if 'pdf_parse' in data and 'abstract' in data['pdf_parse']:
        parsed_data['abstract_breakdown'] = []
        for abstract_text in data['pdf_parse']['abstract']:
            parsed_data['abstract_breakdown'].append(abstract_text['text'])

    if 'pdf_parse' in data and 'body_text' in data['pdf_parse']:
        parsed_data['body_text'] = []
        for text in data['pdf_parse']['body_text']:
            parsed_data['body_text'].append(text['text'])

    if 'pdf_parse' in data and 'back_matter' in data['pdf_parse']:
        parsed_data['back_matter'] = []
        for text in data['pdf_parse']['back_matter']:
            parsed_data['back_matter'].append(text['text'])

    if 'pdf_parse' in data and 'ref_entries' in data['pdf_parse']:
        ref_entries = data['pdf_parse']['ref_entries']
        parsed_data['ref_entries'] = {}
        for key, value in ref_entries.items():
            if 'type_str' in value and value['type_str'] == 'table':
                parsed_data['ref_entries'][key] = value['content']

    return parsed_data

# Path to the folder containing JSON files
folder_path = r"C:\Users\Asus\Downloads\assignementdataset\assignementdataset"
output_folder = r"C:\Users\Asus\Downloads\assignementdataset\parsed_files"

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Iterate over JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)

        # Load and parse the JSON file
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            parsed_data = parse_json(data)

            # Save the parsed data to a new file
            output_file_path = os.path.join(output_folder, f"parsed_{filename}")
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                json.dump(parsed_data, output_file, indent=2)


In [1]:
!pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.13 (from langchain-community)
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.27 (from langchain-community)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.23.2-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [2]:
!pip install faiss-gpu


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
!pip install langchain transformers




In [4]:
!pip install langchain transformers




In [5]:
!pip install langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2


In [6]:
!pip install --upgrade langchain



In [7]:
!pip install langchain>=0.0.189  # upgrade langchain

In [8]:
!pip install faiss-gpu # Install FAISS GPU version



In [14]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import torch
import os
import json

# Check if a GPU is available and set the device to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device set to use {device}")

# Initialize embeddings model
embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load and preprocess corpus
directory_path = 'data'
corpus = []
for filename in os.listdir(directory_path):
    if filename.endswith('.json'):
        with open(os.path.join(directory_path, filename), 'r') as file:
            data = json.load(file)
            if isinstance(data, dict) and "body_text" in data:
                corpus.extend(data["body_text"])
#Translation



# Create FAISS vector store
vector_store = FAISS.from_texts(corpus, embeddings_model)

# Load Question-Answering model and pipeline
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
hf_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

# Define chatbot interaction
def chatbot(query):
    # Retrieve relevant documents
    retrieved_docs = vector_store.similarity_search(query, k=1)
    if not retrieved_docs:
        return "I couldn't find any relevant information."

    # Use the first retrieved document as context
    context = retrieved_docs[0].page_content[:500]  # Truncate context for safety
    print(f"Retrieved context: {context}")  # Debugging

    # Pass question and context to the Hugging Face QA pipeline
    try:
        result = hf_pipeline(question=query, context=context)
        if not result["answer"].strip() or result["answer"] in context:
            return "I couldn't find a satisfactory answer. Could you rephrase your question?"
        return result["answer"]
    except ValueError as e:
        return f"Error: {str(e)}"


# Run chatbot
while True:
    query = input("Ask a question about reproductive medicine (type 'exit' to quit): ")
    if query.lower() == "exit":
        break
    print(f"Answer: {chatbot(query)}")


Device set to use cuda


Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Ask a question about reproductive medicine (type 'exit' to quit): what is sterility ?
Retrieved context: Fertility and Sterility®
Answer: I couldn't find a satisfactory answer. Could you rephrase your question?
Ask a question about reproductive medicine (type 'exit' to quit): what is maternal anxiety about pregnancy ?
Retrieved context: For many of the primary outcomes (maternal anxiety and perinatal loss) in this review, the authors recognised a lack of trial evidence.
Answer: I couldn't find a satisfactory answer. Could you rephrase your question?
Ask a question about reproductive medicine (type 'exit' to quit): how can anexiety affect preganancy ?
Retrieved context: Modifiable lifestyle factors such as weight, diet, alcohol intake, ca eine intake, physical activity, smoking, and other substance abuse may a ect the chance of people with infertility having a live birth (Homan 2007 Rooney 2014) .Research suggests that these factors may have important e ects both during the preconceptio

KeyboardInterrupt: Interrupted by user

In [17]:
!apt-get install git -y



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [18]:
!git config --global user.name "nermine01"
!git config --global user.email "nermine.haouala@esprit.tn"


In [20]:
!git clone https://github.com/nermine01/rep-medecine


Cloning into 'rep-medecine'...


In [22]:
%cd rep-medecine


/content/rep-medecine


In [None]:
!cp /content/your_notebook_name.ipynb .
