In [None]:
import os
from openai import OpenAI

# Read the GitHub Models access token from the environment; a missing
# variable raises KeyError here instead of failing later inside a request.
token = os.environ["GITHUB_TOKEN"]

# Fail fast when OPENAI_API_KEY is absent, but never echo either secret:
# anything printed is stored in the notebook's cell output and gets shared
# or committed together with the file.
if "OPENAI_API_KEY" not in os.environ:
    raise KeyError("OPENAI_API_KEY")

endpoint = "https://models.inference.ai.azure.com"
model_name = "gpt-4o"

# OpenAI-compatible client pointed at the endpoint above, authenticated with
# the GitHub token.
client = OpenAI(
    base_url=endpoint,
    api_key=token,
)

# Smoke-test request: one system message plus one user question.
response = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": "What is the capital of England?",
        }
    ],
    temperature=0,
    top_p=1.0,
    max_tokens=1000,
    model=model_name
)


In [1]:

from langchain_openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator
from IPython.display import display, Markdown



In [155]:
# `OpenAI` here is the langchain_openai class imported in the imports cell
# above; it shadows the `openai.OpenAI` client imported in the first cell.
llm = OpenAI(
    model=model_name,
    temperature=0,
)

In [43]:
## get the json data into a langchain dataloader
from langchain.document_loaders import JSONLoader
import json

SOURCE_PATH = "../authors_with_h_index.json"
RESTRICTED_PATH = "authors_with_h_index_restricted.json"
MAX_AUTHORS = 100  # size of the subset used for this first experiment

with open(SOURCE_PATH, encoding="utf-8") as f:
    authors = json.load(f)

# Keep only the first MAX_AUTHORS authors. (The previous comment said 1000,
# but the slice has always been 100.)
authors = authors[:MAX_AUTHORS]

# Save the restricted list to its own JSON file so JSONLoader can read it
# from disk.
with open(RESTRICTED_PATH, "w", encoding="utf-8") as f:
    json.dump(authors, f)

# Serialized form of the same subset. The file that was just written is not
# read back: a JSON round trip of data that itself came from json.load yields
# an equal object, so the reload added nothing.
authors_string = json.dumps(authors)

# NOTE(review): jq_schema "." selects the whole top-level value, so the
# loader appears to produce a single document holding the entire array;
# ".[]" would give one document per author -- confirm which is intended.
jq_schema = "."
json_loader = JSONLoader(RESTRICTED_PATH, jq_schema=jq_schema, text_content=False)


In [44]:
documents = json_loader.load()

# Print a bounded preview of each document rather than its full text:
# dumping the complete page_content floods the cell output and stores every
# record in the saved notebook.
PREVIEW_CHARS = 500
print(f"Loaded {len(documents)} document(s)")
for doc in documents:
    content = doc.page_content  # the JSON data as a string
    suffix = "..." if len(content) > PREVIEW_CHARS else ""
    print(content[:PREVIEW_CHARS] + suffix)


[{"profile_name": "Adel Trabelsi", "profile_link": "https://scholar.google.com/citations?hl=zh-TW&user=atpUzjMAAAAJ", "profile_affiliations": "University Tunis-El Manar, Tunis, Tunisia", "profile_email": " cern.ch ", "profile_city_by_count": " 25786 ", "profile_interests": ["Physics", "Nuclear", "High Energy Physics"], "hindex": 86, "hindex5y": 28, "i10index": 201, "i10index5y": 74}, {"profile_name": "Chedly Abdelly", "profile_link": "https://scholar.google.com/citations?hl=zh-TW&user=_62kqDAAAAAJ", "profile_affiliations": "Center of Biotechnology of Borj-Cedria, CBBC", "profile_email": " cbbc.rnrt.tn ", "profile_city_by_count": " 21135 ", "profile_interests": ["Ecophysiology", "Halophytes", "Abiotic constraints", "Bioactive substances"], "hindex": 72, "hindex5y": 56, "i10index": 336, "i10index5y": 282}, {"profile_name": "Moncef nasri", "profile_link": "https://scholar.google.com/citations?hl=zh-TW&user=1g45JHwAAAAJ", "profile_affiliations": "Professor, LGEM Laboratory, ENIS, Universit

In [2]:
# Embedding client for "text-embedding-3-large", requesting 1024 dimensions.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
)


In [3]:
# Show the repr of the configured embeddings object (the API key appears
# masked in it).
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001C47FB99E90>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001C47F97E850>, model='text-embedding-3-large', dimensions=1024, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base='https://models.inference.ai.azure.com', openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [45]:
## create a vectorstore
embeddings = OpenAIEmbeddings(model="text-embedding-3-large",
                              dimensions=1024)

index = VectorstoreIndexCreator(
    embedding=embeddings,
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([json_loader])

In [46]:
# Ask the index a question; the answer text lands in `response`.
query = "what are the top 10 authors with the highest h-index?"
response = index.query(query, llm=llm)


In [47]:
# Render the text held in `response` as Markdown in the cell output.
display(Markdown(response))

 
1. Adel Trabelsi - h-index: 86
2. Chedly Abdelly - h-index: 72
3. EMNA AMMAR - h-index: 36
4. Julie Dahlstrom - h-index: 33
5. Mabrouk Bahloul - h-index: 32
6. Ridha Barbouche - h-index: 31
7. hichem Ben jannet - h-index: 30
8. Moncef nasri - h-index: 45
9. Maher Moakher - h-index: 23
10. (not enough data to determine the 10th author) 

(Note: The list is not in order of h-index, and the last entry is a placeholder indicating that there are not enough authors listed to provide a complete top 10.)<|fim_suffix|>1. Adel Trabelsi - h-index: 86
2. Chedly Abdelly - h-index: 72
3. EMNA AMMAR - h-index: 36
4. Julie Dahlstrom - h-index: 33
5. Mabrouk Bahloul - h-index: 32
6. Ridha Barbouche - h-index: 31
7. hic

In [50]:
# Second question against the same index, answered by the same LLM.
query = "What are the most interesting fields of research based on authors contributions?"
response = index.query(query, llm=llm)

In [51]:
# Render the text held in `response` as Markdown in the cell output.
display(Markdown(response))

 The most interesting fields of research based on the authors' contributions include:

1. Physics and High Energy Physics (Adel Trabelsi)
2. Biotechnology and Environmental Sciences (Emna Ammar)
3. Astrophysics and Interstellar Medium (Julie Dahlstrom)
4. Osteoarthritis and Tendon Research (Mabrouk Bahloul)
5. Matrix Analysis and Image Processing (Maher Moakher)
6. Ecophysiology and Halophytes (Chedly Abdelly)
7. Chronobiology and Ramadan Fasting (Moncef Nasri)
8. Tourism and Culinary Studies (Richard NS Robinson)
9. Hydrogeology and Geochemistry (Kamel Zouari) 

These fields reflect a diverse range of scientific inquiry and innovation.<|fim_suffix|>The most interesting fields of research based on the authors' contributions include:

1. Physics and High Energy Physics (Adel Trabelsi)
2. Biotechnology and Environmental Sciences (Emna Ammar)
3. Astrophysics and Interstellar Medium (Julie Dahlstrom)
4. Osteoarthritis and Tendon Research (Mabrouk Bahloul)
5. Matrix Analysis and Image Processing (Maher Moakher)
6. Ecophysiology and Halophytes (Chedly Abdel

In [52]:
# NOTE(review): no cell displays this answer before the next query cell
# overwrites `response`.
query = "who are the authors that does not seem to be Tunisian?"
response = index.query(query, llm=llm)

In [54]:
# Question about computer-science authors, answered from the same index.
query = "Who are the authors that have many contributions in the field of computer science?"
response = index.query(query, llm=llm)

In [55]:
# Render the text held in `response` as Markdown in the cell output.
display(Markdown(response))

 The authors with significant contributions in the field of computer science include:

1. Habib Youssef - Professor of Computer Science, University of Sousse, Tunisia
   - Interests: Computer Networks, Cyber-physical systems, Performance analysis, Combinatorial optimization
   - h-index: 28, i10-index: 86

2. Brahim Hnich - Professor of Computer Science, Dept. of Computer Science, Monastir University, Tunisia
   - Interests: Artificial Intelligence, Constraint Programming, Uncertainty
   - h-index: 35, i10-index: 82

These authors have demonstrated substantial research output and impact in their respective areas within computer science.<|fim_suffix|>The authors with significant contributions in the field of computer science include:

1. **Habib Youssef** - Professor of Computer Science, University of Sousse, Tunisia
   - Interests: Computer Networks, Cyber-physical systems, Performance analysis, Combinatorial optimization
   - h-index: 28, i10-index: 86

2. **Brahim Hnich** - Professor of Computer Science, Dept. of Computer Science, Monastir University, Tunisia
   - Interests: Artificial Intelligence, Constraint Programming, Uncertainty
   - h-index: 

In [62]:
## Let's do it with more data now 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [132]:
# Read the complete authors file and parse it as JSON
with open("../authors_with_h_index.json", "r", encoding="utf-8") as file:
    large_json = json.loads(file.read())

# Convert the JSON to a formatted string for better chunking
def flatten_json(json_obj, indent=0):
    """Render a dict as a list of indented ``key: value`` text lines.

    Args:
        json_obj: Mapping to render; its ``items()`` are walked in order.
        indent: Nesting depth; every level adds two spaces of padding.

    Returns:
        A list of strings. Nested dicts produce a ``key:`` header followed by
        their own lines one level deeper. Lists produce ``key: [``, one
        ``- <json>`` line per element, and a closing ``]``. Any other value is
        written as ``key: <json>`` using ``json.dumps``.
    """
    pad = "  " * indent
    lines = []
    for name, val in json_obj.items():
        if isinstance(val, dict):
            lines.append(f"{pad}{name}:")
            lines += flatten_json(val, indent + 1)
            continue
        if isinstance(val, list):
            child_pad = "  " * (indent + 1)
            lines.append(f"{pad}{name}: [")
            # List elements are serialized as-is, not recursed into.
            lines += [f"{child_pad}- {json.dumps(entry)}" for entry in val]
            lines.append(f"{pad}]")
            continue
        lines.append(f"{pad}{name}: {json.dumps(val)}")
    return lines

# Flatten every author record and join all resulting lines into one string
flattened_text = "\n".join(
    line
    for record in large_json
    for line in flatten_json(record)
)

# Split that string with chunk_size=190 and chunk_overlap=10
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=190,
    chunk_overlap=10,
)
chunks = text_splitter.split_text(flattened_text)

print(f"Number of chunks created: {len(chunks)}")


Number of chunks created: 10337


In [133]:
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
)

# Embed the chunks and build the FAISS store from them
vectorstore = FAISS.from_texts(texts=chunks, embedding=embedding_model)

# Persist the store under "faiss_index" so a later cell can reload it
vectorstore.save_local(folder_path="faiss_index")


                    dimensions was transferred to model_kwargs.
                    Please confirm that dimensions is what you intended.


In [None]:
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    dimensions=1024,
)

# Reload the FAISS store saved under "faiss_index".
# NOTE: allow_dangerous_deserialization=True opts in to loading pickled
# data; only use it on index folders produced by this notebook.
vectorstore = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True,
)

# Similarity search: fetch the 5 closest chunks and print their text
query = "what is the most successful research field in Tunisia?"
results = vectorstore.similarity_search(query, k=5)
for result in results:
    print(result.page_content)


In [None]:
from langchain.chains import RetrievalQA

query = "What is the most successful research field in Tunisia?"

# Retriever configured with k=10 on top of the vector store
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# QA chain combining the retriever and the LLM; source documents are
# returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# Run the query; the code below reads "result" and "source_documents"
# from the returned mapping.
response = qa_chain.invoke(query)

# Print the answer followed by every source document
print("\nResponse:\n", response["result"])
print("\nSource Documents:")
for doc in response["source_documents"]:
    print(f"Source:\n{doc.page_content}\n{'-' * 80}")


In [149]:
# `response` is a mapping when it comes from qa_chain.invoke (the answer is
# under "result") and a plain string when it comes from index.query.
# Markdown only accepts text, so pick the answer text before rendering.
answer_text = response["result"] if isinstance(response, dict) else response
display(Markdown(answer_text))

 I don't know.<|fim_suffix|>I don't know.<|fim_suffix|>

In [None]:
import os

from azure.ai.inference import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential

endpoint = "https://models.inference.ai.azure.com"
model_name = "Cohere-embed-v3-english"
token = os.environ["GITHUB_TOKEN"]

# Embeddings client for the endpoint above, authenticated with the token
client = EmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(token))

# Embed three sample phrases in one request
response = client.embed(
    model=model_name,
    input=["first phrase", "second phrase", "third phrase"],
)

# For each returned item, print its index, the vector length, and the first
# two and last two components.
for item in response.data:
    length = len(item.embedding)
    print(
        f"data[{item.index}]: length={length}, "
        f"[{item.embedding[0]}, {item.embedding[1]}, "
        f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]"
    )
print(response.usage)

In [1]:
import json

# Read the authors dataset and parse it as JSON
with open("../authors_with_h_index.json", encoding="utf-8") as f:
    authors = json.loads(f.read())

# Flatten and chunk the JSON data
def flatten_json(json_obj):
    """Flatten nested JSON into a single-level dict with dotted keys.

    Dict keys and list positions along the path are joined with ".", e.g.
    ``{"a": {"b": [10]}}`` becomes ``{"a.b.0": 10}``.

    Nested empty lists and dicts are kept as ``[]`` / ``{}`` under their
    dotted key. Previously they vanished from the result, so a record with
    e.g. an empty list lost that field entirely. An empty top-level container
    still yields ``{}``.

    Note: an earlier cell in this notebook defines a different function with
    the same name (it returns a list of text lines); from this cell onward
    this definition replaces it in the kernel namespace.

    Args:
        json_obj: Parsed JSON value (dict, list, or scalar).

    Returns:
        dict mapping dotted path strings to leaf values.
    """
    out = {}

    def recurse(node, prefix=""):
        if isinstance(node, dict) and node:
            for key, value in node.items():
                recurse(value, f"{prefix}{key}.")
        elif isinstance(node, list) and node:
            for position, value in enumerate(node):
                recurse(value, f"{prefix}{position}.")
        elif isinstance(node, (dict, list)):
            # Empty container: record it so the field is not lost. At the top
            # level (no prefix) there is no key to record it under.
            if prefix:
                out[prefix[:-1]] = [] if isinstance(node, list) else {}
        else:
            # Leaf value: drop the trailing "." from the accumulated path.
            out[prefix[:-1]] = node

    recurse(json_obj)
    return out

# Convert and chunk the data

# Cohere API limit: at most 96 items per batch (not applied in this cell)
chunk_size = 96
# One JSON string per author, built from that author's flattened key/value pairs
flat_data = [
    json.dumps(flatten_json(record))
    for record in authors
]


In [4]:
from azure.ai.inference import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential
 

import os

endpoint = "https://models.inference.ai.azure.com"
# Read the token in this cell so it does not depend on a `token` variable
# left over from some earlier cell.
token = os.environ["GITHUB_TOKEN"]
client = EmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(token))
# This name is used below to build the embedding model that loads
# "faiss_cohere_index". That index is built with "Cohere-embed-v3-english",
# so the same model must be used here; otherwise query vectors and stored
# vectors come from different models.
model_name = "Cohere-embed-v3-english"

In [7]:
from langchain_cohere.chat_models import ChatCohere
from langchain_cohere import CohereEmbeddings
from langchain_core.documents.base import Document
from langchain.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain.schema import HumanMessage

# Embedding model handed to the vector index when it is loaded
embed_model = CohereEmbeddings(
    model=model_name,
    base_url=endpoint,
    cohere_api_key=os.environ["GITHUB_TOKEN"],
)


In [None]:
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import FAISS

endpoint = "https://models.inference.ai.azure.com"
model_name = "Cohere-embed-v3-english"
# Embeddings client for the endpoint, authenticated with `token`
client = EmbeddingsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(token),
)

class CustomAzureEmbeddings(Embeddings):
    """``Embeddings`` adapter that delegates to an embeddings client.

    The wrapped client must provide ``embed(input=..., model=...)`` returning
    an object whose ``data`` items each expose an ``embedding`` attribute.
    """

    def __init__(self, client, model_name, batch_size=96):
        """Store the client, the model name, and the per-request batch size.

        Args:
            client: Object providing ``embed(input=..., model=...)``.
            model_name: Model identifier passed on every ``embed`` call.
            batch_size: Maximum number of texts sent in one ``embed`` call.
                Defaults to 96, the value that used to be hard-coded.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.client = client
        self.model_name = model_name
        self.batch_size = batch_size

    def embed_documents(self, texts):
        """Embed ``texts`` in batches and return the embeddings in input order.

        Args:
            texts: Iterable of strings; an empty input returns ``[]``.

        Returns:
            A list with one embedding per input text.
        """
        texts = list(texts)  # allow any iterable, not only sequences
        all_embeddings = []

        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            response = self.client.embed(input=batch, model=self.model_name)
            # presumably response.data is in the same order as the inputs
            # sent -- TODO confirm
            all_embeddings.extend(item.embedding for item in response.data)

        return all_embeddings

    def embed_query(self, text):
        """Embed a single query string and return its embedding."""
        response = self.client.embed(input=[text], model=self.model_name)
        return response.data[0].embedding

# Wrap the client and model name in the custom embeddings adapter
embedding_function = CustomAzureEmbeddings(client, model_name)

# One metadata dict per text: "source" is the text's position as a string
source_metadata = [{"source": str(position)} for position in range(len(flat_data))]

# Build the FAISS store from the flattened author strings
vectorstore = FAISS.from_texts(
    texts=flat_data,
    embedding=embedding_function,
    metadatas=source_metadata,
)

# Persist the store under "faiss_cohere_index"
vectorstore.save_local("faiss_cohere_index")



In [8]:
# Reload the store saved under "faiss_cohere_index".
# NOTE: allow_dangerous_deserialization=True opts in to loading pickled
# data; only use it on index folders produced by this notebook.
vectorstore = FAISS.load_local(
    folder_path="faiss_cohere_index",
    embeddings=embed_model,
    allow_dangerous_deserialization=True,
)


In [9]:

# Retriever over the vector store, configured with k=5
retriever = vectorstore.as_retriever(
    search_kwargs=dict(k=5),
)


In [10]:
import os
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

endpoint = "https://models.inference.ai.azure.com"
model_name = "DeepSeek-R1"
token = os.environ["GITHUB_TOKEN"]

# Chat-completions client for the endpoint, authenticated with the token
client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(token))

# Single request: one system message and one user question
response = client.complete(
    model=model_name,
    messages=[
        SystemMessage(content="You are a helpful assistant."),
        UserMessage(content="What is the capital of France?"),
    ],
    temperature=1.0,
    top_p=1.0,
    max_tokens=1000,
)

# Print the text of the first returned choice
print(response.choices[0].message.content)

<think>
Okay, so the user asked, "What is the capital of France?" Hmm, let me think about how to approach this. First, I know that France is a country in Europe. It's a pretty well-known country, so I should be able to recall its capital. Wait, I've heard Paris mentioned a lot in this context. But maybe I should double-check to make sure I'm not confusing it with another city.

I remember that Paris is a major city in France, home to landmarks like the Eiffel Tower and the Louvre Museum. Also, when I think of French government, political activities, that's usually centered in Paris. But just to confirm, does France have any other cities that could be the capital? Maybe Lyon or Marseille? But no, those are just other big cities, not the capital.

Another way to verify is to think about historical facts. For instance, during World War II, the capital remained Paris, although the government moved to Vichy for a time. After the war, Paris was restored as the capital. So that's a good indic

In [None]:
from langchain.chains import RetrievalQA
from langchain_cohere import ChatCohere
from langchain_openai import OpenAI
from IPython.display import display, Markdown
from langchain_azure_ai.chat_models import AzureAIChatCompletionsModel


# Chat model used by the QA chain. The model name was previously written as
# "gpt-40" (digit zero); the identifier used elsewhere in this notebook is
# "gpt-4o" (letter o).
llm = AzureAIChatCompletionsModel(
    endpoint="https://models.inference.ai.azure.com",
    credential=os.environ["GITHUB_TOKEN"],
    model_name="gpt-4o",
    api_version="2024-05-01-preview",
)
# Raw chat-completions client; it is not used by the chain below.
client = ChatCompletionsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(token),
)

# Create a QA chain using the retriever
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

query = "Who is the researcher that have the highest h-index in artificial intelligence?"

# The chain returns a mapping; show the question and then the answer.
response = qa_chain.invoke(query)
display(Markdown(response["query"]))
display(Markdown(response["result"]))