In [None]:
!pip install -r requirements.txt

In [1]:

import networkx as nx

# Create a new directed graph
G = nx.DiGraph()

# Add nodes with attributes (e.g., entity types, descriptions)
G.add_node("Artificial Intelligence", description="A field of computer science focused on building smart machines.", type="Field")
G.add_node("Machine Learning", description="A subset of AI involving the use of data and algorithms to mimic human learning.", type="Subfield")
G.add_node("Deep Learning", description="A subset of machine learning using neural networks with many layers.", type="Subfield")

# Add edges to establish relationships between the nodes
G.add_edge("Artificial Intelligence", "Machine Learning", relation="has_subfield")
G.add_edge("Machine Learning", "Deep Learning", relation="has_subfield")

# Optionally, you can visualize the graph or query it
print("Nodes in the graph:", G.nodes(data=True))
print("Edges in the graph:", G.edges(data=True))

Nodes in the graph: [('Artificial Intelligence', {'description': 'A field of computer science focused on building smart machines.', 'type': 'Field'}), ('Machine Learning', {'description': 'A subset of AI involving the use of data and algorithms to mimic human learning.', 'type': 'Subfield'}), ('Deep Learning', {'description': 'A subset of machine learning using neural networks with many layers.', 'type': 'Subfield'})]
Edges in the graph: [('Artificial Intelligence', 'Machine Learning', {'relation': 'has_subfield'}), ('Machine Learning', 'Deep Learning', {'relation': 'has_subfield'})]


In [2]:
import faiss
import numpy as np
import torch
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer

# Load a pre-trained model and tokenizer
tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# example embedding
input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
embeddings = model(input_ids).pooler_output

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:

print(embeddings)

tensor([[ 3.2362e-02,  1.2753e-01,  1.6819e-01,  2.7976e-03,  3.8969e-01,
          2.4265e-01,  2.1790e-01, -2.3352e-02, -8.4820e-02, -1.4324e-01,
         -4.6391e-01, -7.3888e-02, -2.6837e-01, -2.5854e-01, -3.9385e-01,
         -4.6203e-02,  4.6798e-02,  2.9801e-01, -7.3548e-02,  1.9216e-01,
         -9.2044e-02, -3.9324e-01, -4.1118e-01, -5.0958e-01,  1.7611e-01,
          1.4473e-01,  1.2819e-01,  3.6131e-01, -3.7362e-01, -4.8173e-02,
         -1.3587e-01, -2.2887e-01,  4.0440e-01, -2.7060e-01, -1.5117e-01,
          1.1219e-01, -1.0912e-01, -1.8131e-01, -2.0688e-01,  3.5515e-01,
          2.4943e-01,  1.4510e-01,  1.9775e-01, -4.0816e-02, -2.7108e-01,
         -1.0122e-01, -7.1524e-01,  1.9005e-01,  4.7088e-01, -1.7366e-01,
          5.3381e-02,  2.5160e-01,  1.4066e-02, -5.7960e-02, -9.5530e-02,
          1.0499e-01,  2.2886e-02, -3.8995e-01,  8.1239e-03, -5.6475e-02,
          2.6910e-02,  6.4864e-02,  2.2938e-01,  5.1146e-02, -4.0683e-01,
          5.7723e-01,  1.0042e-01,  9.

In [4]:
# Example corpus and queries
corpus = [
    "AI is transforming industries.",
    "Machine Learning is part of AI.",
    "Deep Learning is a type of Machine Learning."
]
queries = ["What is AI?", "What is Machine Learning?"]

def encode(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # Disable gradient calculation
        embeddings = model(**inputs).last_hidden_state  # Access the last_hidden_state
    return embeddings.mean(dim=1).numpy()  # Return the mean of the token embeddings

# Encode the entire corpus
encoded_corpus = np.vstack([encode(doc) for doc in corpus])

# Build the FAISS index
index = faiss.IndexFlatL2(encoded_corpus.shape[1])
index.add(encoded_corpus)

# Encode a query and search for relevant documents
def search(query):
    encoded_query = encode(query)
    D, I = index.search(encoded_query, k=1)  # k is the number of top documents to return
    return I[0][0], D[0][0]  # Return the index and distance

# Example query
query = "What is Machine Learning?"
result_index, distance = search(query)
print(f"Top result for query: {corpus[result_index]} with distance: {distance}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


AttributeError: 'DPRQuestionEncoderOutput' object has no attribute 'last_hidden_state'

In [6]:
from transformers import AutoModelForSeq2SeqLM, pipeline

# Load a pre-trained language model for sequence-to-sequence generation
generation_pipeline = pipeline("text2text-generation",  model="facebook/bart-large-cnn")

# Sample query
query = "Explain the relationship between AI and Deep Learning."

# Search for relevant documents (via FAISS)
encoded_query = model(**tokenizer(query, return_tensors="pt")).last_hidden_state.mean(dim=1).detach().numpy()
D, I = index.search(encoded_query, k=2)

# Retrieve top documents
retrieved_docs = [corpus[idx] for idx in I[0]]
context = " ".join(retrieved_docs)

# Incorporate the knowledge graph
relevant_nodes = ["Artificial Intelligence", "Deep Learning"]  # Nodes based on some heuristic or lookup

# Generate a response grounded in the graph and retrieved documents
generation_input = f"Question: {query}\nContext: {context}\nRelevant Nodes: {', '.join(relevant_nodes)}"
generated_response = generation_pipeline(generation_input)
print(f"Generated response: {generated_response[0]['generated_text']}")


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


AttributeError: 'DPRQuestionEncoderOutput' object has no attribute 'last_hidden_state'