## Importing the necessary libraries

In [106]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_community.embeddings import OllamaEmbeddings
from langchain.document_loaders import PDFPlumberLoader

In [107]:
import torch
# Environment check: reports whether PyTorch can see a CUDA device.
# NOTE(review): no other visible cell uses torch — confirm this check is still needed.
torch.cuda.is_available()



True

## Loading the pdf file

In [108]:
# Parse the report PDF. The loader returns a list of Document objects whose
# metadata carries per-page fields such as `page` and `total_pages`.
loader = PDFPlumberLoader(file_path="Sna_Youtube_Final_2.pdf")
docs = loader.load()

In [109]:
# Preview only the first two pages: the PDF metadata reports 54 pages, and echoing
# every Document floods the notebook output and bloats the saved file.
docs[:2]

[Document(page_content='Mapping the Digital Influence: A Social\n9/2/2024\nNetwork Analysis of the Bioneer Youtube\nChannel\nCourse:\nSocial Network Analysis\n7th Semester\nSupervising Lecturer:\nDimitris Pournarakis\nSpanakis Panagiotis-Alexios (8200158)\n', metadata={'source': 'Sna_Youtube_Final_2.pdf', 'file_path': 'Sna_Youtube_Final_2.pdf', 'page': 0, 'total_pages': 54, 'Author': 'Παναγιώτης Σπακης', 'Creator': 'Microsoft® Word για το Microsoft 365', 'CreationDate': "D:20240127125934+02'00'", 'ModDate': "D:20240127125934+02'00'", 'Producer': 'Microsoft® Word για το Microsoft 365'}),
 Document(page_content='Contents\nIntroduction ............................................................................................................................... 2\nData Overview ........................................................................................................................... 3\n1. Graphical Representation of the Network ............................................

## Splitting the PDF into chunks and building the vector store

In [110]:
# Cut every page into overlapping chunks (chunk_size=1000, chunk_overlap=200) so that
# each retrieved snippet is small enough to be stuffed into a prompt.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed each chunk with the Ollama "mistral" model and index the vectors in Chroma.
# NOTE(review): in the recorded outputs the same page-50 chunk is ranked first for three
# unrelated questions, and the title-page question goes unanswered. Embedding with a
# generative chat model may be the cause; consider a dedicated embedding model — confirm.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OllamaEmbeddings(model="mistral"),
)

## Creating the retriever, the prompt and the RAG chain

In [111]:
# Answering model: a Mistral chat model accessed through ChatOllama at temperature 0.0.
llm_model = 'mistral'
llm = ChatOllama(model=llm_model, temperature=0.0)

# Retrieval side: expose the vector store built from the PDF chunks as a retriever,
# and fetch the shared "rlm/rag-prompt" template from the LangChain hub.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by one
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Answer-only chain: the incoming question is sent to the retriever (whose hits are
# flattened by format_docs) and is also passed through unchanged as "question";
# the filled prompt goes to the LLM and the reply is parsed to a plain string.
# NOTE(review): the later visible cells invoke `rag_chain_with_source`, not this chain.
rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
)

## Building a RAG chain that also returns its source documents

In [112]:
from langchain_core.runnables import RunnableParallel

# Generation step: expects a dict that already contains the retrieved documents under
# "context"; they are flattened to one string with format_docs before the prompt is filled.
rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | prompt
        | llm
        | StrOutputParser()
)

# Retrieval runs first and its raw documents stay in the result; the generated text is
# attached under "answer", so callers can show the sources alongside the answer.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [113]:
# Smoke-test the chain with a broad question. The returned dict holds the retrieved
# `context` documents, the `question`, and the generated `answer`.
response = rag_chain_with_source.invoke("What is the goal of the study?")
response

{'context': [Document(page_content='This distribution provides insights into how attention and authority are\ndistributed across the network, with a few nodes being central influencers, while the\nvast majority have a more modest presence.', metadata={'Author': 'Παναγιώτης Σπακης', 'CreationDate': "D:20240127125934+02'00'", 'Creator': 'Microsoft® Word για το Microsoft 365', 'ModDate': "D:20240127125934+02'00'", 'Producer': 'Microsoft® Word για το Microsoft 365', 'file_path': 'Sna_Youtube_Final_2.pdf', 'page': 50, 'source': 'Sna_Youtube_Final_2.pdf', 'total_pages': 54}),
  Document(page_content='For the Bioneer, these local bridges represent opportunities for maintaining and\nstrengthening its position within the network by being the sole connector for these\nchannels. However, from a network resilience perspective, it would be beneficial to\ndevelop additional connections among these channels to reduce dependency on a single\nnode and thus increase the robustness of the network against

## Running the rag chain

In [114]:
# Keep the whole result dict (context, question, answer) so the answer text can be
# inspected on its own in the next cell. The query uses the standard spelling
# "betweenness", because the query text is what gets embedded for retrieval.
answer = rag_chain_with_source.invoke("What is betweenness centrality?")

In [115]:
# Show only the generated answer text from the result dict (displayed as the cell output).
answer["answer"]

" Betweenness centrality is a measure of the influence or importance of nodes in a network, based on their ability to act as intermediaries between other nodes. In this context, it provides insights into how attention and authority are distributed across the network, with some nodes being central influencers while others have a more modest presence. The size distribution chart from Gephi shows that all nodes in the network belong to one giant component, indicating a 'small world' network where most nodes are connected and part of a single web of connections."

In [116]:
# Same topic phrased with an article; the full result is displayed so the retrieved
# source chunks can be inspected. "betweenness" is spelled correctly in the query.
rag_chain_with_source.invoke("What is the betweenness centrality?")

{'context': [Document(page_content='This distribution provides insights into how attention and authority are\ndistributed across the network, with a few nodes being central influencers, while the\nvast majority have a more modest presence.', metadata={'Author': 'Παναγιώτης Σπακης', 'CreationDate': "D:20240127125934+02'00'", 'Creator': 'Microsoft® Word για το Microsoft 365', 'ModDate': "D:20240127125934+02'00'", 'Producer': 'Microsoft® Word για το Microsoft 365', 'file_path': 'Sna_Youtube_Final_2.pdf', 'page': 50, 'source': 'Sna_Youtube_Final_2.pdf', 'total_pages': 54}),
  Document(page_content="the network extraction process. A seed channel is typically the starting point for\ncrawling the network.\n• Seed Rank: If the channel was used as a seed, this numerical value indicates its\nrank or order in the seeding process, which may affect the network structure as\nit unfolds from this point.\n• Subscriber Count: Reflecting the popularity and reach of the channel, this\nfigure denotes the 

In [117]:
# Probe a fact that appears on the title page (page 0 of the loaded PDF) and display
# the full result, including which chunks were retrieved.
rag_chain_with_source.invoke("Who is the Supervising Lecturer?")

{'context': [Document(page_content='This distribution provides insights into how attention and authority are\ndistributed across the network, with a few nodes being central influencers, while the\nvast majority have a more modest presence.', metadata={'Author': 'Παναγιώτης Σπακης', 'CreationDate': "D:20240127125934+02'00'", 'Creator': 'Microsoft® Word για το Microsoft 365', 'ModDate': "D:20240127125934+02'00'", 'Producer': 'Microsoft® Word για το Microsoft 365', 'file_path': 'Sna_Youtube_Final_2.pdf', 'page': 50, 'source': 'Sna_Youtube_Final_2.pdf', 'total_pages': 54}),
  Document(page_content='For the Bioneer, these local bridges represent opportunities for maintaining and\nstrengthening its position within the network by being the sole connector for these\nchannels. However, from a network resilience perspective, it would be beneficial to\ndevelop additional connections among these channels to reduce dependency on a single\nnode and thus increase the robustness of the network against

## Evaluating the rag chain

In [132]:
from langchain.evaluation.qa import QAGenerateChain

# A separate chat-model instance at temperature 0.5 writes the evaluation questions;
# it uses the same model name as the answering LLM.
example_gen_chain = QAGenerateChain.from_llm(
    ChatOllama(model=llm_model, temperature=0.5)
)

In [133]:
# Generate one question/answer pair per page, using only the first five pages of the PDF.
new_examples = example_gen_chain.apply([{"doc": page} for page in docs[:5]])

In [134]:
# Inspect the raw generator output: each item nests its pair under a 'qa_pairs' key.
new_examples

[{'qa_pairs': {'query': 'Who is the supervising lecturer for the Social Network Analysis course in the 7th semester, as mentioned in this document?',
   'answer': 'Dimitris Pournarakis.'}},
 {'qa_pairs': {'query': 'What are the main sections and sub-sections covered in the document regarding network analysis of YouTube channels?',
   'answer': 'The document covers various aspects of network analysis of YouTube channels, which includes Data Overview, Graphical Representation of the Network, Basic Topological Properties, Component Measures, Degree Measures, Centrality Measures, Clustering Effects, Bridges and Local Bridges, Gender, Homophily, and Graph Density. The sub-sections under each of these sections are further detailed in the document.'}},
 {'qa_pairs': {'query': 'Which tools were used for extracting network data from YouTube and conducting the analysis in this study?',
   'answer': 'Bernhard Rieder’s YouTube Data Tools modules were used for extracting detailed network data from 

In [135]:
# Unwrap the nested 'qa_pairs' dicts so every example becomes a flat
# {'query': ..., 'answer': ...} record, the shape fed to the QA chain and the grader.
reformatted_examples = [
    {'query': qa_pair['query'], 'answer': qa_pair['answer']}
    for qa_pair in (example['qa_pairs'] for example in new_examples)
]
reformatted_examples

[{'query': 'Who is the supervising lecturer for the Social Network Analysis course in the 7th semester, as mentioned in this document?',
  'answer': 'Dimitris Pournarakis.'},
 {'query': 'What are the main sections and sub-sections covered in the document regarding network analysis of YouTube channels?',
  'answer': 'The document covers various aspects of network analysis of YouTube channels, which includes Data Overview, Graphical Representation of the Network, Basic Topological Properties, Component Measures, Degree Measures, Centrality Measures, Clustering Effects, Bridges and Local Bridges, Gender, Homophily, and Graph Density. The sub-sections under each of these sections are further detailed in the document.'},
 {'query': 'Which tools were used for extracting network data from YouTube and conducting the analysis in this study?',
  'answer': 'Bernhard Rieder’s YouTube Data Tools modules were used for extracting detailed network data from YouTube, and a combination of Python with Ne

In [136]:
from langchain.chains import RetrievalQA

# "Stuff"-style retrieval QA chain over the same retriever and answering LLM.
# verbose=True is what produces the "Entering new RetrievalQA chain..." log lines.
# NOTE(review): this is a separate chain built from `llm` and `retriever` only; the hub
# prompt and `rag_chain_with_source` defined earlier are not what gets evaluated here.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"document_separator": "<<<<>>>>>"},
    verbose=True,
)

In [137]:
# Run all generated questions through the RetrievalQA chain in one batch.
# Each returned dict echoes the input 'query' and 'answer' and adds the model's 'result'.
predictions = qa.batch(reformatted_examples)
predictions



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


[{'query': 'Who is the supervising lecturer for the Social Network Analysis course in the 7th semester, as mentioned in this document?',
  'answer': 'Dimitris Pournarakis.',
  'result': ' I cannot directly answer that question using the provided context. The text mentions the analysis of a network and various attributes of channels within that network, but it does not mention any specific lecturer or course details, including the name or identity of the supervising lecturer for the Social Network Analysis course in the 7th semester.'},
 {'query': 'What are the main sections and sub-sections covered in the document regarding network analysis of YouTube channels?',
  'answer': 'The document covers various aspects of network analysis of YouTube channels, which includes Data Overview, Graphical Representation of the Network, Basic Topological Properties, Component Measures, Degree Measures, Centrality Measures, Clustering Effects, Bridges and Local Bridges, Gender, Homophily, and Graph Den

In [138]:
from langchain.evaluation.qa import QAEvalChain

# Dedicated grader model. It gets its own name so the answering `llm` defined earlier is
# not silently replaced if earlier cells are re-run, and it runs at temperature 0.0 so
# the CORRECT / INCORRECT verdicts are as repeatable as the model allows.
eval_llm = ChatOllama(model=llm_model, temperature=0.0)
eval_chain = QAEvalChain.from_llm(eval_llm)

In [139]:
# Grade every prediction against its reference answer; each item comes back as a dict
# with a 'results' string holding the grader's verdict and explanation.
# NOTE(review): relies on QAEvalChain.evaluate's default key names matching
# 'query' / 'answer' / 'result' — confirm against the installed LangChain version.
graded_outputs = eval_chain.evaluate(reformatted_examples, predictions)

In [140]:
# Inspect the raw grader responses before formatting them into a report.
graded_outputs

[{'results': ' INCORRECT - The student answer did not provide the correct name of the supervising lecturer. However, the student answer was reasonable as it correctly pointed out that the provided text does not contain information about the identity of the lecturer.'},
 {'results': ' INCORRECT\n\nThe student\'s answer mentioned two main sections - "Description and Importance of Network Metrics for YouTube Channels" and "Analysis of Bioneer Channel\'s Network using Python (Networkx library)" - which is not a perfect match with the true answer. The true answer includes additional topics covered in the document, such as Data Overview, Graphical Representation of the Network, Basic Topological Properties, Component Measures, Degree Measures, Centrality Measures, Clustering Effects, Bridges and Local Bridges, Gender, Homophily, and Graph Density. While the student\'s answer does provide some correct information, it is not as comprehensive as the true answer.'},
 {'results': " INCORRECT\n\nT

In [141]:
# Print a per-example report: question, reference answer, model answer and grader verdict.
# Predictions and grades are index-aligned, so they are walked together.
for i, (prediction, grade) in enumerate(zip(predictions, graded_outputs)):
    grade_text = grade['results']
    print(f"Example {i}:")
    print("Question: " + prediction['query'])
    print("Real Answer: " + prediction['answer'])
    print("Predicted Answer: " + prediction['result'])
    print("Result: " + grade_text)
    # The conditional must be parenthesised: `+` binds tighter than `if ... else`, so
    # without the parentheses a correct row printed a bare "Yes" with no label.
    print("Is correct: " + ("No" if "INCORRECT" in grade_text else "Yes"))
    print()

Example 0:
Question: Who is the supervising lecturer for the Social Network Analysis course in the 7th semester, as mentioned in this document?
Real Answer: Dimitris Pournarakis.
Predicted Answer:  I cannot directly answer that question using the provided context. The text mentions the analysis of a network and various attributes of channels within that network, but it does not mention any specific lecturer or course details, including the name or identity of the supervising lecturer for the Social Network Analysis course in the 7th semester.
Result:  INCORRECT - The student answer did not provide the correct name of the supervising lecturer. However, the student answer was reasonable as it correctly pointed out that the provided text does not contain information about the identity of the lecturer.
Is correct: No

Example 1:
Question: What are the main sections and sub-sections covered in the document regarding network analysis of YouTube channels?
Real Answer: The document covers vari