## Importing the necessary libraries

In [15]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_community.embeddings import OllamaEmbeddings
from langchain.document_loaders import DirectoryLoader

In [16]:
import torch
torch.cuda.is_available()

True

## Loading the pdf file

In [17]:
# Load the pdf file
loader = DirectoryLoader(
    "./",glob="**/*.pdf" , show_progress=True
)
docs = loader.load()

100%|██████████| 1/1 [00:02<00:00,  2.20s/it]


In [18]:
docs

[Document(page_content='9/2/2024\n\nMapping the Digital Influence: A Social Network Analysis of the Bioneer Youtube Channel\n\nCourse:\n\nSocial Network Analysis\n\n7th Semester\n\nSupervising Lecturer:\n\nDimitris Pournarakis\n\nSpanakis Panagiotis-Alexios (8200158)\n\nContents Introduction ............................................................................................................................... 2\n\nData Overview ........................................................................................................................... 3\n\n1. Graphical Representation of the Network ............................................................................ 4\n\n2. Basic Topological Properties .................................................................................................. 7\n\n3. Component Measures ........................................................................................................... 8\n\n4. Degree Measures.................

## Splitting the pdf file into chunks

In [19]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4096, chunk_overlap=1024)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OllamaEmbeddings(model="mistral"))

## Creating the vector database and the prompt

In [20]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm_model = 'mistral'
llm = ChatOllama(model=llm_model, temperature=0.0)


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
)

## Finalizing the rag chain

In [21]:
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
        RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | prompt
        | llm
        | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [22]:
response = rag_chain_with_source.invoke("What is the goal of the study?")
response

{'context': [Document(page_content="In our network, the assortativity coefficient for the country is around 0.04, indicating that channels have a slight preference for connecting with others from the same country, but the effect is not strong. This level of homophily can be influenced by various factors, such as the language of the content, cultural specifics that resonate more strongly with a domestic audience, or the geographic focus of the channel's content. Channels naturally gravitate towards others in the same country due to shared cultural contexts, ease of collaboration, or similar time zones, which can facilitate interaction. However, the low magnitude of the coefficient suggests that while there is a hint of homophily based on country, the network is relatively open to cross-country connections, reflecting a more globally integrated community of YouTube channels.", metadata={'source': 'Sna_Youtube_Final_2.pdf'}),
  Document(page_content='We can get a clear view of the interac

## Running the rag chain

In [23]:
answer = rag_chain_with_source.invoke("What is betweeness centrality?")

In [24]:
# Get the answer from the rag chain and print it.
answer["answer"]

' Betweenness centrality is a measure of the importance of a node in a graph or network, specifically how often it lies on the shortest paths between other nodes. In this context, a low assortativity coefficient of around 0.04 and a reciprocity of around 0.083 suggest that channels in the network have only a slight preference for connecting with others from the same country and there is a relatively low likelihood (8.3%) of mutual connections between channels. These characteristics may indicate a network where some nodes play significant roles as intermediaries or connectors between different communities, contributing to its structure and communication dynamics.'

In [25]:
rag_chain_with_source.invoke("Who is the Supervising Lecturer?")

{'context': [Document(page_content="In our network, the assortativity coefficient for the country is around 0.04, indicating that channels have a slight preference for connecting with others from the same country, but the effect is not strong. This level of homophily can be influenced by various factors, such as the language of the content, cultural specifics that resonate more strongly with a domestic audience, or the geographic focus of the channel's content. Channels naturally gravitate towards others in the same country due to shared cultural contexts, ease of collaboration, or similar time zones, which can facilitate interaction. However, the low magnitude of the coefficient suggests that while there is a hint of homophily based on country, the network is relatively open to cross-country connections, reflecting a more globally integrated community of YouTube channels.", metadata={'source': 'Sna_Youtube_Final_2.pdf'}),
  Document(page_content='We can get a clear view of the interac

## Evaluating the rag chain

In [26]:
from langchain.evaluation.qa import QAGenerateChain

example_gen_chain = QAGenerateChain.from_llm(ChatOllama(model=llm_model,temperature=0.5))

In [27]:
new_examples = example_gen_chain.apply(
    [{"doc": t} for t in docs[:5]]
)

ValueError: Could not parse output:  Title: An In-depth Analysis of the Bioneer YouTube Channel Network

Abstract: This study provides an in-depth analysis of the Bioneer YouTube channel network, exploring its structure, diversity, and interconnectedness. Using Social Network Analysis (SNA) techniques, the paper examines the network's centrality, community segmentation, gender representation, homophily, graph density metrics, and PageRank distribution. The findings highlight the Bioneer's role as a central hub, the presence of specialized communities, the predominance of male-led channels, the selective nature of connections, and the network's potential for growth. The study underscores the multifaceted impact a single channel can have in the context of content consumption on YouTube and emphasizes the need for more inclusive representation within the platform's ecosystem.

Keywords: Social Network Analysis (SNA), YouTube Channel Network, Bioneer, Centrality, Community Segmentation, Gender Representation, Homophily, Graph Density Metrics, PageRank Distribution.

1. Introduction
The introduction sets the context for the study and explains the rationale behind analyzing the Bioneer YouTube channel network using Social Network Analysis (SNA) techniques. It highlights the importance of understanding the structure, diversity, and interconnectedness of content ecosystems on YouTube.

2. Methodology
The methodology section describes the data collection process for the Bioneer YouTube channel network and explains how the SNA analysis was conducted using Python libraries such as NetworkX and Scipy. It also discusses the use of graph visualization tools like Gephi to create visual representations of the network.

3. Results and Discussion

3.1 Centrality Analysis
This subsection explores the Bioneer's role in the network by analyzing its degree centrality, betweenness centrality, closeness centrality, and eigenvector centrality. The findings suggest that the Bioneer serves as a central hub in the network.

3.2 Community Detection
This subsection discusses the presence of five distinct communities within the network based on their thematic or topical focus: mainstream and niche fitness, calisthenics, science, digital entertainment, and technology. The analysis reveals that each community contributes uniquely to the network's collective narrative.

3.3 Gender Analysis
This subsection examines the gender representation within the network by analyzing channel names, profile pictures, and video titles. The findings indicate a predominance of male-led channels, reflecting broader societal trends and emphasizing the need for more inclusive representation within the network.

3.4 Homophily Analysis
This subsection explores the network's inclination towards forming connections based on shared national backgrounds, audience sizes, and similar video and view counts. The analysis also reveals points of cross-boundary interactions essential for diversity and idea exchange.

3.5 Graph Density Metrics
This subsection discusses the network's resilience by analyzing graph density metrics such as average degree, clustering coefficient, transitivity, and modularity. The findings suggest that the network is selective and sparsely connected, with room for growth and potential for more robust community structures.

3.6 PageRank Analysis
This subsection examines the Bioneer's influence within the network by analyzing its PageRank score and role as a bridge or connector between different communities. The findings indicate that the Bioneer serves as a unique position, occupying fewer central roles but potentially influencing multiple communities indirectly.

4. Conclusion
The conclusion summarizes the main findings of the study and emphasizes the importance of understanding the structure, diversity, and interconnectedness of content ecosystems on YouTube, particularly in the context of the Bioneer channel network. It also highlights the potential for content creators to act as unifying entities in an increasingly segmented digital landscape.

5. References
The paper includes a list of references used throughout the study.

In [None]:
new_examples

In [None]:
reformatted_examples = [{'query': pair['qa_pairs']['query'], 'answer': pair['qa_pairs']['answer']} for pair in
                        new_examples]
reformatted_examples

In [None]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    chain_type_kwargs={
        "document_separator": "<<<<>>>>>"
    }
)

In [None]:
predictions = qa.batch(reformatted_examples)
predictions

In [None]:
from langchain.evaluation.qa import QAEvalChain

llm = ChatOllama(temperature=1, model=llm_model)
eval_chain = QAEvalChain.from_llm(llm)

In [None]:
graded_outputs = eval_chain.evaluate(reformatted_examples, predictions)

In [None]:
graded_outputs

In [None]:
for i, eg in enumerate(new_examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Result: " + graded_outputs[i]['results'])
    print("Is correct: " + "No" if "INCORRECT" in graded_outputs[i]['results'] else "Yes")
    print()