# Document


In [None]:
!pip install 

In [4]:
from indoxArcg.data_connectors import Document

# Build a document from a source label, its text, and free-form metadata.
doc_metadata = {"language": "English", "accessed_date": "2024-08-20"}
doc = Document(
    source="Wikipedia",
    content="Wikipedia is a free online encyclopedia.",
    metadata=doc_metadata,
)

# Read back each attribute of the freshly created document.
print(f"Document ID: {doc.id_}")
print(f"Source: {doc.source}")
print(f"Content: {doc.content}")
print(f"Metadata: {doc.metadata}")

# Round trip: serialise to a plain dict, then rebuild a Document from it.
doc_dict = doc.to_dict()
print("Document as dictionary:", doc_dict)

new_doc = Document.from_dict(doc_dict)
print("New document:", new_doc)

# print() renders the object through str(), showing its string representation.
print(doc)

Document ID: 1b4c4c46d9824485a3f7b8b136575b59
Source: Wikipedia
Content: Wikipedia is a free online encyclopedia.
Metadata: {'language': 'English', 'accessed_date': '2024-08-20'}
Document as dictionary: {'doc_id': '1b4c4c46d9824485a3f7b8b136575b59', 'source': 'Wikipedia', 'content': 'Wikipedia is a free online encyclopedia.', 'metadata': {'language': 'English', 'accessed_date': '2024-08-20'}}
New document: Doc ID: 1b4c4c46d9824485a3f7b8b136575b59
Source: Wikipedia
Content: Wikipedia is a free online encyclopedia.

Doc ID: 1b4c4c46d9824485a3f7b8b136575b59
Source: Wikipedia
Content: Wikipedia is a free online encyclopedia.



# Wikipedia

In [None]:
!pip install wikipedia

In [8]:
from indoxArcg.data_connectors import WikipediaReader

# Titles of the Wikipedia pages to fetch.
pages = ["Python (programming language)", "Artificial intelligence"]

# A single reader call loads one document per requested page.
reader = WikipediaReader()
documents = reader.load_data(pages=pages)

# For each document show its title, URL, and the first 200 characters
# of both the summary and the full content.
for wiki_doc in documents:
    meta = wiki_doc.metadata
    print(f"Title: {meta['title']}")
    print(f"URL: {meta['url']}")
    print(f"Summary: {meta['summary'][:200]}...")
    print(f"Content preview: {wiki_doc.content[:200]}...")
    print("---")

Title: Python (programming language)
URL: https://en.wikipedia.org/wiki/Python_(programming_language)
Summary: Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Python is dynamically typed and garbage-collect...
Content preview: Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.
Python is dynamically typed and garbage-collect...
---
Title: Artificial intelligence
URL: https://en.wikipedia.org/wiki/Artificial_intelligence
Summary: Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of research in computer science that develops and studies metho...
Content preview: Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems. It is a field of 

# YouTube

In [None]:
!pip install youtube_transcript_api

In [None]:
from indoxArcg.data_connectors import YoutubeTranscriptReader

# Links of the YouTube videos whose transcripts should be fetched.
video_links = ["https://www.youtube.com/watch?v=dN0lsF2cvm4&t=44s"]

# Load the transcripts for the given links.
reader = YoutubeTranscriptReader()
documents = reader.load_data(ytlinks=video_links)

# Print the identifying metadata and the first 200 characters of each transcript.
for transcript_doc in documents:
    meta = transcript_doc.metadata
    print(f"Video ID: {meta['video_id']}")
    print(f"Video Link: {meta['link']}")
    print(f"Language: {meta['language']}")
    print(f"Transcript preview: {transcript_doc.content[:200]}...")
    print("---")

# arXiv

In [12]:
from indoxArcg.data_connectors import ArxivReader

# Initialize the reader
reader = ArxivReader()

# arXiv identifiers of the papers to load
paper_ids = ["2201.08239", "2203.02155"]
documents = reader.load_data(paper_ids)

# Process the retrieved documents
for doc in documents:
    print(f"Title: {doc.metadata['title']}")
    print(f"Authors: {doc.metadata['authors']}")
    # The recorded output shows doc.content starting with "Title: ...", so the
    # first 200 characters are a preview of the content, not the abstract itself.
    print(f"Content preview: {doc.content[:200]}...")
    print(f"arXiv URL: {doc.metadata['arxiv_url']}")
    print("---")

Title: LaMDA: Language Models for Dialog Applications
Authors: Romal Thoppilan, Daniel De Freitas, Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, YaGuang Li, Hongrae Lee, Huaixiu Steven Zheng, Amin Ghafouri, Marcelo Menegali, Yanping Huang, Maxim Krikun, Dmitry Lepikhin, James Qin, Dehao Chen, Yuanzhong Xu, Zhifeng Chen, Adam Roberts, Maarten Bosma, Vincent Zhao, Yanqi Zhou, Chung-Ching Chang, Igor Krivokon, Will Rusch, Marc Pickett, Pranesh Srinivasan, Laichee Man, Kathleen Meier-Hellstern, Meredith Ringel Morris, Tulsee Doshi, Renelito Delos Santos, Toju Duke, Johnny Soraker, Ben Zevenbergen, Vinodkumar Prabhakaran, Mark Diaz, Ben Hutchinson, Kristen Olson, Alejandra Molina, Erin Hoffman-John, Josh Lee, Lora Aroyo, Ravi Rajakumar, Alena Butryna, Matthew Lamm, Viktoriya Kuzmina, Joe Fenton, Aaron Cohen, Rachel Bernstein, Ray Kurzweil, Blaise Aguera-Arcas, Claire Cui, Marian Croak, Ed Chi, Quoc Le
Abstract: Title: LaMDA: Lang

In [5]:
# Show the complete metadata dictionary of the first document loaded above.
first_metadata = documents[0].metadata
print(first_metadata)

{'paper_id': '2201.08239', 'title': 'LaMDA: Language Models for Dialog Applications', 'authors': 'Romal Thoppilan, Daniel De Freitas, Jamie Hall, Noam Shazeer, Apoorv Kulshreshtha, Heng-Tze Cheng, Alicia Jin, Taylor Bos, Leslie Baker, Yu Du, YaGuang Li, Hongrae Lee, Huaixiu Steven Zheng, Amin Ghafouri, Marcelo Menegali, Yanping Huang, Maxim Krikun, Dmitry Lepikhin, James Qin, Dehao Chen, Yuanzhong Xu, Zhifeng Chen, Adam Roberts, Maarten Bosma, Vincent Zhao, Yanqi Zhou, Chung-Ching Chang, Igor Krivokon, Will Rusch, Marc Pickett, Pranesh Srinivasan, Laichee Man, Kathleen Meier-Hellstern, Meredith Ringel Morris, Tulsee Doshi, Renelito Delos Santos, Toju Duke, Johnny Soraker, Ben Zevenbergen, Vinodkumar Prabhakaran, Mark Diaz, Ben Hutchinson, Kristen Olson, Alejandra Molina, Erin Hoffman-John, Josh Lee, Lora Aroyo, Ravi Rajakumar, Alena Butryna, Matthew Lamm, Viktoriya Kuzmina, Joe Fenton, Aaron Cohen, Rachel Bernstein, Ray Kurzweil, Blaise Aguera-Arcas, Claire Cui, Marian Croak, Ed Chi, Q

# Twitter

In [None]:
import os

from dotenv import load_dotenv
from indoxArcg.data_connectors import TwitterTweetReader

# Load dotenv variables first, then read the bearer token from the environment.
load_dotenv()
twitter_token = os.environ['twitter_token']

# Handles of the accounts to read tweets from.
twitter_handles = ["OpenAI", "DeepMind"]

# The reader is constructed with the bearer token; request 50 tweets.
reader = TwitterTweetReader(bearer_token=twitter_token)
documents = reader.load_data(twitterhandles=twitter_handles, num_tweets=50)

# Print the account details and the first 200 characters of each document.
for tweets_doc in documents:
    meta = tweets_doc.metadata
    print(f"Username: {meta['username']}")
    print(f"User ID: {meta['user_id']}")
    print(f"Number of tweets: {meta['num_tweets']}")
    print(f"Tweets preview: {tweets_doc.content[:200]}...")
    print("---")

# Gutenberg

In [None]:
from indoxArcg.data_connectors import GutenbergReader

reader = GutenbergReader()

# --- Part 1: fetch one book by its ID ---
book_id = "11"  # Alice's Adventures in Wonderland
book = reader.get_book(book_id)

# Nothing is printed when the lookup returns a falsy result.
if book:
    print(f"Title: {book.metadata['title']}")
    print(f"Content preview: {book.content[:200]}...")
    print("---")

# --- Part 2: search the catalogue ---
search_query = "Sherlock Holmes"
search_results = reader.search_gutenberg(search_query)

# Only the first five hits are listed.
top_results = search_results[:5]
for hit in top_results:
    info = hit.metadata
    print(f"Book ID: {info['book_id']}")
    print(f"Title: {info['title']}")
    print(f"Author: {info['author']}")
    print("---")

Title: Unknown Title
Content preview: ï»¿ï»¿*** START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN
WONDERLAND ***
[Illustration]




Aliceâs Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3...
---
Book ID: 1661
Title: The Adventures of Sherlock Holmes
Author: Arthur Conan Doyle
---
Book ID: 244
Title: A Study in Scarlet
Author: Arthur Conan Doyle
---
Book ID: 2852
Title: The Hound of the Baskervilles
Author: Arthur Conan Doyle
---
Book ID: 2097
Title: The Sign of the Four
Author: Arthur Conan Doyle
---
Book ID: 834
Title: The Memoirs of Sherlock Holmes
Author: Arthur Conan Doyle
---


# GitHub

In [None]:
import os

from dotenv import load_dotenv
from indoxArcg.data_connectors import GithubClient, GithubRepositoryReader

# Load variables from 'api.env', then read the GitHub token from the environment.
load_dotenv('api.env')
github_token = os.environ['github_token']
github_client = GithubClient(github_token=github_token)

# Both filters use the INCLUDE filter type: the "docs" directory and ".md" files.
include = GithubRepositoryReader.FilterType.INCLUDE
repo_reader = GithubRepositoryReader(
    github_client=github_client,
    owner="osllmai",
    repo="indoxjudge",
    filter_directories=(["docs"], include),
    filter_file_extensions=([".md"], include),
)

# Read the repository contents from the "main" branch.
documents = repo_reader.load_data(branch="main")

# Print file details and the first 200 characters of each loaded document.
for file_doc in documents:
    meta = file_doc.metadata
    print(f"File: {meta['file_name']}")
    print(f"Path: {meta['file_path']}")
    print(f"Size: {meta['file_size']} bytes")
    print(f"Content preview: {file_doc.content[:200]}...")
    print("---")

Processing file: Branch_and_PR_Guidelines.md
Processing file: README.md
Processing file: docs/metrics/AnswerRelevancy.md
Processing file: docs/metrics/BLEU.md
Processing file: docs/metrics/Bertscore.md
Processing file: docs/metrics/Bias.md
Processing file: docs/metrics/ContextualRelevancy.md
Processing file: docs/metrics/Fairness.md
Processing file: docs/metrics/Faithfulness.md
Processing file: docs/metrics/GEval.md
Processing file: docs/metrics/Gruen.md
Processing file: docs/metrics/Hallucination.md
Processing file: docs/metrics/Harmfulness.md
Processing file: docs/metrics/KnowledgeRetention.md
Processing file: docs/metrics/METEOR.md
Processing file: docs/metrics/MachineEthics.md
Processing file: docs/metrics/Misinformation.md
Processing file: docs/metrics/Privacy.md
Processing file: docs/metrics/ROUGE.md
Processing file: docs/metrics/Stereotype and Bias.md
Processing file: docs/metrics/Toxicity.md
Processing file: docs/piplines/CustomEvaluator.md
Processing file: docs/piplines/LLMCom

# Discord

In [None]:
import os

import nest_asyncio
from dotenv import load_dotenv
from indoxArcg.data_connectors import DiscordChannelReader

# Apply the nest_asyncio patch before the reader is used
# (presumably needed because the notebook already runs an event loop).
nest_asyncio.apply()

# Load variables from 'api.env', then read the bot token from the environment.
load_dotenv('api.env')
discord_token = os.environ['discord_token']

# IDs of the Discord channels to read messages from.
channel_ids = [1275046109722447915]

# The reader is constructed with the bot token; request 50 messages.
reader = DiscordChannelReader(bot_token=discord_token)
documents = reader.load_data(channel_ids=channel_ids, num_messages=50)

# Print channel details and the first 200 characters of each document.
for channel_doc in documents:
    meta = channel_doc.metadata
    print(f"Channel ID: {meta['channel_id']}")
    print(f"Channel Name: {meta['channel_name']}")
    print(f"Number of messages: {meta['num_messages']}")
    print(f"Messages preview: {channel_doc.content[:200]}...")
    print("---")

[2024-08-20 17:07:31] [INFO    ] discord.client: logging in using static token
[2024-08-20 17:07:34] [INFO    ] discord.gateway: Shard ID None has connected to Gateway (Session ID: 92edb3a8acf6df38664b999619ee7edd).


Channel ID: 1275046109722447915
Channel Name: general
Number of messages: 2
Messages preview: 
Hi this a test...
---
