# DeDuping readers/loaders between llama-index and llama-hub

In [1]:
class cd:
    """Context manager for changing the current working directory"""
    def __init__(self, newPath):
        self.newPath = os.path.expanduser(newPath)

    def __enter__(self):
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)

In [2]:
IGNORE_LIST = [
    "__init__.py",
    "__pycache__",
    "loading.py",
    "base.py",
    "README.md"
]

In [3]:
import os
from pathlib import Path
import shutil
from os import listdir
from os.path import isfile, join

In [4]:
llama_index_path = f"/Users/nerdai/Projects/forks/llama_index/llama-index-legacy/llama_index/legacy/readers"
llama_index = [el for el in listdir(llama_index_path) if el not in IGNORE_LIST]
llama_index = sorted(llama_index)

In [5]:
llama_hub_path = f"/Users/nerdai/Projects/llama-hub/llama_hub/"
llama_hub = [el for el in listdir(llama_hub_path) if el not in IGNORE_LIST]
llama_hub = sorted(llama_hub)

In [6]:
common = sorted([e.replace(".py", "") for e in llama_index if e.replace(".py", "") in llama_hub])
common

['bagel',
 'chatgpt_plugin',
 'chroma',
 'database',
 'deeplake',
 'elasticsearch',
 'faiss',
 'file',
 'make_com',
 'metal',
 'milvus',
 'mongo',
 'notion',
 'obsidian',
 'pinecone',
 'qdrant',
 'slack',
 'steamship',
 'string_iterable',
 'twitter',
 'weaviate',
 'web',
 'wikipedia',
 'youtube_transcript']

### LlamaIndex \ LlamaHub

In [7]:
extensions = [el for el in llama_index if el.replace(".py", "") not in common]
extensions

['awadb.py',
 'dashvector.py',
 'discord_reader.py',
 'download.py',
 'github_readers',
 'google_readers',
 'jaguar.py',
 'json.py',
 'mbox.py',
 'myscale.py',
 'pathway.py',
 'psychic.py',
 'redis',
 'schema']

In [8]:
extensions_path = "/Users/nerdai/Projects/forks/llama_index/llama-index-integrations/readers"
print(extensions_path)

/Users/nerdai/Projects/forks/llama_index/llama-index-integrations/readers


In [9]:
import subprocess # just to call an arbitrary command e.g. 'ls'
from llama_index_migration_tools.main import main

run = False

if run:
    handle_manually = []
    for ext in extensions:
        if ".py" not in ext:
            handle_manually.append(ext)
            continue
        name = ext.replace(".py","").replace("_"," ")
        base_file = f"{llama_index_path}/{ext}"
        # enter the directory like this:
        with cd(extensions_path):
            main(integration_name=name, integration_type="readers", prefix=None, base_file=base_file)

In [10]:
handle_manually  # + redis

NameError: name 'handle_manually' is not defined

### LlamaHub \ LlamaIndex

In [11]:
extensions = [el for el in llama_hub if el not in common + ["README.md", "add_loader.sh", "utils.py", "tools", "llama_datasets", "llama_packs","library.json"]]
extensions

['agent_search',
 'airbyte_cdk',
 'airbyte_gong',
 'airbyte_hubspot',
 'airbyte_salesforce',
 'airbyte_shopify',
 'airbyte_stripe',
 'airbyte_typeform',
 'airbyte_zendesk_support',
 'airtable',
 'apify',
 'asana',
 'assemblyai',
 'astra_db',
 'athena',
 'azcognitive_search',
 'azstorage_blob',
 'bilibili',
 'bitbucket',
 'boarddocs',
 'confluence',
 'couchdb',
 'dad_jokes',
 'discord',
 'docstring_walker',
 'docugami',
 'earnings_call_transcript',
 'feedly_rss',
 'feishu_docs',
 'firebase_realtimedb',
 'firestore',
 'github_repo',
 'github_repo_collaborators',
 'github_repo_issues',
 'gmail',
 'google_calendar',
 'google_docs',
 'google_drive',
 'google_keep',
 'google_sheets',
 'gpt_repo',
 'graphdb_cypher',
 'graphql',
 'guru',
 'hatena_blog',
 'hive',
 'hubspot',
 'huggingface',
 'hwp',
 'imdb_review',
 'intercom',
 'jira',
 'joplin',
 'jsondata',
 'kaltura',
 'kibela',
 'lilac_reader',
 'linear',
 'macrometa_gdn',
 'mangadex',
 'mangoapps_guides',
 'maps',
 'memos',
 'microsoft_one

In [12]:
llama_hub_path

'/Users/nerdai/Projects/llama-hub/llama_hub/'

In [14]:
from llama_index_migration_tools.main_hub import main_hub

run = True

handle_manually = []
if run:
    for name in extensions[1:]:
        hub_path = f"{llama_hub_path}/{name}"
        print(llama_hub_path)
        # enter the directory like this:
        with cd(extensions_path):
            try:
                main_hub(integration_name=name, integration_type="readers", hub_path=hub_path)
            except FileNotFoundError as e:
                handle_manually.append(name)

/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub/llama_hub/
/Users/nerdai/Projects/llama-hub

In [15]:
handle_manually  # are likely multi-file

['apify',
 'dad_jokes',
 'docstring_walker',
 'gpt_repo',
 'guru',
 'huggingface',
 'joplin',
 'kaltura',
 'mangadex',
 'memos',
 'minio',
 'openalex',
 'papers',
 'readwise',
 'remote']

### LlamaIndex & LlamaHub

**Github**
- github_readers (llama-index)

**Google**
- google_readers (llama-index)

In [16]:
common

['bagel',
 'chatgpt_plugin',
 'chroma',
 'database',
 'deeplake',
 'elasticsearch',
 'faiss',
 'file',
 'make_com',
 'metal',
 'milvus',
 'mongo',
 'notion',
 'obsidian',
 'pinecone',
 'qdrant',
 'slack',
 'steamship',
 'string_iterable',
 'twitter',
 'weaviate',
 'web',
 'wikipedia',
 'youtube_transcript']