In [1]:
from PyPDF2 import PdfReader
from langchain_text_splitters import CharacterTextSplitter
import chromadb
import together

In [2]:
together.api_key = "<YOUR TOGETHER API KEY HERE>"

In [3]:
# Use reader to turn pdf into text
def extract_text(doc_path : str) -> str:
    reader = PdfReader(doc_path)
    nr_of_pages = len(reader.pages)
    text_string = ''
    for page in reader.pages:
        text_string += page.extract_text()
    nr_of_characters = len(text_string)

    print(f"Nr. of pages in document: {nr_of_pages}")
    print(f"Nr. of characters in document: {nr_of_characters}")
    
    return text_string

In [4]:
text_blob = extract_text('../documents/linux-commands-handbook.pdf')
text_blob

Nr. of pages in document: 135
Nr. of characters in document: 60625


'1T able of Contents\nPreface\nIntroduction to Linux and shells\nman\nls\ncd\npwd\nmkdir\nrmdir\nmv\ncp\nopen\ntouch\nfind\nln\ngzip\ngunzip\ntar\nalias\ncat\nless\ntail\nwc\ngrep\nsort2uniq\ndiff\necho\nchown\nchmod\numask\ndu\ndf\nbasename\ndirname\nps\ntop\nkill\nkillall\njobs\nbg\nfg\ntype\nwhich\nnohup\nxargs\nvim\nemacs\nnano\nwhoami\nwho\nsu\nsudo3passwd\nping\ntraceroute\nclear\nhistory\nexport\ncrontab\nuname\nenv\nprintenv\nConclusion4Preface\nThe Linux Commands Handbook follows the 80/20\nrule: learn in 20% of the time the 80% of a topic.\nI find this approach gives a well-rounded overview .\nThis book does not try to cover  everything under the\nsun related to Linux and its commands. It focuses on\nthe small core commands that you will use the 80% or\n90% of the time, trying to simp lify the usage of the\nmore complex ones.\nAll those commands work on Linux, macOS, WSL,\nand anywhere you have a UNIX environment.\nI hope the contents of this book will help you achieve\nwhat 

In [5]:
def create_text_splitter(chunk_size : int, chunk_overlap : int) -> CharacterTextSplitter:
    t_splitter = CharacterTextSplitter(
        separator = "\n",
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
        length_function = len,
        is_separator_regex = False
    )

    return t_splitter

In [6]:
text_splitter = create_text_splitter(200, 20)
print(type(text_splitter))

<class 'langchain_text_splitters.character.CharacterTextSplitter'>


In [7]:
def create_chunks(text : str) -> list:
    chunks_list = text_splitter.split_text(text)
    nr_of_chunks = len(chunks_list)
    print(f"Nr. of text chunks: {nr_of_chunks}")
    
    return chunks_list

In [8]:
chunks = create_chunks(text_blob)

Nr. of text chunks: 346


In [9]:
def create_collection(doc_chunks : list) -> chromadb.Collection:
    chroma_client = chromadb.Client()
    generated_collection = chroma_client.create_collection(name = 'my_collection')
    generated_collection.add(
        documents = doc_chunks,
        ids=[f"id{n}" for n in range(len(doc_chunks))]
    )

    return generated_collection

In [10]:
collection = create_collection(chunks)

In [11]:
def create_query(question : str, collection : chromadb.Collection) -> dict:
    results_dict = collection.query(
        query_texts = question,
        n_results = 3
    )
    print(results_dict)

    return results_dict

In [12]:
user_input = "What are some of the different Linux distros"
results = create_query(user_input, collection)

{'ids': [['id16', 'id13', 'id21']], 'distances': [[0.630453884601593, 0.8380312919616699, 0.8631106019020081]], 'metadatas': [[None, None, None]], 'embeddings': None, 'documents': [['A "distro " is made by a company or organization and\npackages the Linux core with additional programs and\ntooling.\nFor example you have Debian, Red Hat, and Ubuntu,\nprobably the most popular .', 'what goes into Linux, or the project priorities.6Linux can also be used as your day to day computer . I\nuse macOS because I really enjoy the applications,', "GNU/Linux syste m uses, because GNU/Linux is a free\nalternative to UNIX.\nUNIX  is an umbrella term that groups many\noperating systems used in big corporations and\ninstitutions, starting from the 70's"]], 'uris': None, 'data': None}


In [13]:
for k, v in results.items():
    print(k, v)

ids [['id16', 'id13', 'id21']]
distances [[0.630453884601593, 0.8380312919616699, 0.8631106019020081]]
metadatas [[None, None, None]]
embeddings None
documents [['A "distro " is made by a company or organization and\npackages the Linux core with additional programs and\ntooling.\nFor example you have Debian, Red Hat, and Ubuntu,\nprobably the most popular .', 'what goes into Linux, or the project priorities.6Linux can also be used as your day to day computer . I\nuse macOS because I really enjoy the applications,', "GNU/Linux syste m uses, because GNU/Linux is a free\nalternative to UNIX.\nUNIX  is an umbrella term that groups many\noperating systems used in big corporations and\ninstitutions, starting from the 70's"]]
uris None
data None


In [14]:
print(results['documents'][0])

['A "distro " is made by a company or organization and\npackages the Linux core with additional programs and\ntooling.\nFor example you have Debian, Red Hat, and Ubuntu,\nprobably the most popular .', 'what goes into Linux, or the project priorities.6Linux can also be used as your day to day computer . I\nuse macOS because I really enjoy the applications,', "GNU/Linux syste m uses, because GNU/Linux is a free\nalternative to UNIX.\nUNIX  is an umbrella term that groups many\noperating systems used in big corporations and\ninstitutions, starting from the 70's"]


In [15]:
# Provide background context for the LLM for generating responses
context_prompt = (
    "You are a bot providing answers to queries based only out of the supplied documents. "
    "You will be given information pages. If you can answer the question by using these pages, do so. "
    "Do not give answers to questions that can't be answered using the information pages. "
    )

In [16]:
# Provide specific information for the LLM to base the comment off of

info_page = results['documents'][0]
specific_prompt = [
    f"Information page is {info_page}."
    f"Question is {user_input}"
]

In [17]:
# Construct the final prompt for the LLM based off of the context_prompt and specific_prompt
prompt = f"<s>[INST] <<SYS>>{context_prompt}<</SYS>>\\n\\n"

for specifics in specific_prompt:
        prompt += f"[INST]{specifics}[/INST]"

In [18]:
# Generates the comment with the following parameters
output = together.Complete.create(
    prompt,
    model = "togethercomputer/llama-2-13b-chat",        # Chat model to be used
    max_tokens = 250,                                   # Hard limit on length of comment
    temperature = 0.5,                                  # Measure of comment creativity
    top_k = 90,                                         # Measure of comment diversity
    top_p = 0.8,
    repetition_penalty = 1.1,                           # Slightly discourage repetition between comments
    stop = ['</s>']
)

In [19]:
complete_output = output['output']['choices'][0]['text']
print(complete_output)

  Based on the information provided in the information pages, here are some examples of different Linux distros:

1. Debian
2. Red Hat
3. Ubuntu

These are some of the most popular Linux distros.
