In [1]:
import os

# Connection URL of the Redis instance that stores the vector index.
# Read from the REDIS_URL environment variable so the notebook can be run without
# editing this cell; falls back to the original empty placeholder when it is unset.
redis_url = os.environ.get("REDIS_URL", "")
# Name of the Redis index that the ingestion cell below writes the document chunks to.
index_name = "summary"

## Redis Ingestion

In [2]:
from langchain.document_loaders import PyPDFDirectoryLoader

# Folder holding the source PDFs (relative path, resolved against the working directory).
pdf_folder_path = 'pdfs'

# Load the PDFs found in pdf_folder_path; `docs` is what the splitter cell consumes.
loader = PyPDFDirectoryLoader(pdf_folder_path)
docs = loader.load()

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of size 1024 with an overlap of 40 between
# neighbouring chunks (sizes are measured by the splitter's length function —
# presumably characters; confirm against the splitter's defaults).
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1024, chunk_overlap = 40)
all_splits = text_splitter.split_documents(docs)

In [4]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.redis import Redis
# File name of the YAML file that holds the index schema.
schema_name = "summary_schema.yaml"
# Embedding model: no arguments are passed, so the class's default model is used.
embeddings = HuggingFaceEmbeddings()
# Build the Redis vector store from the chunks, using `embeddings`, under index
# `index_name` on the server at `redis_url`. `rds` is reused by the retrieval cells below.
rds = Redis.from_documents(all_splits,
                           embeddings,
                           redis_url=redis_url,
                           index_name=index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Write the index schema to the YAML file named by `schema_name` (set in the ingestion
# cell) rather than repeating the file name as a second literal. Per the original note,
# this index is the one to use for the chatbot deployment.
rds.write_schema(schema_name)

## Imports

In [6]:
from langchain.chains import ConversationChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import HuggingFaceTextGenInference
from langchain.chains import AnalyzeDocumentChain

## Initialize the llm

In [7]:
import os

# Endpoint of the text-generation-inference server backing the LLM.
# Read from the INFERENCE_SERVER_URL environment variable so the notebook can be run
# without editing this cell; falls back to the original empty placeholder when unset.
inference_server_url = os.environ.get("INFERENCE_SERVER_URL", "")

# Shared LLM client used by both generate() and summarize().
# Generation settings: at most 512 new tokens per call, low temperature (0.1),
# top-k / top-p / typical-p sampling limits and a repetition penalty of 1.175.
llm = HuggingFaceTextGenInference(
    inference_server_url=inference_server_url,
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.1,
    repetition_penalty=1.175
)




## Generate conversation

#### Performant with Mistral model

In [12]:
def generate(topic, k=4):
    """Draft one blog post per retrieved context chunk for ``topic``.

    The ``k`` chunks most similar to ``topic`` are fetched from the Redis vector
    store ``rds``. Each chunk is inserted into the blog-post prompt on its own and
    sent to ``llm``, so the result contains one generation per retrieved chunk.

    Args:
        topic: Subject of the blog post; also used as the similarity-search query.
        k: Number of chunks to retrieve, and therefore the number of posts that
            are generated. Defaults to 4, the value that used to be hard-coded.

    Returns:
        The list produced by ``LLMChain.apply``: one dict per retrieved chunk,
        with the generated post stored under the ``"text"`` key.
    """
    prompt_template = """Use the context below to write a 400 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "topic"]
    )

    chain = LLMChain(llm=llm, prompt=PROMPT)
    docs = rds.similarity_search(topic, k=k)
    # One prompt input per chunk: every chunk yields its own independent post.
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    return chain.apply(inputs)

In [13]:
# Generate the candidate blog posts (one per retrieved chunk) for the demo topic.
output = generate("Langchain Concepts")

In [14]:
# Keep the text of the third candidate post (index 2 of the per-chunk results).
# NOTE(review): this rebinds `generate`, shadowing the generate() function defined above.
# After this cell runs, `generate` is a str, so re-running the previous cell fails until
# the function cell is executed again. Renaming this variable also requires updating
# every later cell that reads `generate`.
generate = output[2]['text']

In [15]:
# Display the selected blog-post text (at this point `generate` is the string, not the function).
generate

' Understanding Langchain Concepts\n\nUnderstanding Langchain Concepts\n\nLangchain is an open-source library that provides tools for data augmentation in natural language processing (NLP) tasks. It uses generative models to compress longer documents by summarizing them into shorter versions while preserving their meaning. In this blog post, we will explore some of the key concepts behind Langchain and its applications.\n\nData Augmentation\n\nData augmentation is the process of generating new training examples from existing ones. This technique is commonly used in NLP tasks such as text classification, sentiment analysis, and machine translation. By increasing the size of the training dataset, data augmentation helps improve model performance and reduce overfitting.\n\nGenerative Models\n\nGenerative models are a class of machine learning algorithms that generate new samples based on a probability distribution learned from the training data. They differ from discriminative models, whi

## Summary Conversation

#### Performant with Mistral model

In [19]:
def summarize(topic):
    """Query the Redis index with ``topic`` and summarise the answer in German.

    Two chains are assembled on every call:

    * a map-reduce summarisation chain, wrapped in ``AnalyzeDocumentChain``, that
      uses the same "concise summary in German" prompt for the map and the
      combine step;
    * a ``RetrievalQA`` "stuff" chain whose retriever comes from ``rds``.

    ``topic`` is sent through the retrieval chain first; the ``'result'`` entry of
    its response is then fed to the summarisation chain.

    Args:
        topic: Text used as the query for the retrieval QA chain.

    Returns:
        The output of the summarisation chain's ``run`` call.
    """
    summary_template = """Write a concise summary of the following:


    {text}


    CONCISE SUMMARY IN GERMAN:"""
    german_prompt = PromptTemplate(
        template=summary_template,
        input_variables=["text"],
    )

    # Summariser: the same prompt drives both the map and the combine stage.
    document_summarizer = AnalyzeDocumentChain(
        combine_docs_chain=load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=german_prompt,
            combine_prompt=german_prompt,
        )
    )

    # Retrieval QA over the Redis vector store.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=rds.as_retriever(),
    )

    # Answer first, then condense that answer.
    qa_response = qa_chain(topic)
    return document_summarizer.run(qa_response['result'])

In [21]:
# Summarise in German, passing the blog-post text (the string currently bound to
# `generate`) as the query.
summary = summarize(generate)

score_threshold is deprecated. Use distance_threshold instead.score_threshold should only be used in similarity_search_with_relevance_scores.score_threshold will be removed in a future release.


In [22]:
# Display the German summary.
summary

'\n\n    Langchain ist eine öffentliche Quellbibliothek, die Werkzeuge zur Vergrößerung von Daten in natürlicher Sprachverarbeitung (NLP)-Aufgaben anbietet. Sie nutzt generative Modelle, um längere Dokumente durch Zusammenfassungen in kürzere Versionen zu kompressieren, ohne deren Bedeutung zu verlieren. In diesem Blogpost werden einige Schlüsselkonzepte hinter Langchain und ihre Anwendungen erörtert.\n\n    Datenerweiterung\n\n    Datenerweiterung ist die Prozedur von existierenden Beispielen neuen Trainingsbeispielen zu generieren. Diese Technik wird häufig in NLP-Aufgaben wie Textklassifizierung, Stimmungsanalyse und Übersetzung verwendet. Durch den Erhöhung des Trainingsdatasets helft Datenerweiterung die Modellperformance zu verbessern und Überfitting zu reduzieren.\n\n    Generative Modelle\n\n    Generative Modelle sind eine Klasse Maschinenlernung-Algorithmus, die neue Proben auf einer Wahrscheinlichkeitsverteilung aus dem Trainingdatum lernen und dann auf dieser Basis neue Sam

## Document Generation

In [23]:
# Combine the English blog post and its German summary into one text document.
# `generate` here is the blog-post string selected earlier, not the generate() function.
doc_gen = f'Blog Post: {generate} \n\n\n\n\n Blog Post Summary in German: {summary}'

In [24]:
# print() is used so the embedded newlines render as line breaks instead of "\n" escapes.
print(doc_gen)

Blog Post:  Understanding Langchain Concepts

Understanding Langchain Concepts

Langchain is an open-source library that provides tools for data augmentation in natural language processing (NLP) tasks. It uses generative models to compress longer documents by summarizing them into shorter versions while preserving their meaning. In this blog post, we will explore some of the key concepts behind Langchain and its applications.

Data Augmentation

Data augmentation is the process of generating new training examples from existing ones. This technique is commonly used in NLP tasks such as text classification, sentiment analysis, and machine translation. By increasing the size of the training dataset, data augmentation helps improve model performance and reduce overfitting.

Generative Models

Generative models are a class of machine learning algorithms that generate new samples based on a probability distribution learned from the training data. They differ from discriminative models, which