In [1]:
!pip install -qU \
    openai \
    pinecone-client \
    langchain \
    langchain-community \
    tiktoken \
    datasets



In [2]:
from youtube_transcript_api import YouTubeTranscriptApi

def get_video_transcript(video_id):
    """
    Fetch transcript for a given YouTube video ID.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = " ".join([item['text'] for item in transcript if 'text' in item])
        return transcript_text
    except Exception as e:
        print(f"Error retrieving transcript for video {video_id}: {e}")
        return None

# Example usage with a list of video IDs
video_ids = ['2u4ItZerRac', 'I2zF1I60hPg', '8xqSF-uHCUs', 'LtmS-c1pChY', 'sJNxT-I7L6s']
transcripts = {}

# Fetch the transcript for each video
for video_id in video_ids:
    transcript = get_video_transcript(video_id)
    if transcript:
        transcripts[video_id] = transcript
    else:
        print(f"No transcript available for video {video_id}")

# Display the fetched transcripts
for video_id, transcript in transcripts.items():
    print(f"Transcript for {video_id}:")
    print(transcript[:500])

Transcript for 2u4ItZerRac:
[Music] the pros and cons of living in Germany today we're going to talk about the pros and cons of living in Germany Germany is an amazing country with a strong economy lots of culture great infrastructure and friendly people but that doesn't mean it's the right place for everyone in this video we'll look at all aspects of life in Germany so you can decide if it's the right move for you so let's get started Pros living in Germany is a great experience it offers many advantages both long and sho
Transcript for I2zF1I60hPg:
should you move to Germany that's an easy question to a hard answer but I'm going to try my [Music] best hi my name is Fernando I have been living in Munich Germany for eight years now and yeah I have a lot of experience I say and I'm very happy to kind of like give you some ideas and some stories to will probably help you make a better decision if you should move to Germany or not the first thing of course is that there are many reasons w

In [3]:
from dotenv import load_dotenv, find_dotenv
import os

# Load environment variables from a .env file
_ = load_dotenv(find_dotenv())

# Get API keys from environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [None]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"]="NAME_OF_YOUR_PROJECT"
LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0)
llm.invoke()

In [4]:
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings

# Get API key for OpenAI
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
model_name = 'text-embedding-ada-002'

# Initialize OpenAI Embeddings
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

  embed = OpenAIEmbeddings(


In [5]:
from pinecone import Pinecone

# Get API key for Pinecone
api_key = os.getenv("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")

# Configure Pinecone client
pc = Pinecone(api_key=api_key)

# Define serverless specification
from pinecone import ServerlessSpec
spec = ServerlessSpec(cloud="aws", region="us-east-1")

In [6]:
import time

index_name = "langchain-retrieval-agent"
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # If does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # Dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # Wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to index
index = pc.Index(index_name)
time.sleep(1)
# View index stats
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [7]:
from tqdm.auto import tqdm

batch_size = 100

# Prepare data for upload to Pinecone
data = [{'title': video_id, 'context': transcript, 'id': video_id} for video_id, transcript in transcripts.items()]

for i in tqdm(range(0, len(data), batch_size)):
    # Get end of batch
    i_end = min(len(data), i + batch_size)
    batch = data[i:i_end]
    # First get metadata fields for this batch
    metadatas = [{'title': record['title'], 'text': record['context']} for record in batch]
    # Get the list of contexts / documents
    documents = [record['context'] for record in batch]
    # Create document embeddings
    embeds = embed.embed_documents(documents)
    # Get IDs
    ids = [record['id'] for record in batch]
    # Add everything to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadatas))

index.describe_index_stats()

  0%|          | 0/1 [00:00<?, ?it/s]

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [8]:
pip install -U :class:`~langchain-pinecone

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: ':class:`~langchain-pinecone': Expected package name at the start of dependency specifier
    :class:`~langchain-pinecone
    ^


In [9]:
from langchain.vectorstores import Pinecone

text_field = "text"  # The metadata field that contains our text

# Initialize the vector store object
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

  vectorstore = Pinecone(


In [10]:
query = "what are the pros and cons of living in Germany?"

# Perform similarity search
vectorstore.similarity_search(
    query,  # Our search query
    k=3  # Return 3 most relevant docs
)

[Document(metadata={'title': '2u4ItZerRac'}, page_content="[Music] the pros and cons of living in Germany today we're going to talk about the pros and cons of living in Germany Germany is an amazing country with a strong economy lots of culture great infrastructure and friendly people but that doesn't mean it's the right place for everyone in this video we'll look at all aspects of life in Germany so you can decide if it's the right move for you so let's get started Pros living in Germany is a great experience it offers many advantages both long and shortterm that make it an attractive destination for those considering making the move here are some of the pros of living in Germany one free education for more than 25 years Germany has offered tuition-free higher education at public universities with no charge for most courses and little no cost for administrative fees this means that students coming from all over the world can pursue their degree programs without worrying about expensiv

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# Chat completion LLM
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)

# Retrieval QA chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

qa.run(query)

  llm = ChatOpenAI(
  conversational_memory = ConversationBufferWindowMemory(
  qa.run(query)


'Pros of living in Germany:\n1. Free education: Germany offers tuition-free higher education at public universities, with grants and scholarships available for living expenses.\n2. Drinkable tap water: Most German towns have high-quality tap water that is monitored stringently.\n3. Unemployment benefits: The German government provides generous provisions for unemployment insurance, including support for education and training.\n4. Punctuality: Germans are known for their punctuality, which can be beneficial for efficient living.\n5. Quality of life: Germany offers a safe environment, low crime rate, cultural offerings, and beautiful cities.\n6. Efficient public transport: Germany has well-developed public transport systems with discounts for students and affordable options for all.\n7. Honesty and reliability: Germans are known for their honesty and reliability in personal and professional interactions.\n\nCons of living in Germany:\n1. Bureaucracy: Dealing with paperwork and navigatin

In [12]:
from langchain.agents import Tool

tools = [
    Tool(
        name='Knowledge Base',
        func=qa.run,
        description=(
            'Use this tool when answering general knowledge queries to get '
            'more information about the topic'
        )
    )
]

In [13]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory
)

  agent = initialize_agent(


In [14]:
agent(query)

  agent(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Pros and cons of living in Germany"
}
```[0m
Observation: [36;1m[1;3mPros of living in Germany:
1. Free education: Germany offers tuition-free higher education at public universities for more than 25 years.
2. Drinkable tap water: Germany has high-quality tap water that is safe to drink in most towns.
3. Unemployment benefits: The German government provides generous provisions for unemployment insurance, including benefits and opportunities for education or training.
4. Punctuality: Germans are known for their punctuality, which can be seen as a positive aspect of living in Germany.
5. Quality of life: Germany offers a high quality of life with safety, cultural offerings, and efficient public transport.
6. Efficient public transport: Germany has well-developed and efficient public transport systems with discounts for students and affordable options for all.
7. 

{'input': 'what are the pros and cons of living in Germany?',
 'chat_history': [],
 'output': 'The pros of living in Germany include free education, drinkable tap water, unemployment benefits, punctuality, quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language, and high housing costs.'}

In [15]:
agent("what are the pros and cons of living in Germany?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Pros and cons of living in Germany"
}
```[0m
Observation: [36;1m[1;3mPros of living in Germany:
1. Free education at public universities for more than 25 years.
2. Drinkable tap water in most German towns.
3. Generous unemployment benefits and provisions for unemployment insurance.
4. Germans are known for their punctuality and reliability.
5. Germany offers a high quality of life with safe cities, cultural offerings, and efficient public transport.
6. Efficient public transport systems with discounts for students and those with limited incomes.
7. Germans are fairly honest and reliable in their interactions.

Cons of living in Germany:
1. Bureaucracy can be overwhelming, especially for newcomers.
2. Difficulty in making friends due to cultural differences and language barriers.
3. Bad customer service experiences can range from rudeness to hostility.
4. Bad in

{'input': 'what are the pros and cons of living in Germany?',
 'chat_history': [HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education, drinkable tap water, unemployment benefits, punctuality, quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language, and high housing costs.', additional_kwargs={}, response_metadata={})],
 'output': 'The pros of living in Germany include free education at public universities, drinkable tap water, generous unemployment benefits, punctuality and reliability, high quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in m

In [16]:
agent("what is the topic of the videos?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "topic of the videos"
}
```[0m
Observation: [36;1m[1;3mThe videos discussed the pros and cons of living in Germany, the alternative for Germany (AfD) party's radical new approach to immigration policy, and the potential implications of the proposed immigration restrictions in Germany.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The pros of living in Germany include free education at public universities, drinkable tap water, generous unemployment benefits, punctuality and reliability, high quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language learning, and high housing costs."
}
```[0m

[1m> Finished chain.[0m


{'input': 'what is the topic of the videos?',
 'chat_history': [HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education, drinkable tap water, unemployment benefits, punctuality, quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language, and high housing costs.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education at public universities, drinkable tap water, generous unemployment benefits, punctuality and reliability, high quality of life, efficient public transport, an

In [17]:
agent("can you tell me some facts about living in Germany?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Facts about living in Germany"
}
```[0m
Observation: [36;1m[1;3mLiving in Germany offers many advantages, such as free education at public universities for more than 25 years, drinkable tap water in most areas, unemployment benefits that support individuals in finding work or continuing education, punctual and reliable people, a high quality of life with safe cities, efficient public transport systems, and honest and reliable individuals. However, there are also challenges like bureaucracy, difficulty in making friends due to cultural differences, bad customer service in some instances, slow internet connection, lack of air conditioning in many places, and the tough German language to learn. Germany is actively seeking skilled immigrants to address its aging population and workforce needs, but the country is also facing political debates and pressures regarding

{'input': 'can you tell me some facts about living in Germany?',
 'chat_history': [HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education, drinkable tap water, unemployment benefits, punctuality, quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language, and high housing costs.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education at public universities, drinkable tap water, generous unemployment benefits, punctuality and reliability, high quality of life, efficient p

In [18]:
agent("can you summarize these facts in two short sentences")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Living in Germany offers benefits like free education, reliable public services, and high quality of life, but challenges include bureaucracy, high housing costs, and language barriers."
}
```[0m

[1m> Finished chain.[0m


{'input': 'can you summarize these facts in two short sentences',
 'chat_history': [HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education, drinkable tap water, unemployment benefits, punctuality, quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language, and high housing costs.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education at public universities, drinkable tap water, generous unemployment benefits, punctuality and reliability, high quality of life, efficient 

In [19]:
agent("can you explain what immigrants faces in germany?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Challenges faced by immigrants in Germany"
}
```[0m
Observation: [36;1m[1;3mSome challenges faced by immigrants in Germany include navigating the complex bureaucracy, difficulty in finding affordable housing, language barriers, discrimination, difficulties in making friends, and potential issues with integration into German society. Additionally, immigrants may face challenges in finding suitable employment, accessing healthcare, and dealing with cultural differences.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Living in Germany offers benefits like free education, reliable public services, and high quality of life, but challenges include bureaucracy, high housing costs, and language barriers."
}
```[0m

[1m> Finished chain.[0m


{'input': 'can you explain what immigrants faces in germany?',
 'chat_history': [HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education, drinkable tap water, unemployment benefits, punctuality, quality of life, efficient public transport, and honesty and reliability. On the other hand, the cons of living in Germany involve bureaucracy, difficulty in making friends, bad customer service, bad internet connection, lack of air conditioning, tough language, and high housing costs.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='what are the pros and cons of living in Germany?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The pros of living in Germany include free education at public universities, drinkable tap water, generous unemployment benefits, punctuality and reliability, high quality of life, efficient pub

Rouge Evaluation 

In [None]:
from pinecone import Pinecone, ServerlessSpec
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
import evaluate
from nltk.tokenize import sent_tokenize

In [None]:
# Fetch YouTube video transcripts
def get_video_transcript(vid_id):
    try:
        transcript_data = YouTubeTranscriptApi.get_transcript(vid_id)
        transcript_text = " ".join([item['text'] for item in transcript_data if 'text' in item])
        return transcript_text
    except Exception as e:
        print(f"Error retrieving transcript for video {vid_id}: {e}")
        return None

video_ids = ['2u4ItZerRac', 'I2zF1I60hPg', '8xqSF-uHCUs', 'LtmS-c1pChY', 'sJNxT-I7L6s']
transcripts = {}

for vid_id in video_ids:
    trans = get_video_transcript(vid_id)
    if trans:
        transcripts[vid_id] = trans
    else:
        print(f"No transcript available for video {vid_id}")

reference_summaries = {
    '2u4ItZerRac': "Living in Germany offers a strong economy and high quality of life. However, there are challenges like language barriers and integration issues.",
    'I2zF1I60hPg': "Germany has a robust healthcare system and rich cultural experiences, but the cost of living is high.",
    '8xqSF-uHCUs': "Germany provides efficient public services and strong social security, but finding housing can be difficult.",
    'LtmS-c1pChY': "Germany's education system and work-life balance are excellent, but the tax system is complex.",
    'sJNxT-I7L6s': "Germany's vibrant cities and safety are attractive, but cultural differences can be a challenge."
}

# Prepare the dataset
dataset = [{"transcript": transcript, "summary": summary} for transcript, summary in zip(transcripts.values(), reference_summaries.values())]

# Load the tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    model_inputs = tokenizer_small("summarize: " + examples["transcript"], max_length=512, truncation=True, padding="max_length")
    with tokenizer_small.as_target_tokenizer():
        labels = tokenizer_small(examples["summary"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the tokenization function to the dataset
tokenized_dataset = list(map(tokenize_function, dataset))

# Convert the tokenized dataset to a format compatible with the Trainer
class SummarizationDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_dataset):
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        return len(self.tokenized_dataset)

    def __getitem__(self, idx):
        return {key: torch.tensor(val) for key, val in self.tokenized_dataset[idx].items()}

train_dataset = SummarizationDataset(tokenized_dataset)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset  # Using the same dataset for demonstration
)

# Fine-tune the model
trainer.train()

# Generate summaries for the transcripts
def generate_summaries(texts_list, tokenizer, model, max_length=125):
    summaries = []
    for text in texts_list:
        inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).input_ids
        outputs = model.generate(inputs, max_length=max_length, num_beams=2, early_stopping=True)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

generated_summaries = generate_summaries(list(transcripts.values()), tokenizer, model)

# Load ROUGE
rouge_score = evaluate.load("rouge")

# Define the compute_rouge_score function
def compute_rouge_score(generated, reference):
    generated_with_newlines = ["\n".join(sent_tokenize(s.strip())) for s in generated]
    reference_with_newlines = ["\n".join(sent_tokenize(s.strip())) for s in reference]
    return rouge_score.compute(predictions=generated_with_newlines, references=reference_with_newlines, use_stemmer=True)

# Evaluate the generated summaries
generated_summaries_dict = {video_id: summary for video_id, summary in zip(transcripts.keys(), generated_summaries)}
new_rouge_scores = compute_rouge_score(list(generated_summaries_dict.values()), list(reference_summaries.values()))

print(new_rouge_scores)

In [20]:
evaluation_data = [
    {"query": "what are the pros and cons of living in Germany?", "expected": "pros are you can have dual nationality and goverment benefits and so many other things"},
    {"query": "what is the topic of the videos?","expected": "pros and cons of living in germany"},
    {"query": "can you tell me some facts about living in Germany?", "expected": "in germany the transport and health care is good"},
    {"query": "can you summarize these facts in two short sentences", "expected": "good food,transport and healthcare "},
    {"query": "can you explain what immigrants face in Germany?", "expected": "intigration, housing and employment"},
]

In [21]:
import difflib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, data):
    """
    Evaluate the model using the provided data.
    """
    actual_responses = []
    for item in data:
        response = model.run(item["query"])
        actual_responses.append(response)

    # Use difflib to get similarity ratio
    similarities = []
    for actual, expected in zip(actual_responses, [item["expected"] for item in data]):
        similarity = difflib.SequenceMatcher(None, actual, expected).ratio()
        similarities.append(similarity)

    average_similarity = sum(similarities) / len(similarities)
    return {"average_similarity": average_similarity, "similarities": similarities}

# Run evaluation
metrics = evaluate_model(qa, evaluation_data)
print(metrics)

{'average_similarity': 0.10261588042797168, 'similarities': [0.06439024390243903, 0.1870967741935484, 0.08514851485148515, 0.1457286432160804, 0.030715225976305396]}
