In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
import re

In [2]:
def get_video_id(video_url):
    match = re.search(r"(?<=v=)[\w-]+", video_url)
    return match.group(0) if match else None

In [3]:
def get_video_transcript(video_url):
    video_id = get_video_id(video_url)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        text = ""
        for line in transcript:
            text += line['text'] + " "
        return text.strip()
    except Exception as e:
        print(f"Error getting transcript: {e}")
        return None

In [4]:
data = get_video_transcript("https://www.youtube.com/watch?v=JqQf6-gdEXE")
data

"give us a cricket opinion that you can't say out loud I guess I think Ro should still be the Mumbai Indian Captain I think if you're the Indian T20 Captain you have to be captain in your franchise do you remember a drunk conversation that you had with your England mates about the Indian team we were drunk all the time so we were probably speaking about Sachin what do you make of sachin's statue he deserves it yeah but do you think it looks like him or Steve Smith the bat's skewed in his hands he only ever played the most perfect Drive what's it like being in Mumbai covering the IPL but I look at the I think for a franchise to win you've got to have your Indian players playing well you look at rajastan you ask if SW J fell at the top of the order use you I mean he doesn't really get much of a mention I never understood why RCB got rid of him KY and Rohit Sharma both they've created this identity of the Indian cricket team verac Coy he has given so much love and joy to test match cricke

In [5]:
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
import pandas as pd
import chromadb
from chromadb import EmbeddingFunction
import tqdm
import time
from IPython.display import Markdown
import fitz

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,    
    chunk_overlap = 20,
    length_function = len,
    add_start_index = True
)

In [7]:
texts = text_splitter.create_documents([data])
print(texts[0].page_content)

give us a cricket opinion that you can't say out loud I guess I think Ro should still be the Mumbai Indian Captain I think if you're the Indian T20 Captain you have to be captain in your franchise do you remember a drunk conversation that you had with your England mates about the Indian team we were drunk all the time so we were probably speaking about Sachin what do you make of sachin's statue he deserves it yeah but do you think it looks like him or Steve Smith the bat's skewed in his hands he only ever played the most perfect Drive what's it like being in Mumbai covering the IPL but I look at the I think for a franchise to win you've got to have your Indian players playing well you look at rajastan you ask if SW J fell at the top of the order use you I mean he doesn't really get much of a mention I never understood why RCB got rid of him KY and Rohit Sharma both they've created this identity of the Indian cricket team verac Coy he has given so much love and joy to test match cricket

In [8]:
docs = []

for chunk in texts:
    docs.append(chunk.page_content)

print(docs[:2])

["give us a cricket opinion that you can't say out loud I guess I think Ro should still be the Mumbai Indian Captain I think if you're the Indian T20 Captain you have to be captain in your franchise do you remember a drunk conversation that you had with your England mates about the Indian team we were drunk all the time so we were probably speaking about Sachin what do you make of sachin's statue he deserves it yeah but do you think it looks like him or Steve Smith the bat's skewed in his hands he only ever played the most perfect Drive what's it like being in Mumbai covering the IPL but I look at the I think for a franchise to win you've got to have your Indian players playing well you look at rajastan you ask if SW J fell at the top of the order use you I mean he doesn't really get much of a mention I never understood why RCB got rid of him KY and Rohit Sharma both they've created this identity of the Indian cricket team verac Coy he has given so much love and joy to test match crick

In [9]:
from chromadb.utils import embedding_functions
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from tqdm import tqdm
class MyEmbeddingFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        embeddings = sentence_transformer_ef([docs])
        return embeddings

In [11]:
def create_chroma_db(docs,name):
    chroma_client = chromadb.PersistentClient(path="D:/Mini Projects/TubeChat/models/data-yt")
    if name in chroma_client.list_collections():
        chroma_client.drop_collection(name)
    db = chroma_client.get_or_create_collection(
        name=name, embedding_function=sentence_transformer_ef)
    initial_size = db.count()
    for i, d in tqdm(enumerate(docs), total=len(docs), desc="Creating Chroma DB"):
        db.add(
            documents=d,
            ids=str(i + initial_size)
        )
        time.sleep(0.5)
    return db


def get_chroma_db(name):
    chroma_client = chromadb.PersistentClient(path="D:/Mini Projects/TubeChat/models/data-yt")
    return chroma_client.get_collection(name=name, function=EmbeddingFunction())

In [12]:
db = create_chroma_db(docs, "db_rag")
db.count()

Creating Chroma DB: 100%|██████████| 75/75 [00:46<00:00,  1.62it/s]


75

In [13]:
pd.DataFrame(db.peek(5))

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0,"[-0.020583227276802063, 0.04846305027604103, 0...",,give us a cricket opinion that you can't say o...,,
1,1,"[-0.04560936987400055, 0.0416586734354496, 0.0...",,test match cricket and and when he was the tes...,,
2,10,"[-0.03345503285527229, -0.06322483718395233, -...",,Stokes but flintoff was Flint off yeah I mean ...,,
3,11,"[0.012117676436901093, -0.07399595528841019, -...",,uh since coule so what's the difference in lik...,,
4,12,"[-0.01315551158040762, -0.057236846536397934, ...",,Spinners we've had a couple in in the past but...,,


In [14]:
def get_relevant_passages(query, db, n_results=5):
    passages = db.query(query_texts=[query], n_results=n_results)['documents'][0]
    return passages

In [21]:
ques = "What does the speaker speak about his cricketing career?"
passages = get_relevant_passages(ques, db, n_results=7)

Markdown(passages[0]+passages[1])

um then he got an injury and then he was out of the side he wasn't a part of the 05 team and then 2006 he didn't play and then we went to New Zealand and we got him and broaden to the side they just I never ever felt that either of them were going to go to the the levels that they've achieved it's remarkable the longevity that they they have given the game and um you know waking up as a batter every morning is hard but waking up as a bowler you imagine the bodies what I have a look at is Jimmy he he can't like being at home because he always wants to play cricket and he said something actually um which I saw last week which I think it's exactly the message that all kids have got to understand he said to to be a cricketer you've got to love Fielding so when he first started playing the game I think it was at Burnley Burnley Cricket Club he was just the Fielder diving at Square leg he loves fielding and again I I I I I I didn't like Fielding you couldn't catch so I was always worried inthat he might start to dominate both home and away for for India um for The Amateur Indian cricket fan if our team's traveling to England we look at it as a test for a lot of our youngsters and our test cricket players who've performed well here and over the years on an average we almost always anticipate that we're going to lose a lot of wickets very quickly in a few of those matches at least now I personally believe that any human being is an outcome of his or her own stories and experiences so you white boys in England what what happens to you guys when yall are facing seam bowling constantly in those conditions are you able to play seam bowling fast Bowling better than the subcontinent players like what happens to subcontinent players when they're playing in English conditions like can you break it down a little bit more yeah I mean it's say we we're brought up with the ball swinging you know from the age of four or five when you play the ball swings through the air so you're kind

In [22]:
chats={}

In [23]:
def ollama_llm(question, context, chats):
    formatted_prompt = f'''
    Question: {question}\n\n Chat History :{chats} Context: {context}
    Answer the question in 1250-2500 words strictly based on the context given. If asked explain code. Keep a smooth flow for your answer. If the data is a conversation, explain the individuals before answering the question.If the question is asking for a code then also explain the algorithm.If the question contains equations, explain them. Incase of spelling errors in the context, make spelling corrections in the response.
    If the information is not sufficient then give output as "Info not good to answer".
    '''
    response = ollama.chat(model='mistral', messages=[{'role': 'user', 'content': formatted_prompt}])
    return response['message']['content']

In [18]:
def list_to_string(passages):
    content = ""
    for passage in passages:
        content += passage + "\n"
    return content

In [24]:
answer = ollama_llm(ques,list_to_string(passages),chats)
def chat_history(ques,answer,chats):
    next_id = len(chats)+1
    chats[next_id] = {"question": ques,answer:"answer"}
if len(chats):
    if (chats[len(chats)]['question']!= ques or chats[len(chats)]['question']!=ques):
        chat_history(ques=ques,answer=answer,chats=chats)
    else:
        print("Same")
else:
    chat_history(ques=ques,answer=answer,chats=chats)

In [25]:
Markdown(answer)

 The speaker in this context is having a conversation with someone about their experiences and observations related to cricket careers, specifically focusing on English and Indian cricketers. He shares some of his personal anecdotes about specific players, their dedication to the game, and the challenges they faced during their careers.

Firstly, the speaker mentions that he knew two players, Broad and Anderson, who were initially overlooked by the English cricket team due to injuries. However, they went on to achieve remarkable success in their careers with longevity and consistency. He also shares how both of them loved fielding from a young age and emphasizes its importance in cricket.

Next, he talks about the challenges Indian players face when touring England and performing under constant seam bowling conditions, while also commenting on the reverse situation for English players touring subcontinent countries. He believes that upbringing and experience play significant roles in how cricketers adapt to these conditions.

The speaker then discusses the evolution of cricket careers over the years, mentioning how players nowadays want to be versatile and capable of performing all aspects of the game – batting, bowling, and fielding. He also praises the importance of having all-rounders in a team for balance and morale.

Throughout this conversation, the speaker provides valuable insights into the mindset and challenges faced by cricketers from different backgrounds and conditions, emphasizing the importance of dedication, versatility, and experience. Overall, his observations highlight the complexities and nuances of cricket careers and how various factors contribute to their success or otherwise.

Answer for the question (if it was asked): In this context, no specific question has been asked, so there is no need to provide a detailed answer. However, based on the information provided in the conversation, one could explore topics such as:
- The importance of versatility and adaptability in cricket careers
- How upbringing and experience impact cricket performance
- The role of all-rounders in cricket teams
- Challenges faced by Indian and English cricketers when touring each other's countries.