<a href="https://colab.research.google.com/github/mkozak591/workshop_pythonai_project/blob/main/AI_w_Python_Test3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies: Google API client, python-docx (Word documents), Transformers (NLP models), FAISS, sentence-transformers, LangChain, and youtube-transcript-api
%pip install google-api-python-client python-docx transformers
%pip install faiss-cpu
%pip install sentence-transformers
%pip install langchain
%pip install -U langchain-community
%pip install youtube-transcript-api

import functools
import os

import faiss
import numpy as np
from docx import Document
from googleapiclient.discovery import build
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound

# Helper to fetch one video's transcript as plain text (None when no transcript is available)
def _fetch_transcript_text(video_id):
    """Return the transcript of ``video_id`` joined into a single string.

    Returns:
        The transcript text, or None when transcripts are disabled for the
        video or no transcript could be found.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except (TranscriptsDisabled, NoTranscriptFound):
        return None
    return ' '.join(t['text'] for t in transcript)


# Function to obtain YT vid descriptions and transcripts with a high number of likes
def get_youtube_videos(api_key, search_query, max_results=0, min_likes=0):
    """Search YouTube and collect videos that have at least ``min_likes`` likes.

    Args:
        api_key: YouTube Data API v3 developer key.
        search_query: Free-text query passed to the search endpoint.
        max_results: Maximum number of videos to return. With the default of
            0 no request is made and an empty list is returned.
        min_likes: Minimum like count a video needs to be included.

    Returns:
        A list of at most ``max_results`` dicts with the keys ``title``,
        ``description``, ``url``, ``likes`` and ``transcript`` (``transcript``
        is None when the video has no retrievable transcript). If an API
        error occurs, the error is printed and the videos collected so far
        are returned.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)
    videos = []
    next_page_token = None

    while len(videos) < max_results:
        try:
            # Search for videos matching the query
            response = youtube.search().list(
                q=search_query,
                part='snippet',
                maxResults=min(max_results - len(videos), 50),  # Limit request to max needed results
                type='video',
                pageToken=next_page_token,
            ).execute()

            items = response.get('items', [])
            video_ids = [item['id']['videoId'] for item in items]

            # Fetch the statistics for the whole page in ONE request
            # (videos.list accepts a comma-separated id list) instead of
            # issuing one request per search hit.
            like_counts = {}
            if video_ids:
                stats_response = youtube.videos().list(
                    part='statistics',
                    id=','.join(video_ids),
                ).execute()
                like_counts = {
                    entry['id']: int(entry['statistics'].get('likeCount', 0))
                    for entry in stats_response.get('items', [])
                }

            for item in items:
                # Never return more videos than the caller asked for
                if len(videos) >= max_results:
                    break

                video_id = item['id']['videoId']
                if video_id not in like_counts:
                    continue  # No statistics were returned for this video

                like_count = like_counts[video_id]
                if like_count < min_likes:
                    continue  # Does not meet the like count criteria

                videos.append({
                    'title': item['snippet']['title'],
                    'description': item['snippet']['description'],
                    'url': f"https://www.youtube.com/watch?v={video_id}",
                    'likes': like_count,
                    'transcript': _fetch_transcript_text(video_id),
                })

            next_page_token = response.get('nextPageToken')  # Get next page token for pagination

            if not next_page_token:
                break  # Exit loop if no more pages are available

        except Exception as e:
            # Best-effort: report the problem and return what was collected so far
            print(f"An error occurred: {e}")
            break  # Exit loop on error

    return videos

# Helper that builds the summarization pipeline once and reuses it on later calls
@functools.lru_cache(maxsize=1)
def _get_summarizer():
    """Return the shared BART summarization pipeline, creating it on first use."""
    return pipeline('summarization', model="facebook/bart-large-cnn")


# Function to summarize text using a pre-trained summarization model
def summarize_text(text):
    """Summarize ``text`` with the facebook/bart-large-cnn model.

    Args:
        text: The text to summarize (a transcript or a description).

    Returns:
        The generated summary string, or an empty string when ``text`` is
        None, empty, or only whitespace (the model is not invoked then).
    """
    if not text or not text.strip():
        return ''

    summarizer = _get_summarizer()  # Loading the model is expensive, so it is cached
    # truncation=True clips inputs that exceed the model's maximum input
    # length; long transcripts would otherwise make the model fail.
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False, truncation=True)
    return summary[0]['summary_text']

# Function to create a Word document with video summaries
def create_word_document(videos, summaries, filename='summaries.docx'):
    """Write one section per video (title, URL, likes, description, summary) to a Word file.

    Args:
        videos: Sequence of video dicts providing the keys ``title``, ``url``,
            ``likes`` and ``description``.
        summaries: Summary strings, paired with ``videos`` by position. If the
            two sequences differ in length, the extra entries are ignored
            (``zip`` stops at the shorter one).
        filename: Path of the .docx file to write.
    """
    doc = Document()  # Create a new Word document
    doc.add_heading('YouTube Video Summaries', 0)  # Add a title heading

    for video, summary in zip(videos, summaries):
        doc.add_heading(video['title'], level=1)  # Add video title as a heading
        doc.add_paragraph(f"Video URL: {video['url']}")  # Add video URL
        doc.add_paragraph(f"Likes: {video['likes']}")  # Add number of likes
        doc.add_paragraph(f"Description: {video['description']}")  # Add video description
        doc.add_paragraph(f"Summary: {summary}")  # Add summary of the transcript or description
        doc.add_paragraph()  # Add a blank line

    doc.save(filename)  # Save the document to a file

# Function to create a vector store using LangChain and FAISS
def create_vector_store(summaries):
    """Embed ``summaries`` and index them in a FAISS vector store.

    Args:
        summaries: The texts to embed and index.

    Returns:
        The FAISS vector store built from ``summaries`` using the
        sentence-transformers/all-MiniLM-L6-v2 embedding model.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )
    return FAISS.from_texts(summaries, embedding_model)

# Function to search in the vector store for similar summaries
def search_vector_store(vector_store, query):
    """Return the entries of ``vector_store`` that are most similar to ``query``.

    Args:
        vector_store: Object exposing a ``similarity_search(query)`` method.
        query: The text to search for.

    Returns:
        Whatever ``vector_store.similarity_search(query)`` returns.
    """
    return vector_store.similarity_search(query)

# Function to initialize the LLM (Large Language Model) for text generation (distilgpt2, a small, freely available distilled GPT-2 model)
def initialize_llm():
    """Load the distilgpt2 tokenizer and causal language model.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the "distilgpt2"
        checkpoint.
    """
    checkpoint = "distilgpt2"
    return (
        AutoTokenizer.from_pretrained(checkpoint),  # Tokenizer first, as before
        AutoModelForCausalLM.from_pretrained(checkpoint),
    )

# Function to generate text based on a prompt using the LLM
def generate_text(prompt, tokenizer, model):
    """Generate a continuation of ``prompt`` with the given language model.

    Args:
        prompt: Text the model should continue.
        tokenizer: Tokenizer matching ``model``; called with the prompt and
            used to decode the generated ids.
        model: Model exposing ``generate``.

    Returns:
        The decoded text of the first generated sequence (special tokens
        removed). ``max_length=150`` bounds prompt plus generated tokens.
    """
    inputs = tokenizer(prompt, return_tensors="pt")  # Tokenize the prompt

    # GPT-2 style tokenizers define no padding token; fall back to the
    # end-of-sequence id so generate() does not have to guess one.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Pass the mask explicitly instead of letting it be inferred
        pad_token_id=pad_token_id,
        max_length=150,
        num_return_sequences=1,
    )  # Generate text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)  # Decode the generated text

# Helper to read a whole number from the user, asking again until the input is valid
def _prompt_int(prompt, minimum=0):
    """Prompt until the user enters an integer that is at least ``minimum``."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            print(f"'{raw}' is not a whole number. Please try again.")
            continue
        if value < minimum:
            print(f"Please enter a value of at least {minimum}.")
            continue
        return value


# Main function to orchestrate the workflow
def main():
    """Run the interactive search, summarize, export, and generate workflow.

    Reads the YouTube Data API key from the ``YOUTUBE_API_KEY`` environment
    variable (prompting for it without echo when unset), asks for the search
    parameters until enough videos are found, writes the summaries to a Word
    document, indexes them in a vector store, and prints generated text for
    the summaries most similar to the search query.
    """
    from getpass import getpass

    # Never hard-code the API key in the source: anyone who can read the
    # file could use it. Take it from the environment or ask for it.
    api_key = os.environ.get('YOUTUBE_API_KEY', '').strip()
    if not api_key:
        api_key = getpass("Enter your YouTube Data API key: ").strip()

    while True:
        # Prompt user for search query, max results, and minimum likes
        search_query = input("Enter the search query: ")
        # At least one result is required; an empty result set cannot be indexed later
        max_results = _prompt_int("Enter the maximum number of results: ", minimum=1)
        min_likes = _prompt_int("Enter the minimum number of likes a video must have: ")

        videos = get_youtube_videos(api_key, search_query, max_results, min_likes)

        if len(videos) >= max_results:
            break  # Exit loop if the required number of videos are found

        print(f"Found {len(videos)} videos, which is less than the requested {max_results} results.")
        print("Please adjust your search criteria or increase the number of results.")

    # Summarize each video's transcript, falling back to its description
    summaries = [summarize_text(video['transcript'] or video['description']) for video in videos]

    create_word_document(videos, summaries)  # Create a Word document with summaries

    # Create a vector store for the summaries
    vector_store = create_vector_store(summaries)

    # Retrieve similar summaries from the vector store
    results = search_vector_store(vector_store, search_query)

    # Initialize LLM for further processing
    tokenizer, model = initialize_llm()

    for result in results:
        if isinstance(result, dict):
            result_text = result.get('text', '')
        else:
            # Vector store hits carry their text in ``page_content``; only
            # fall back to str() for objects without that attribute.
            result_text = getattr(result, 'page_content', None)
            if result_text is None:
                result_text = str(result)

        print(f"Result: {result_text}")
        generated_text = generate_text(result_text, tokenizer, model)  # Generate text based on the result
        print(f"Generated text based on result: {result_text}")
        print(generated_text)

# Run the main function only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()




KeyboardInterrupt: Interrupted by user