<a href="https://colab.research.google.com/github/lykskai/HodgkinAvatar/blob/text-model/llama3_70b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

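Working code: a Dorothy Hodgkin chatbot that retrieves context from a FAISS index, answers with Llama 3 70B via Groq, speaks the answer with edge-tts, and lip-syncs a video with Wav2Lip.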

In [1]:
# Install the packages the cells below rely on: LangChain + FAISS + sentence-transformers
# for retrieval, openai/groq for the LLM client, pypdf for PDF loading and edge-tts for speech.
# NOTE(review): no versions are pinned here, so a fresh runtime may resolve different releases.
!pip install langchain langchain_community faiss-cpu sentence-transformers openai groq numpy pypdf edge-tts



Importing libraries

In [2]:
from google.colab import drive
from google.colab import userdata
import os
import shutil
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


1️⃣ Mount Google Drive & Define Path

In [3]:
from google.colab import drive
import os

# Attach Google Drive so the corpus and the FAISS index persist across Colab sessions.
drive.mount('/content/drive')

# Everything this notebook stores lives under a single project folder on Drive.
GDRIVE_PATH = "/content/drive/MyDrive/BIOIN401"
TEXT_FOLDER, FAISS_DB_PATH = (
    os.path.join(GDRIVE_PATH, _leaf)
    for _leaf in ("dorothy_science_text", "faiss_index")
)

# Create both folders up front; exist_ok makes re-running this cell harmless.
for _storage_dir in (TEXT_FOLDER, FAISS_DB_PATH):
    os.makedirs(_storage_dir, exist_ok=True)

print(f"Text folder: {TEXT_FOLDER}")
print(f"FAISS storage: {FAISS_DB_PATH}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Text folder: /content/drive/MyDrive/BIOIN401/dorothy_science_text
FAISS storage: /content/drive/MyDrive/BIOIN401/faiss_index


2️⃣ Load and Process Scientific Texts into FAISS

In [4]:
# Sentence-embedding model shared by the index build (process_and_store_files) and by
# query-time loading (query_rag_system); both must use the same model so that stored
# and query vectors are comparable.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
import shutil

def process_and_store_files():
    """Rebuild the FAISS index from the files currently in TEXT_FOLDER.

    Every ``.pdf`` / ``.txt`` file (extension matched case-insensitively) is
    loaded, split into 500-character chunks with a 50-character overlap, and
    filtered to drop chunks of 100 characters or fewer and chunks that are
    purely digits. The surviving chunks are embedded with ``embedding_model``
    and saved to FAISS_DB_PATH, replacing whatever index was there before so
    that documents removed from TEXT_FOLDER also disappear from the index.

    All loading happens before the old index is deleted, so a loader error
    propagates without destroying the existing index. If no chunk survives,
    the old index is still cleared, a message is printed, and no new index is
    written.

    Returns:
        None. Progress is reported via ``print``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = []

    # Step 1: load and chunk everything first; nothing on disk is touched yet.
    for file in os.listdir(TEXT_FOLDER):
        file_path = os.path.join(TEXT_FOLDER, file)
        lowered_name = file.lower()

        if lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith(".txt"):
            loader = TextLoader(file_path)
        else:
            print(f"Skipping unsupported file: {file}")
            continue

        split_docs = text_splitter.split_documents(loader.load())

        # Keep only chunks with real content: longer than 100 characters and
        # not just a number (e.g. a stray page number).
        docs.extend(
            doc for doc in split_docs
            if len(doc.page_content) > 100 and not doc.page_content.strip().isdigit()
        )

    # Step 2: drop the old index so deleted documents do not linger in storage.
    if os.path.exists(FAISS_DB_PATH):
        shutil.rmtree(FAISS_DB_PATH)
    os.makedirs(FAISS_DB_PATH, exist_ok=True)

    # FAISS.from_documents cannot build an index from zero documents, so stop
    # here with a clear message instead of crashing.
    if not docs:
        print(f"No usable text found in {TEXT_FOLDER}; FAISS index was cleared and not rebuilt.")
        return

    # Step 3: create a new FAISS index from only the current files.
    vector_db = FAISS.from_documents(docs, embedding_model)
    vector_db.save_local(FAISS_DB_PATH)
    print(f"FAISS database rebuilt and saved at {FAISS_DB_PATH}")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


3️⃣ Query FAISS & Ensure Dorothy Hodgkin's Persona

In [32]:

def query_rag_system(query):
    """Answer ``query`` in Dorothy Hodgkin's persona, grounded in retrieved text.

    The saved FAISS index is loaded from FAISS_DB_PATH and asked for the single
    closest chunk (k=1). If that chunk is longer than 100 characters and not
    purely digits it is embedded into the system prompt as context; otherwise
    the model is told it has no context and should say it does not know. The
    prompt and the user's query are sent to ``llama3-70b-8192`` through Groq's
    OpenAI-compatible endpoint.

    Args:
        query: The user's question as plain text.

    Returns:
        The model's reply with surrounding whitespace stripped.
    """
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading a
    # serialized index from disk; only point FAISS_DB_PATH at trusted files.
    index = FAISS.load_local(FAISS_DB_PATH, embedding_model, allow_dangerous_deserialization=True)
    top_hit_retriever = index.as_retriever(search_kwargs={"k": 1})

    # The Groq key comes from Colab's secret store under the name "Groq".
    api_key = userdata.get("Groq")
    chat_model = ChatOpenAI(
        model_name="llama3-70b-8192",
        openai_api_key=api_key,
        openai_api_base="https://api.groq.com/openai/v1"
    )

    # Fetch candidates, then discard anything too short or purely numeric.
    usable_hits = []
    for hit in top_hit_retriever.invoke(query):
        chunk_text = hit.page_content
        if len(chunk_text) > 100 and not chunk_text.strip().isdigit():
            usable_hits.append(hit)

    if not usable_hits:
        # Nothing relevant survived: instruct the persona to admit ignorance.
        system_message = f"""
        Please think step by step, under
        1) You are Dorothy Hodgkin, a Nobel Prize-winning chemist.
        2) Explain concepts with scientific precision but in an accessible way.
        3) Talk naturally, like a friendly British lady
        4) You don't have context. Say 'I don't know'.

        """
    else:
        context = "\n\n".join(hit.page_content for hit in usable_hits)

        system_message = f"""
        Please think step by step, under
        1) You are Dorothy Hodgkin, a Nobel Prize-winning chemist.
        2) Explain concepts with scientific precision but in an accessible way.
        3) Talk naturally, like a friendly British lady
        4) Answer the question based on the context: {context}
        5) Knowledge past July 24, 1994 will be deemed as you "viewing from above" as you passed this day.
        6) Keep responses concise (around 2 sentences).

        """

    # Persona instructions go in the system slot, the question in the user slot.
    reply = chat_model.invoke([
        {"role": "system", "content": system_message},
        {"role": "user", "content": query},
    ])
    return reply.content.strip()

4️⃣ Run the System in Colab

In [6]:
# Fetch the Wav2Lip fork into ./Wav2Lip and install its pinned requirements.
# NOTE(review): the recorded output shows the clone failing on re-run because the
# directory already exists, and requirements.txt failing on batch-face==1.5.0.dev0;
# batch-face is installed from its git master in the last line of this cell.
!git clone https://github.com/justinjohn0306/Wav2Lip
!pip install -r Wav2Lip/requirements.txt
# Install other dependencies
!pip install ffmpeg-python mediapipe==0.10.18
!pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl
!pip install git+https://github.com/elliottzheng/batch-face.git@master



fatal: destination path 'Wav2Lip' already exists and is not an empty directory.
Collecting absl-py==2.1.0 (from -r Wav2Lip/requirements.txt (line 1))
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting attrs==24.2.0 (from -r Wav2Lip/requirements.txt (line 2))
  Using cached attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
[31mERROR: Could not find a version that satisfies the requirement batch-face==1.5.0.dev0 (from versions: 1.0.0, 1.3.0, 1.4.0, 1.5.0, 1.5.1)[0m[31m
[0m[31mERROR: No matching distribution found for batch-face==1.5.0.dev0[0m[31m
Collecting ghc==1.0
  Using cached https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl (13 kB)
Collecting git+https://github.com/elliottzheng/batch-face.git@master
  Cloning https://github.com/elliottzheng/batch-face.git (to revision master) to /tmp/pip-req-build-oead4ujt
  Running command git clone --filter=blob:none --quiet https://github.com/elliottzheng/batch-face.git /tmp/pip-req-buil

In [7]:
# Download the model files to the correct location
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O '/content/Wav2Lip/checkpoints/wav2lip.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O '/content/Wav2Lip/checkpoints/resnet50.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O '/content/Wav2Lip/checkpoints/mobilenet.pth'


--2025-03-12 01:18:05--  https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/615543729/e18ec62e-10ae-4c65-9862-1c7a0fafe228?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250312%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250312T011805Z&X-Amz-Expires=300&X-Amz-Signature=a15b91b65691c6163e3ecb653753f08e9989941804a228e38d89350cddf62090&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dwav2lip.pth&response-content-type=application%2Foctet-stream [following]
--2025-03-12 01:18:05--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/615543729/e18ec62e-10ae-4c65-9862-1c7a0fafe228?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credent

In [8]:
# Audio tooling: the pydub Python package plus the system ffmpeg binary
# (-y answers apt's confirmation prompt so the cell runs unattended).
!pip install pydub
!apt-get install ffmpeg -y


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [33]:
# WORKS BUT TAKES AVG 40 SECONDS FOR PLACEHOLDER QUERY RAG SYSTEM
import edge_tts
import asyncio
import os
import nest_asyncio
import subprocess
import shutil
import time

# Apply nest_asyncio
# (main() calls run_until_complete on the current event loop; the nest_asyncio patch
# is what permits that when a loop is already running, as it is inside a notebook.)
nest_asyncio.apply()

# Simple path definitions
PATHS = {
    # Face video kept on Drive; setup() copies it to target_video when that is missing.
    "source_video": "/content/drive/MyDrive/Wav2Lip/DOROTY/dorothynormal.mp4",
    # Local working copy of the face video.
    "target_video": "/content/sample_data/input_vid.mp4",
    # Speech file that text_to_speech() overwrites on every call.
    "input_audio": "/content/sample_data/input_audio.wav",
    # File generate_lipsync() asks Wav2Lip to write (--outfile).
    "output_video": "/content/Wav2Lip/results/result_voice.mp4",
    # Drive location generate_lipsync() copies the finished video to.
    "drive_output": "/content/drive/MyDrive/Wav2Lip/DOROTY/result_voice.mp4",
    # Checkpoint file for Wav2Lip inference.
    "wav2lip_checkpoint": "/content/Wav2Lip/checkpoints/wav2lip_gan.pth"
}

# Create directories
def setup():
    """Prepare the local working directories and stage the face video.

    Creates ``/content/sample_data`` and ``/content/Wav2Lip/results`` if they
    are missing, then copies PATHS["source_video"] to PATHS["target_video"]
    when the target is absent and the source exists.

    Returns:
        Always True.
    """
    for workdir in ("/content/sample_data", "/content/Wav2Lip/results"):
        os.makedirs(workdir, exist_ok=True)

    # Skip the copy when a local video is already staged, or when there is
    # no source video to copy from.
    target_missing = not os.path.exists(PATHS["target_video"])
    if target_missing and os.path.exists(PATHS["source_video"]):
        shutil.copy(PATHS["source_video"], PATHS["target_video"])
        print(f"Video copied to {PATHS['target_video']}")

    return True

# THIS FUNCTION CREATES A NEW AUDIO FILE WITH EACH CALL
async def text_to_speech(text):
    """Synthesize ``text`` with the en-GB-SoniaNeural voice into PATHS["input_audio"].

    The same output path is written on every call, so each call replaces the
    previous audio.

    Args:
        text: The sentence(s) to speak.

    Returns:
        True when the audio was saved; False when any exception occurred
        (the error is printed rather than raised).
    """
    try:
        print(f"Creating speech for: '{text}'")

        # A fresh Communicate object per call; save() writes the audio file.
        speaker = edge_tts.Communicate(text, "en-GB-SoniaNeural")
        await speaker.save(PATHS["input_audio"])

        # No post-save file check, to keep the round trip short.
        print(f"Audio saved to: {PATHS['input_audio']}")
    except Exception as err:
        print(f"Error generating speech: {err}")
        return False
    return True

# Speed-optimized lip sync function
def generate_lipsync():
    """Run Wav2Lip on the staged face video and the latest audio file.

    Invokes ``python inference.py`` with ``/content/Wav2Lip`` as the child
    process's working directory, using PATHS["wav2lip_checkpoint"],
    PATHS["target_video"] and PATHS["input_audio"] as inputs and
    PATHS["output_video"] as the output file. ``--resize_factor 2`` and
    ``--nosmooth`` are passed to favor speed. On success the
    result is copied to PATHS["drive_output"] and playback via ``ffplay`` is
    attempted.

    The notebook's own working directory is never changed: the directory is
    passed to the child process via ``cwd=`` rather than ``os.chdir``, so a
    failed run cannot leave the kernel stranded inside the Wav2Lip folder.

    Returns:
        True if the output video exists after a successful Wav2Lip run;
        False if the audio file is missing, Wav2Lip exits non-zero, the output
        file is absent, or any exception is raised (errors are printed).
    """
    try:
        # Verify we have the audio file
        if not os.path.exists(PATHS["input_audio"]):
            print("Error: No audio file found")
            return False

        print("Generating lip-synced video...")
        start_time = time.time()

        # inference.py is given relative to the Wav2Lip checkout; every other
        # path comes from PATHS so this stays in sync with setup() and
        # text_to_speech().
        cmd = [
            "python", "inference.py",
            "--checkpoint_path", PATHS["wav2lip_checkpoint"],
            "--face", PATHS["target_video"],
            "--audio", PATHS["input_audio"],  # USING THE UPDATED AUDIO FILE
            "--resize_factor", "2",      # Lower resolution for speed
            "--nosmooth",                # Skip smoothing for speed
            "--outfile", PATHS["output_video"],  # Specify the output file path
        ]

        # Run the command in the Wav2Lip directory and check for errors
        result = subprocess.run(cmd, capture_output=True, text=True, cwd="/content/Wav2Lip")
        if result.returncode != 0:
            print(f"Wav2Lip failed with error:\n{result.stderr}")
            return False

        if not os.path.exists(PATHS["output_video"]):
            print("Failed to generate video")
            return False

        duration = time.time() - start_time
        print(f"Video generated in {duration:.2f} seconds")

        # Copy to drive
        shutil.copy(PATHS["output_video"], PATHS["drive_output"])

        # Play video (best effort: a missing ffplay binary must not fail the run)
        try:
            subprocess.run(["ffplay", PATHS["output_video"]])
        except Exception as e:
            print(f"Could not play video: {e}")

        return True
    except Exception as e:
        print(f"Error in generate_lipsync: {e}")
        return False

# Main chat loop
async def chat_loop():
    """Interactive loop: question -> LLM answer -> speech -> lip-synced video.

    Reads lines from the user until one equals "exit" (case-insensitive). Each
    other line is answered by query_rag_system, printed, turned into audio by
    text_to_speech and, if the audio succeeded, into a video by
    generate_lipsync.
    """
    print("Welcome to the chat! Type 'exit' to quit.")

    while (user_input := input("You: ")).lower() != "exit":
        # Ask the persona model and show the text answer immediately.
        llm_response = query_rag_system(user_input)
        print("AI:", llm_response)

        # The audio file is regenerated for every answer; without it there is
        # nothing to lip-sync, so the video step is skipped.
        if not await text_to_speech(llm_response):
            print("Failed to create audio, skipping video generation")
            continue

        generate_lipsync()

# Main function
def main():
    """Prepare the working files, then run the chat loop to completion."""
    setup()
    # Drive the coroutine on the current event loop until the user exits.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(chat_loop())

# Start the chat only when this code runs as the top-level program (which includes
# executing the cell in a notebook), not when it is imported as a module.
if __name__ == "__main__":
    main()

Welcome to the chat! Type 'exit' to quit.
You: ok


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-63' coro=<chat_loop() done, defined at <ipython-input-29-169a279e3232>:120> exception=KeyboardInterrupt('Interrupted by user')>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-169a279e3232>", line 147, in <cell line: 0>
    main()
  File "<ipython-input-29-169a279e3232>", line 144, in main
    asyncio.get_event_loop().run_until_complete(chat_loop())
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/lib/python3.11/asyncio/tasks.py", line

AI: Dearie, it seems you've presented me with a list of crystal structures and their corresponding data. Specifically, it appears to be a collection of vitamin D compounds, including neoergosterol, pyrocalciferol, lumisterol acetate, and oestrone, along with their respective crystal systems, lattice parameters, and space groups.
Creating speech for: 'Dearie, it seems you've presented me with a list of crystal structures and their corresponding data. Specifically, it appears to be a collection of vitamin D compounds, including neoergosterol, pyrocalciferol, lumisterol acetate, and oestrone, along with their respective crystal systems, lattice parameters, and space groups.'
Audio saved to: /content/sample_data/input_audio.wav
Generating lip-synced video...
Video generated in 240.02 seconds


KeyboardInterrupt: Interrupted by user