<a href="https://colab.research.google.com/github/lykskai/HodgkinAvatar/blob/main/llama3_70b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0.1) Pre-requisites: Download the required modules.

In [None]:
!pip install langchain langchain_community faiss-cpu sentence-transformers openai groq numpy pypdf edge-tts



# 0.2) Prerequisites: Importing libraries

In [None]:
from google.colab import drive
from google.colab import userdata
import os
import shutil
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI


# 1) Mount Google Drive and Define Path

In [None]:
from google.colab import drive
import os

# Make the user's Google Drive visible inside the Colab VM
drive.mount('/content/drive')

# Project locations on Drive: the source texts and the saved FAISS index
GDRIVE_PATH = "/content/drive/MyDrive/BIOIN401"
TEXT_FOLDER = os.path.join(GDRIVE_PATH, "dorothy_science_text")
FAISS_DB_PATH = os.path.join(GDRIVE_PATH, "faiss_index")

# Create both folders on first use (a no-op when they already exist)
for _storage_dir in (TEXT_FOLDER, FAISS_DB_PATH):
    os.makedirs(_storage_dir, exist_ok=True)

# Report the resolved locations
print(f"Text folder: {TEXT_FOLDER}", f"FAISS storage: {FAISS_DB_PATH}", sep="\n")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Text folder: /content/drive/MyDrive/BIOIN401/dorothy_science_text
FAISS storage: /content/drive/MyDrive/BIOIN401/faiss_index


# 2) Load and Process Scientific Texts into FAISS
Note: this code only needs to be run once each time new articles are added to the Drive folder.

In [None]:
# Sentence-embedding model used to vectorise text. The same object is passed to
# FAISS both when the index is built (process_and_store_files) and when it is
# loaded again for querying (query_rag_system).
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
import shutil

def process_and_store_files():
    """Rebuild the FAISS index from the files currently in ``TEXT_FOLDER``.

    Every ``.pdf`` and ``.txt`` file (extension matched case-insensitively) is
    loaded, split into 500-character chunks with a 50-character overlap,
    filtered, embedded with ``embedding_model`` and saved to ``FAISS_DB_PATH``.
    The index is rebuilt from scratch so that documents removed from the
    folder also disappear from the index.

    The previous index is deleted only after its replacement has been built in
    memory, so a loader/embedding failure or a folder with no usable text
    leaves the existing index on disk untouched.

    Returns:
        None. Progress is reported with ``print``.
    """
    docs = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    # os.listdir() returns entries in arbitrary order; sorting keeps the chunk
    # order the same from one rebuild to the next.
    for file in sorted(os.listdir(TEXT_FOLDER)):
        file_path = os.path.join(TEXT_FOLDER, file)
        lowered_name = file.lower()

        if lowered_name.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith(".txt"):
            loader = TextLoader(file_path)
        else:
            print(f"Skipping unsupported file: {file}")
            continue

        split_docs = text_splitter.split_documents(loader.load())

        # Keep only chunks longer than 100 characters that are not purely digits
        # (drops page numbers and similar fragments).
        docs.extend(
            doc for doc in split_docs
            if len(doc.page_content) > 100 and not doc.page_content.strip().isdigit()
        )

    if not docs:
        # Nothing to index: building from an empty list is expected to fail, and
        # wiping a working index in that situation would only lose data.
        print(f"No usable documents found in {TEXT_FOLDER}; existing FAISS index left unchanged.")
        return

    # Build the replacement first; only then remove the old index.
    vector_db = FAISS.from_documents(docs, embedding_model)

    if os.path.exists(FAISS_DB_PATH):
        shutil.rmtree(FAISS_DB_PATH)  # removes entries for documents that no longer exist
    os.makedirs(FAISS_DB_PATH, exist_ok=True)

    vector_db.save_local(FAISS_DB_PATH)
    print(f"FAISS database rebuilt and saved at {FAISS_DB_PATH}")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# 3) Query FAISS & Ensure Dorothy Hodgkin's Persona

In [None]:

def query_rag_system(query, k=1, model_name="llama3-70b-8192"):
    """Answer ``query`` in Dorothy Hodgkin's voice, grounded in the FAISS index.

    The saved index at ``FAISS_DB_PATH`` is loaded, the ``k`` most similar
    chunks are retrieved, and chunks that are too short (<= 100 characters) or
    purely numeric are discarded. If any chunk survives, it is embedded in the
    system prompt as context; otherwise the model is told to say it does not
    know. The chat model is reached through Groq's OpenAI-compatible endpoint
    using the Colab secret named ``"Groq"``.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve from FAISS (default 1).
        model_name: Model identifier sent to the Groq endpoint.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
    """
    # NOTE(security): allow_dangerous_deserialization=True permits pickle loading
    # of the stored index data. Only load an index this notebook wrote itself;
    # a tampered index file could execute arbitrary code.
    vector_db = FAISS.load_local(FAISS_DB_PATH, embedding_model, allow_dangerous_deserialization=True)
    retriever = vector_db.as_retriever(search_kwargs={"k": k})

    groq_api_key = userdata.get("Groq")

    groq_llm = ChatOpenAI(
        model_name=model_name,
        openai_api_key=groq_api_key,
        openai_api_base="https://api.groq.com/openai/v1"
    )

    # Retrieve relevant documents from FAISS
    retrieved_docs = retriever.invoke(query)

    # Filter out short and digit-only results at retrieval time
    filtered_docs = [
        doc for doc in retrieved_docs
        if len(doc.page_content) > 100 and not doc.page_content.strip().isdigit()
    ]

    if filtered_docs:
        context = "\n\n".join(doc.page_content for doc in filtered_docs)

        # Dorothy Hodgkin died on 29 July 1994; rule 5 uses that date as the
        # boundary for first-hand knowledge.
        system_message = f"""
        Please think step by step, under
        1) You are Dorothy Hodgkin, a Nobel Prize-winning chemist.
        2) Explain concepts with scientific precision but in an accessible way.
        3) Talk naturally, like a friendly British lady
        4) Answer the question based on the context: {context}
        5) Knowledge past July 29, 1994 will be deemed as you "viewing from above" as you passed this day.
        6) Keep responses concise (around 2 sentences).

        """
    else:
        # No usable context was retrieved: keep the persona but instruct the
        # model to admit it does not know rather than improvise.
        system_message = """
        Please think step by step, under
        1) You are Dorothy Hodgkin, a Nobel Prize-winning chemist.
        2) Explain concepts with scientific precision but in an accessible way.
        3) Talk naturally, like a friendly British lady
        4) You don't have context. Say 'I don't know'.

        """

    # Format the query properly
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": query}
    ]

    # Get the response from the model
    response = groq_llm.invoke(messages)
    return response.content.strip()

# WAV2LIP

In [None]:
!git clone https://github.com/justinjohn0306/Wav2Lip
!pip install -r Wav2Lip/requirements.txt
# Install other dependencies
!pip install ffmpeg-python mediapipe==0.10.18
!pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl
!pip install git+https://github.com/elliottzheng/batch-face.git@master



fatal: destination path 'Wav2Lip' already exists and is not an empty directory.
Collecting absl-py==2.1.0 (from -r Wav2Lip/requirements.txt (line 1))
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting attrs==24.2.0 (from -r Wav2Lip/requirements.txt (line 2))
  Using cached attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
[31mERROR: Could not find a version that satisfies the requirement batch-face==1.5.0.dev0 (from versions: 1.0.0, 1.3.0, 1.4.0, 1.5.0, 1.5.1)[0m[31m
[0m[31mERROR: No matching distribution found for batch-face==1.5.0.dev0[0m[31m
Collecting ghc==1.0
  Using cached https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl (13 kB)
Collecting git+https://github.com/elliottzheng/batch-face.git@master
  Cloning https://github.com/elliottzheng/batch-face.git (to revision master) to /tmp/pip-req-build-msid0e7v
  Running command git clone --filter=blob:none --quiet https://github.com/elliottzheng/batch-face.git /tmp/pip-req-buil

In [None]:
# Download the model files to the correct location
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O '/content/Wav2Lip/checkpoints/wav2lip.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O '/content/Wav2Lip/checkpoints/resnet50.pth'
!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O '/content/Wav2Lip/checkpoints/mobilenet.pth'


--2025-03-12 00:44:25--  https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/615543729/e18ec62e-10ae-4c65-9862-1c7a0fafe228?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250312%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250312T004425Z&X-Amz-Expires=300&X-Amz-Signature=a8fcb3cf4e47751404d18c14307c488e4cd1bf9460b531f75af8450e6e788f4e&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dwav2lip.pth&response-content-type=application%2Foctet-stream [following]
--2025-03-12 00:44:25--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/615543729/e18ec62e-10ae-4c65-9862-1c7a0fafe228?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credent

In [None]:
import edge_tts
import asyncio
import os
import nest_asyncio
import subprocess
import shutil
from pydub import AudioSegment
from pydub.playback import play
from IPython.display import Audio, display

# Patch asyncio so an event loop can be re-entered from inside the notebook
nest_asyncio.apply()

# Working folders: inputs for the TTS/lip-sync step and Wav2Lip's results
for _work_dir in ("/content/sample_data", "/content/Wav2Lip/results"):
    os.makedirs(_work_dir, exist_ok=True)

# Copy the input video from Google Drive to the expected location
def setup_files(
    source_video="/content/drive/MyDrive/Wav2Lip/DOROTY/dorothynormal.mp4",
    target_video="/content/sample_data/input_vid.mp4",
):
    """Copy the avatar's base video to the location the lip-sync step reads from.

    Args:
        source_video: Path of the video to copy (defaults to the file on Drive).
        target_video: Destination path (defaults to the Wav2Lip input location).
            Its parent directory is created when missing.

    Returns:
        True when the copy succeeded; False when the source is missing or the
        copy failed. Failures are reported with ``print`` rather than raised.
    """
    # Check if source video exists
    if not os.path.exists(source_video):
        print(f"[ERROR] Source video not found at {source_video}")
        return False

    try:
        # Ensure the destination folder exists so the copy cannot fail on a fresh VM
        target_dir = os.path.dirname(target_video)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        shutil.copy(source_video, target_video)
    except OSError as e:
        print(f"[ERROR] Failed to copy video: {e}")
        return False

    print(f"[SETUP] Video copied from {source_video} to {target_video}")
    return True

# Function to convert text to speech and save as .wav
async def text_to_speech(text, output_file="/content/sample_data/input_audio.wav"):
    """Synthesise ``text`` with Edge TTS and save it as a WAV file.

    Edge TTS output is first saved to a temporary MP3, which is converted to
    WAV with pydub and then removed. A second copy of the WAV is written to
    the Drive backup location on a best-effort basis: a failure there is
    reported but does not discard the primary file.

    Args:
        text: The text to speak (voice: ``en-GB-SoniaNeural``).
        output_file: Where the WAV file is written.

    Returns:
        ``output_file``, the path of the WAV that was written.
    """
    temp_mp3 = "/content/sample_data/temp_audio.mp3"
    backup_file = "/content/drive/MyDrive/Wav2Lip/DOROTY/output.wav"

    # Ensure the directories exist; dirname is '' for a bare file name, which
    # os.makedirs would reject.
    for needed_dir in (os.path.dirname(output_file), os.path.dirname(temp_mp3)):
        if needed_dir:
            os.makedirs(needed_dir, exist_ok=True)

    try:
        # Generate the audio
        communicate = edge_tts.Communicate(text, "en-GB-SoniaNeural")
        await communicate.save(temp_mp3)

        # Load the MP3 so it can be re-exported as WAV
        audio = AudioSegment.from_mp3(temp_mp3)
    finally:
        # The intermediate MP3 is not needed once it has been loaded (or has failed to load)
        if os.path.exists(temp_mp3):
            os.remove(temp_mp3)

    audio.export(output_file, format="wav")
    print(f"[TTS] Audio saved to: {output_file}")

    # Also save a copy to the Drive location for backup
    try:
        audio.export(backup_file, format="wav")
        print(f"[TTS] Backup audio saved to: {backup_file}")
    except OSError as e:
        print(f"[ERROR] Failed to save backup audio: {e}")

    return output_file

def generate_lipsync():
    """Run Wav2Lip on the prepared video and audio to produce a lip-synced video.

    Reads ``/content/sample_data/input_vid.mp4`` and
    ``/content/sample_data/input_audio.wav`` and expects Wav2Lip to write
    ``/content/Wav2Lip/results/result_voice.mp4``. On success the result is also
    copied to Drive and played with ``ffplay`` (both best-effort).

    Returns:
        The path of the generated video, or ``None`` when an input is missing
        or Wav2Lip did not produce an output file.
    """
    wav2lip_dir = "/content/Wav2Lip"
    input_video = "/content/sample_data/input_vid.mp4"
    input_audio = "/content/sample_data/input_audio.wav"
    output_video = "/content/Wav2Lip/results/result_voice.mp4"

    # Check if input files exist
    for kind, path in (("video", input_video), ("audio", input_audio)):
        if not os.path.exists(path):
            print(f"[ERROR] Input {kind} not found at {path}")
            return None

    print("[Wav2Lip] Generating lip-synced video...")

    # Make sure output directory exists
    os.makedirs(os.path.dirname(output_video), exist_ok=True)

    # Remove the result of any earlier turn: success is detected by the output
    # file existing, so a stale file would make a failed run look successful.
    if os.path.exists(output_video):
        os.remove(output_video)

    # Paths below are relative to the Wav2Lip checkout, where the command runs
    cmd = [
        "python", "inference.py",
        "--checkpoint_path", "checkpoints/wav2lip.pth",  # Using non-GAN model which is faster
        "--face", "../sample_data/input_vid.mp4",
        "--audio", "../sample_data/input_audio.wav",
        "--pads", "0", "0", "0", "0",  # Minimal padding
        "--resize_factor", "1",  # Keep original resolution
        "--nosmooth",  # Already using this for speed
        "--fps", "20"  # Reduce FPS if your original video has higher FPS (optional)
    ]

    print(f"[Wav2Lip] Running command: {' '.join(cmd)}")
    # cwd= runs the script from the Wav2Lip directory without changing the
    # notebook's own working directory, so an interrupt cannot leave it changed.
    result = subprocess.run(cmd, capture_output=True, text=True, cwd=wav2lip_dir)

    if not os.path.exists(output_video):
        print("[ERROR] Wav2Lip did not generate a video. Check logs below:")
        print(result.stdout if result.stdout else '[No stdout]')
        print(result.stderr if result.stderr else '[No stderr]')
        return None

    print(f"[Wav2Lip] Successfully generated: {output_video}")

    # Also copy the result to Google Drive
    drive_output = "/content/drive/MyDrive/Wav2Lip/DOROTY/result_voice.mp4"
    try:
        shutil.copy(output_video, drive_output)
        print(f"[Wav2Lip] Result also saved to: {drive_output}")
    except OSError as e:
        print(f"[ERROR] Failed to copy result to Drive: {e}")

    # NOTE(review): ffplay presumably needs a display; confirm it works in the
    # target environment or show the video inline instead.
    try:
        subprocess.run(["ffplay", output_video])
    except OSError as e:
        print(f"[ERROR] Could not play video: {e}")

    return output_video


# Function to run the LLM -> TTS loop
def chat_loop():
    """Interactive loop: question -> LLM answer -> speech -> lip-synced video.

    Copies the base video into place first and stops if that fails. Each turn
    reads a line from the user, answers it with ``query_rag_system``, converts
    the answer to audio with ``text_to_speech`` and runs ``generate_lipsync``.
    Typing ``exit`` (any case, surrounding spaces ignored) or interrupting the
    prompt with Ctrl-C / end-of-input ends the loop; blank lines are ignored.
    """
    # Setup files before starting the chat
    if not setup_files():
        print("[ERROR] Failed to set up required files. Exiting.")
        return

    print("Welcome to the chat! Type 'exit' to quit.")
    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave the chat cleanly instead of surfacing a traceback
            print("\nExiting chat. Goodbye!")
            break

        if not user_input:
            # A blank line would otherwise trigger a full LLM/TTS/lip-sync round
            continue

        if user_input.lower() == "exit":
            print("Exiting chat. Goodbye!")
            break

        # Generate response using query_rag_system
        llm_response = query_rag_system(user_input)
        print("LLM:", llm_response)

        # Convert response to speech; text_to_speech is a coroutine, so drive it
        # to completion on the event loop
        loop = asyncio.get_event_loop()
        loop.run_until_complete(text_to_speech(llm_response))

        # Generate lip sync video
        generate_lipsync()

        print("[Loop] Awaiting next input...\n")

# Run the loop
# Start the interactive chat when this cell is executed (the recorded output
# below shows the loop starting from this call).
if __name__ == "__main__":
    chat_loop()


[SETUP] Video copied from /content/drive/MyDrive/Wav2Lip/DOROTY/dorothynormal.mp4 to /content/sample_data/input_vid.mp4
Welcome to the chat! Type 'exit' to quit.
You: hi
LLM: Dear friend! I'm so delighted to be attending this meeting of the International Union of Crystallography, it's always a thrill to connect with fellow scientists and catch up on the latest developments in our field. And, of course, I'm especially looking forward to spending time with my dear Chinese friends, it's always a joyous reunion!
[TTS] Audio saved to: /content/sample_data/input_audio.wav
[TTS] Backup audio saved to: /content/drive/MyDrive/Wav2Lip/DOROTY/output.wav
[Wav2Lip] Generating lip-synced video...
[Wav2Lip] Running command: python inference.py --checkpoint_path checkpoints/wav2lip.pth --face ../sample_data/input_vid.mp4 --audio ../sample_data/input_audio.wav --pads 0 0 0 0 --resize_factor 1 --nosmooth --fps 20


KeyboardInterrupt: 