This script processes Twitter and YouTube transcript data to extract embeddings and perform sentiment analysis, enabling downstream use cases such as text-based search or conversational AI. The code integrates several NLP techniques and uses cloud services for storage and deployment.

In [None]:
import os
import re
import logging
import pandas as pd
import numpy as np
import torch
from textblob import TextBlob
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
from google.cloud import storage
from pinecone import Pinecone, ServerlessSpec
from flask import jsonify

# Initialize logging
# INFO level so the progress messages logged by each pipeline step are emitted.
logging.basicConfig(level=logging.INFO)

# Initialize Google Cloud Storage client
# Created once at import time and shared by download_from_gcs / upload_to_gcs.
# NOTE(review): presumably relies on ambient Google credentials — confirm for the deployment target.
storage_client = storage.Client()

# Set bucket and file information
# The same bucket is used both as the source of the raw transcript and as the
# destination for every processed artifact.
bucket_name = 'elon-musk-chatbot-data'
source_blob_name = 'youtube_transcripts.txt'
destination_file_name = 'local_youtube_transcripts.txt'

# Pinecone API key and environment
# NOTE(review): this secret is committed in plain text. It should be rotated and
# supplied from outside the source (environment variable or secret manager).
api_key = "b8e351ec-57a1-46eb-96eb-31e7ef11fd77"
# Used as the `region` of the ServerlessSpec when the Pinecone index is created.
environment = "us-east-1"
# Also export the key through the process environment.
os.environ["PINECONE_API_KEY"] = api_key

# Step 1: Download file from Google Cloud Storage
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Fetch one object from Google Cloud Storage and save it to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name of the object inside the bucket.
        destination_file_name: Local path the object's contents are written to.
    """
    logging.info("Starting file download from Google Cloud Storage.")
    # Resolve bucket -> blob in one chain, then stream the blob to disk.
    source_blob = storage_client.bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_name)
    logging.info(f"Downloaded {source_blob_name} from GCS to local file {destination_file_name}.")

# Step 2: Load and clean the data
def load_and_clean_data(file_name):
    """Read a transcript file line by line and add a normalized text column.

    Each line of the file becomes one row. The raw line (including its
    trailing newline) is kept in ``text``; ``cleaned_text`` holds the same
    line with URLs removed, every character other than ASCII letters and
    whitespace removed, and runs of whitespace collapsed to a single space.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A DataFrame with the columns ``text`` and ``cleaned_text``.
    """
    logging.info("Loading and cleaning data.")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        transcript_text = file.readlines()
    df = pd.DataFrame(transcript_text, columns=["text"])

    # Compile the patterns once instead of re-resolving them for every row.
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    whitespace_pattern = re.compile(r'\s+')

    # Cleaning function
    def clean_text(text):
        text = url_pattern.sub('', text)  # Remove URLs
        text = non_letter_pattern.sub('', text)  # Remove special characters
        text = whitespace_pattern.sub(' ', text).strip()  # Remove extra whitespaces
        return text

    df['cleaned_text'] = df['text'].apply(clean_text)
    logging.info("Data cleaning completed.")
    return df

# Step 3: Sentiment analysis
def perform_sentiment_analysis(df):
    """Attach a ``sentiment`` column with the TextBlob polarity of each row.

    Args:
        df: DataFrame containing a ``cleaned_text`` column.

    Returns:
        The same DataFrame object, with the ``sentiment`` column added in place.
    """
    logging.info("Performing sentiment analysis.")

    def polarity_of(text):
        # TextBlob exposes the score as `.sentiment.polarity`.
        return TextBlob(text).sentiment.polarity

    polarity_scores = df['cleaned_text'].apply(polarity_of)
    df['sentiment'] = polarity_scores
    logging.info("Sentiment analysis completed.")
    return df

# Step 4: TF-IDF feature extraction
def extract_tfidf_features(df):
    """Build a dense TF-IDF feature table from the ``cleaned_text`` column.

    The vectorizer uses unigrams and bigrams and keeps at most 1000 features.

    Args:
        df: DataFrame containing a ``cleaned_text`` column.

    Returns:
        A new DataFrame with one row per input row and one column per
        TF-IDF feature, named after the feature.
    """
    logging.info("Extracting TF-IDF features.")
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
    # fit_transform yields a sparse matrix; densify it for the DataFrame.
    dense_scores = tfidf.fit_transform(df['cleaned_text']).toarray()
    tfidf_df = pd.DataFrame(dense_scores, columns=tfidf.get_feature_names_out())
    logging.info("TF-IDF extraction completed.")
    return tfidf_df

# Step 5: Word2Vec embeddings
def generate_word2vec_embeddings(df):
    """Train a Word2Vec model on the whitespace-tokenized ``cleaned_text`` column.

    Args:
        df: DataFrame containing a ``cleaned_text`` column.

    Returns:
        The trained Word2Vec model (100-dimensional vectors, window of 5,
        every word kept because ``min_count=1``, 4 worker threads).
    """
    logging.info("Generating Word2Vec embeddings.")

    def whitespace_tokens(text):
        return text.split()

    token_lists = df['cleaned_text'].apply(whitespace_tokens)
    trained_model = Word2Vec(
        sentences=token_lists,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
    )
    logging.info("Word2Vec embeddings generated.")
    return trained_model

# Step 6: BERT embeddings
def generate_bert_embeddings(df):
    """Add a ``bert_embedding`` column with one BERT vector per row.

    Each ``cleaned_text`` value is tokenized with ``bert-base-uncased``
    (truncated to 512 tokens) and passed through the model. The hidden
    state at sequence position 0 is flattened to a 1-D NumPy array and
    stored as that row's embedding.

    Args:
        df: DataFrame containing a ``cleaned_text`` column.

    Returns:
        The same DataFrame object, with the ``bert_embedding`` column added
        in place.
    """
    logging.info("Generating BERT embeddings.")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    def get_bert_embedding(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True)
        # This is inference only: disable gradient tracking so no autograd
        # graph is built (and kept in memory) for every row.
        with torch.no_grad():
            outputs = model(**inputs)
        embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
        return embedding

    df['bert_embedding'] = df['cleaned_text'].apply(get_bert_embedding)
    logging.info("BERT embeddings generated.")
    return df

# Step 7: Upload to Google Cloud Storage
def upload_to_gcs(bucket_name, local_file_path, destination_blob_name):
    """Copy a local file into a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        local_file_path: Path of the local file to upload.
        destination_blob_name: Object name to store the file under.
    """
    logging.info(f"Uploading {local_file_path} to Google Cloud Storage.")
    # Resolve bucket -> blob in one chain, then push the file contents.
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(local_file_path)
    logging.info(f"File {local_file_path} uploaded to {destination_blob_name} in bucket {bucket_name}.")

# Step 8: Store embeddings in Pinecone
def store_embeddings_in_pinecone(df):
    """Upsert every row's BERT embedding into the Pinecone index.

    The index ``elon-musk-embeddings-youtube`` is created first if it does
    not exist yet (768 dimensions, cosine metric, serverless on AWS in the
    region given by the module-level ``environment``). Each vector's id is
    the string form of the row's DataFrame index label.

    Args:
        df: DataFrame containing a ``bert_embedding`` column of numeric
            sequences.
    """
    logging.info("Storing embeddings in Pinecone.")
    pc = Pinecone(api_key=api_key)

    index_name = "elon-musk-embeddings-youtube"
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region=environment)
        )
    index = pc.Index(index_name)

    # Send vectors in batches rather than one network round-trip per row.
    batch_size = 100
    batch = []
    for row_id, embedding in zip(df.index, df['bert_embedding']):
        # Plain Python floats serialize independently of the array type.
        batch.append((str(row_id), [float(value) for value in embedding]))
        if len(batch) == batch_size:
            index.upsert(batch)
            batch = []
    if batch:
        # Flush the final, partially filled batch.
        index.upsert(batch)
    logging.info("Embeddings stored in Pinecone successfully.")

# Main function
def process_transcripts(request):
    """Run the whole transcript pipeline and publish every artifact.

    In order: download the raw transcript from GCS, clean it, score
    sentiment, then compute TF-IDF features, Word2Vec vectors and BERT
    embeddings. Each intermediate result is written to a local file and
    uploaded to the same bucket; finally the BERT embeddings are upserted
    into Pinecone.

    Args:
        request: Incoming request object; it is not read by this function.

    Returns:
        A ``(response, status)`` tuple: a JSON success message and 200.
    """
    logging.info("Starting transcript processing.")
    download_from_gcs(bucket_name, source_blob_name, destination_file_name)

    df = load_and_clean_data(destination_file_name)
    df = perform_sentiment_analysis(df)

    # Save cleaned and sentiment data
    cleaned_sentiment_path = 'cleaned_sentiment_youtube.csv'
    df[['cleaned_text', 'sentiment']].to_csv(cleaned_sentiment_path, index=False)
    upload_to_gcs(bucket_name, cleaned_sentiment_path, 'cleaned_sentiment_youtube.csv')

    # TF-IDF
    tfidf_df = extract_tfidf_features(df)
    tfidf_path = 'tfidf_features_youtube.csv'
    tfidf_df.to_csv(tfidf_path, index=False)
    upload_to_gcs(bucket_name, tfidf_path, 'tfidf_features_youtube.csv')

    # Word2Vec: one "<word> <v1> <v2> ..." line per vocabulary entry
    word2vec_model = generate_word2vec_embeddings(df)
    word2vec_path = 'word2vec_vectors_youtube.txt'
    with open(word2vec_path, 'w', encoding='utf-8') as f:
        for word in word2vec_model.wv.index_to_key:
            vector = word2vec_model.wv[word]
            f.write(f"{word} {' '.join(map(str, vector))}\n")
    upload_to_gcs(bucket_name, word2vec_path, 'word2vec_vectors_youtube.txt')

    # BERT embeddings
    df = generate_bert_embeddings(df)
    bert_path = 'bert_embeddings_youtube.csv'
    # Writing the NumPy arrays directly would store their str() form
    # ("[ 0.1 -0.2 ...]", no commas), which ast.literal_eval cannot read back.
    # Export each embedding as a list of Python floats so the CSV cell is a
    # valid list literal. The DataFrame itself keeps the original arrays.
    bert_export = df[['cleaned_text']].copy()
    bert_export['bert_embedding'] = df['bert_embedding'].apply(
        lambda vec: [float(value) for value in vec]
    )
    bert_export.to_csv(bert_path, index=False)
    upload_to_gcs(bucket_name, bert_path, 'bert_embeddings_youtube.csv')

    # Store in Pinecone
    store_embeddings_in_pinecone(df)

    logging.info("Transcript processing and upload completed successfully.")
    return jsonify(message="Processing and upload completed successfully."), 200



1. **Cloud Integration:** Downloads transcript data from Google Cloud Storage and uploads processed files back to the cloud.
2. **Data Cleaning:** Cleans raw text data by removing URLs, special characters, and extra spaces.
3. **Sentiment Analysis:** Computes sentiment polarity for each transcript using TextBlob.
4. **Feature Extraction:** Generates TF-IDF features, Word2Vec embeddings, and BERT embeddings for the transcripts.
5. **Storage:** Saves processed data and embeddings locally and to Google Cloud Storage.
6. **Pinecone Indexing:** Indexes the BERT embeddings in Pinecone (creating the index if it does not exist yet) for efficient similarity search and retrieval.

This Streamlit application enables a chatbot that mimics Elon Musk's conversational style. It leverages precomputed BERT embeddings for efficient context retrieval and generates responses using an NVIDIA-based LLM.



In [None]:
import os
import ast
import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import BertTokenizer, BertModel
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_nvidia_ai_endpoints import ChatNVIDIA

# Set up Streamlit
st.set_page_config(page_title="Musk Bot")
st.header("💬 In Conversation with Elon Musk 🚘🚀")

# Load embeddings from CSV
embeddings_file = "bert_embeddings.csv"  # Path to your embeddings CSV
if not os.path.exists(embeddings_file):
    # Report the path that was actually checked, so the message cannot drift
    # out of sync with the configured file name.
    st.error(f"Embeddings file not found. Please ensure {embeddings_file} is present.")
    # Halt this script run; nothing below can work without the embeddings.
    st.stop()

df = pd.read_csv(embeddings_file)

# The 'bert_embedding' column might be stored as a string representation of a list.
# We'll convert each embedding back to a NumPy array.
def parse_embedding_string(emb_str):
    """Convert the text form of an embedding into a float32 NumPy array.

    Two formats are accepted:

    * a Python list literal, e.g. ``"[0.1, -0.2, 0.3]"``;
    * the ``str()`` form of a NumPy array, e.g. ``"[ 0.1 -0.2\\n  0.3]"``,
      whose values are separated by whitespace rather than commas.

    Args:
        emb_str: String representation of the embedding.

    Returns:
        A NumPy array of dtype float32.

    Raises:
        ValueError, SyntaxError: If the input cannot be parsed in either format.
    """
    try:
        # ast.literal_eval safely evaluates the string to a Python list
        values = ast.literal_eval(emb_str)
    except (ValueError, SyntaxError):
        if not isinstance(emb_str, str):
            # Non-string input (e.g. a missing cell) has no textual fallback.
            raise
        # Fallback for whitespace-separated values: drop the brackets and
        # parse every remaining token as a float.
        tokens = emb_str.replace('[', ' ').replace(']', ' ').split()
        values = [float(token) for token in tokens]
    return np.array(values, dtype=np.float32)

# Turn every stored embedding string into a NumPy vector.
df['embedding_array'] = df['bert_embedding'].apply(parse_embedding_string)

# Extract embeddings and texts
doc_embeddings = np.vstack(df['embedding_array'].values)  # shape: (num_docs, embedding_dim)
# Empty CSV cells are read back as NaN (a float). Replace them with empty
# strings so the texts can be joined with "\n" when the prompt context is built.
doc_texts = df['cleaned_text'].fillna('').astype(str).tolist()

# Load the same BERT model and tokenizer used in code 1 for queries
@st.cache_resource
def _load_bert():
    """Load the BERT tokenizer and model once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading the model weights on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple for ``bert-base-uncased``.
    """
    return (
        BertTokenizer.from_pretrained('bert-base-uncased'),
        BertModel.from_pretrained('bert-base-uncased'),
    )


tokenizer, model = _load_bert()

def embed_text_with_bert(text):
    """Embed a piece of text with the module-level BERT tokenizer and model.

    Args:
        text: The text to embed; it is truncated to at most 512 tokens.

    Returns:
        A 1-D NumPy array: the flattened hidden state at sequence position 0
        (the [CLS] token).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.numpy().flatten()

# Function to compute cosine similarity
def cosine_similarity(a, b):
    """Return the cosine similarity between each row of ``a`` and the vector ``b``.

    A zero-length vector has no direction, so its similarity is reported as
    0 instead of NaN. This matters for ranking: ``np.argsort`` places NaN
    last, which would otherwise make such rows look like the best matches.

    Args:
        a: 2-D array of shape (num_rows, dim).
        b: 1-D array of shape (dim,).

    Returns:
        A 1-D array of shape (num_rows,) with one similarity per row of ``a``.
    """
    row_norms = np.linalg.norm(a, axis=1, keepdims=True)
    # Dividing an all-zero row by 1 leaves it all-zero, so its dot product is 0.
    row_norms[row_norms == 0] = 1
    query_norm = np.linalg.norm(b)
    if query_norm == 0:
        query_norm = 1
    return np.dot(a / row_norms, b / query_norm)

# Define Prompt Template (same as before)
# The template has two placeholders:
#   {question} - the user's question, inserted verbatim
#   {document} - the retrieved transcript excerpts used as context
prompt = PromptTemplate(
    input_variables=["document", "question"],
    template=(
        "You are Elon Musk. Respond to questions like Elon, using his characteristic style: informal, bold, with occasional slang, "
        "thought-provoking statements, and a hint of humor. Speak directly to the question, referencing details from your knowledge "
        "and the content provided below.\n\n"
        "Your current data includes Elon Musk's discussions, reflecting his thoughts on technology, humanity, "
        "space exploration, renewable energy, AI, and sustainability. Use these details to base your responses, adding additional "
        "knowledge when needed.\n\n"
        "Question: {question}\n\n"
        "Relevant Document Excerpts:\n{document}\n\n"
        "Response in Elon Musk's tone, using slang where appropriate:"
    )
)

# Initialize the LLM
# NOTE(review): no API key is passed here; presumably ChatNVIDIA reads its
# credentials from the environment — confirm for the deployment target.
llm = ChatNVIDIA(model="meta/llama-3.2-3b-instruct")
# Bind the prompt template to the LLM so a single call fills in and sends it.
chain = LLMChain(llm=llm, prompt=prompt)

st.write("Ask Elon Musk a question:")

question = st.text_input("Your question")

# Nothing to do until the user has typed something.
if question:
    # Number of transcript excerpts passed to the LLM as context.
    top_k = 5

    # Score every stored document against the embedded question.
    scores = cosine_similarity(doc_embeddings, embed_text_with_bert(question))

    # Indices of the best-scoring documents, highest similarity first.
    best_first = np.argsort(scores)[::-1][:top_k]

    # Build the context: the selected excerpts, one per line.
    excerpts = []
    for doc_index in best_first:
        excerpts.append(doc_texts[doc_index])
    context_block = "\n".join(excerpts)

    # Ask the LLM and show its reply.
    answer = chain.run({"document": context_block, "question": question})
    st.write(answer)



1. **Embedding Loading and Parsing:**
   - Loads precomputed BERT embeddings from a CSV file.
   - Parses embedding strings into NumPy arrays.

2. **Similarity Search:**
   - Computes cosine similarity between user questions and document embeddings to retrieve the most relevant context.

3. **Chatbot Initialization:**
   - Uses Meta's Llama 3.2 3B Instruct model, served through NVIDIA AI Endpoints, to generate responses.
   - Incorporates context from the retrieved documents to provide coherent and relevant answers.

4. **Streamlit Integration:**
   - Offers an interactive user interface for asking questions and viewing Elon Musk-style responses.

This implementation provides a scalable approach to building a personality-driven chatbot by combining BERT for context retrieval with a large language model for response generation.