# YouTube Transcript Processing Pipeline with Cloud and Vector Storage Integration

In [None]:
import os
import re
import pandas as pd
import numpy as np
import torch
from textblob import TextBlob
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
from google.cloud import storage
from pinecone import Pinecone, ServerlessSpec
from flask import jsonify

# Initialize Google Cloud Storage client
storage_client = storage.Client()

# Set bucket and file information
bucket_name = 'elon-musk-chatbot-data'
source_blob_name = 'youtube_transcripts.txt'
destination_file_name = 'youtube_transcripts.txt'

# Pinecone API key and environment
# SECURITY: a real API key is committed in this file. A PINECONE_API_KEY that
# is already set in the environment now takes precedence; the literal is kept
# only as a fallback so existing deployments keep working.
# TODO: rotate this key and delete the literal once every deployment supplies
# PINECONE_API_KEY through its environment / secret manager.
api_key = os.environ.get("PINECONE_API_KEY") or "b8e351ec-57a1-46eb-96eb-31e7ef11fd77"
# Passed as the serverless `region` when the Pinecone index is created.
environment = "us-east-1"
os.environ["PINECONE_API_KEY"] = api_key

# Step 1: Download file from Google Cloud Storage
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Copy a single object out of a GCS bucket into a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Object name inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    source_blob = storage_client.bucket(bucket_name).blob(source_blob_name)
    source_blob.download_to_filename(destination_file_name)
    print(f"Downloaded {source_blob_name} from GCS to local file {destination_file_name}.")

# Step 2: Load and clean the data
def load_and_clean_data(file_name):
    """Read a transcript file and build a DataFrame with raw and cleaned lines.

    Each line of the file becomes one row. Blank lines are kept (their
    cleaned text is the empty string) so row positions match line numbers.

    Args:
        file_name: Path of the text file to read.

    Returns:
        DataFrame with two columns: ``text`` (the raw line, including its
        trailing newline) and ``cleaned_text``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        transcript_text = file.readlines()
    df = pd.DataFrame(transcript_text, columns=["text"])

    # Compiled once instead of being looked up again for every row.
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    whitespace_pattern = re.compile(r'\s+')

    # Cleaning function
    def clean_text(text):
        text = url_pattern.sub('', text)  # Remove URLs
        # Drops everything except ASCII letters and whitespace, which also
        # removes digits, punctuation and accented characters.
        text = non_letter_pattern.sub('', text)
        text = whitespace_pattern.sub(' ', text).strip()  # Collapse whitespace
        return text

    df['cleaned_text'] = df['text'].apply(clean_text)
    return df

# Step 3: Sentiment analysis
def perform_sentiment_analysis(df):
    """Attach a ``sentiment`` column holding the TextBlob polarity of each row.

    Args:
        df: DataFrame with a ``cleaned_text`` column; it is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``sentiment`` column.
    """
    df['sentiment'] = df['cleaned_text'].apply(
        lambda cleaned: TextBlob(cleaned).sentiment.polarity
    )
    return df

# Step 4: TF-IDF feature extraction
def extract_tfidf_features(df):
    """Build a dense TF-IDF table over unigrams and bigrams of the cleaned text.

    The vocabulary is capped at 1000 features.

    Args:
        df: DataFrame with a ``cleaned_text`` column.

    Returns:
        A new DataFrame with one row per input row and one column per
        vocabulary term.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
    dense_scores = tfidf.fit_transform(df['cleaned_text']).toarray()
    return pd.DataFrame(dense_scores, columns=tfidf.get_feature_names_out())

# Step 5: Word2Vec embeddings
def generate_word2vec_embeddings(df):
    """Train a Word2Vec model on the whitespace-tokenized cleaned text.

    Args:
        df: DataFrame with a ``cleaned_text`` column of strings.

    Returns:
        The trained ``Word2Vec`` model (100-dimensional vectors, window of 5,
        every word kept because ``min_count`` is 1).
    """
    token_lists = df['cleaned_text'].apply(str.split)
    return Word2Vec(
        sentences=token_lists,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
    )

# Step 6: BERT embeddings
def generate_bert_embeddings(df):
    """Add a ``bert_embedding`` column with one BERT vector per cleaned line.

    Loads the pretrained ``bert-base-uncased`` tokenizer and model, encodes
    each row of ``cleaned_text`` separately (truncated to 512 tokens) and
    stores the hidden state of the first token as a flat numpy array.

    Args:
        df: DataFrame with a ``cleaned_text`` column; it is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``bert_embedding`` column.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()

    def get_bert_embedding(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True)
        # No gradients are needed here; skipping autograd bookkeeping avoids
        # building a graph for every row and lowers memory use.
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of the sequence is the first token of the encoded input.
        embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
        return embedding

    df['bert_embedding'] = df['cleaned_text'].apply(get_bert_embedding)
    return df

# Step 7: Upload to Google Cloud Storage
def upload_to_gcs(bucket_name, local_file_path, destination_blob_name):
    """Send a local file to a GCS bucket under the given object name.

    Args:
        bucket_name: Name of the target bucket.
        local_file_path: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.
    """
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(local_file_path)
    print(f"File {local_file_path} uploaded to {destination_blob_name} in bucket {bucket_name}.")

# Step 8: Store embeddings in Pinecone
def store_embeddings_in_pinecone(df, batch_size=100):
    """Upsert every row's BERT embedding into the Pinecone index.

    The index ``elon-musk-embeddings-youtube`` is created (768 dimensions,
    cosine metric, AWS serverless in the configured region) if it is not
    listed yet. Each vector's id is the row's index label as a string.

    Args:
        df: DataFrame with a ``bert_embedding`` column of 1-D numeric vectors.
        batch_size: Number of vectors sent per upsert request. Must be at
            least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    pc = Pinecone(api_key=api_key)

    index_name = "elon-musk-embeddings-youtube"
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region=environment)
        )
    index = pc.Index(index_name)

    # Send vectors in batches: one request per row means one network round
    # trip per transcript line.
    pending = []
    for row_id, embedding in df['bert_embedding'].items():
        # Plain Python floats keep the payload independent of numpy types.
        pending.append((str(row_id), np.asarray(embedding).tolist()))
        if len(pending) >= batch_size:
            index.upsert(vectors=pending)
            pending = []
    if pending:  # flush the final partial batch; never send an empty request
        index.upsert(vectors=pending)
    print("Embeddings stored in Pinecone successfully!")

# Main function
def process_transcripts(request):
    """Run the whole transcript pipeline and report completion.

    Stages, in order: download the transcript from GCS, clean it and score
    sentiment, then compute TF-IDF features, Word2Vec vectors and BERT
    embeddings. Each stage's output is written to a local file and uploaded
    to the bucket before the next stage runs; the BERT embeddings are finally
    stored in Pinecone.

    Args:
        request: Incoming request object; it is not read by this function.

    Returns:
        A ``(response, status)`` tuple: a JSON success message and ``200``.
    """
    download_from_gcs(bucket_name, source_blob_name, destination_file_name)

    transcripts = perform_sentiment_analysis(load_and_clean_data(destination_file_name))

    # Cleaned text + sentiment (local file name doubles as the blob name)
    sentiment_csv = 'cleaned_sentiment_youtube.csv'
    transcripts[['cleaned_text', 'sentiment']].to_csv(sentiment_csv, index=False)
    upload_to_gcs(bucket_name, sentiment_csv, sentiment_csv)

    # TF-IDF features
    tfidf_table = extract_tfidf_features(transcripts)
    tfidf_csv = 'tfidf_features_youtube.csv'
    tfidf_table.to_csv(tfidf_csv, index=False)
    upload_to_gcs(bucket_name, tfidf_csv, tfidf_csv)

    # Word2Vec vectors, one "<word> <v1> <v2> ..." line per vocabulary entry
    keyed_vectors = generate_word2vec_embeddings(transcripts).wv
    w2v_txt = 'word2vec_vectors_youtube.txt'
    with open(w2v_txt, 'w') as out:
        out.writelines(
            f"{token} {' '.join(map(str, keyed_vectors[token]))}\n"
            for token in keyed_vectors.index_to_key
        )
    upload_to_gcs(bucket_name, w2v_txt, w2v_txt)

    # BERT embeddings
    transcripts = generate_bert_embeddings(transcripts)
    bert_csv = 'bert_embeddings_youtube.csv'
    transcripts[['cleaned_text', 'bert_embedding']].to_csv(bert_csv, index=False)
    upload_to_gcs(bucket_name, bert_csv, bert_csv)

    # Vector store
    store_embeddings_in_pinecone(transcripts)

    return jsonify(message="Processing and upload completed successfully."), 200




1. **Data Download:**
   - Downloads a transcript file from a specified Google Cloud Storage bucket.

2. **Data Cleaning:**
   - Cleans the text data by removing URLs, every character that is not an ASCII letter or whitespace (so digits and punctuation are dropped too), and redundant whitespace.

3. **Sentiment Analysis:**
   - Analyzes the sentiment polarity of each transcript line using TextBlob.

4. **Feature Extraction:**
   - Extracts key features using:
     - **TF-IDF:** Generates n-gram-based term frequency-inverse document frequency features.
     - **Word2Vec:** Computes vector representations of words in the transcript.
     - **BERT:** Generates contextual embeddings for each line of text using a pretrained BERT model.

5. **Cloud Integration:**
   - Saves cleaned data, sentiment data, TF-IDF features, Word2Vec vectors, and BERT embeddings to Google Cloud Storage.

6. **Vector Storage:**
   - Stores BERT embeddings in a Pinecone index for efficient similarity search.

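7. **Incremental Pipeline:**
   - Writes each stage's output to a local file and uploads it to Google Cloud Storage before the next stage starts, so results from earlier stages are already stored if a later stage fails.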


