In [12]:
import pandas as pd

# Load the classification table into `df` from an absolute Windows path.
# NOTE(review): this path is machine-specific and will not resolve elsewhere;
# consider a configurable data directory.
df = pd.read_csv(r'C:\Users\nrosso\Documents\thesis_project\notebooks\Active_Learning\llama_cleaned_final_classification.csv')

In [13]:
# Preview the first rows of the loaded frame to check its columns.
df.head()

Unnamed: 0.1,Unnamed: 0,index,tweet,preprocessed,final_classification,confidence,individual_results,sparse_embedding,dense_embedding,cleaned_classification
0,0,3,God-Man vs. Science-Hero: http://www.gocomics....,godman vs sciencehero,proscience,1.0,{'llama3.1:8b': 'proscience'},"SparseEmbedding(indices=[1998, 2080, 2114, 238...","[-0.019969597458839417, 0.06492504477500916, -...",proscience
1,1,4,#Sciencemob #Proscience The Mass Libel Reform ...,the mass libel reform blog fight for free spee...,neutral,1.0,{'llama3.1:8b': 'neutral'},"SparseEmbedding(indices=[2114, 2375, 2489, 249...","[-0.02676120400428772, -0.005197218619287014, ...",neutral
2,2,5,What is wrong w people?! #proscience RT @NewHu...,what is wrong w people rt and just like that t...,neutral,1.0,{'llama3.1:8b': 'neutral'},"SparseEmbedding(indices=[1059, 2025, 2054, 205...","[0.005084160715341568, -0.016203228384256363, ...",neutral
3,3,6,I’m too pro-science to be pro-choice http://ow...,im too proscience to be prochoice,proscience,1.0,{'llama3.1:8b': 'proscience'},"SparseEmbedding(indices=[2017, 2022, 2025, 210...","[-0.00469502666965127, 0.027932850643992424, -...",proscience
4,4,7,Video: Glenn Beck - MSNBC Anti-God Network htt...,video glenn beck msnbc antigod network antigod...,antiscience,1.0,{'llama3.1:8b': 'antiscience'},"SparseEmbedding(indices=[1998, 2040, 2143, 227...","[-0.05969066917896271, 0.009564031846821308, -...",antiscience


In [15]:
# Discard both embedding columns; the reduced frame is re-bound to `df`.
df = df.drop(columns=['sparse_embedding', 'dense_embedding'])

In [17]:
# Write the current `df` to a CSV file in the working directory.
# NOTE(review): to_csv also writes the index as an unnamed column by default.
# The 'Unnamed: 0' / 'Unnamed: 0.1' columns visible in the loaded data look
# like the result of earlier save/load round-trips -- consider index=False.
df.to_csv('llama_no_embed.csv')

In [20]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import os
import pickle
from typing import List, Dict, Any

class SentenceTransformerEmbedder:
    """Small adapter exposing a single ``embed`` call over a SentenceTransformer model."""

    def __init__(self, model: str = 'all-MiniLM-L6-v2'):
        """Build the underlying ``SentenceTransformer`` from the given model name.

        Args:
            model: Model identifier handed straight to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model)

    def embed(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the wrapped model, keeping its progress bar off.

        Args:
            texts: Strings to encode.

        Returns:
            Whatever ``self.model.encode`` returns for ``texts``.
        """
        encoded = self.model.encode(texts, show_progress_bar=False)
        return encoded

def process_chunk(chunk: pd.DataFrame, embedder: SentenceTransformerEmbedder) -> pd.DataFrame:
    """Attach a ``dense_embedding`` column to ``chunk`` and return it.

    The ``preprocessed`` column is embedded in one call; the result is split
    into one list element per row. ``chunk`` is modified in place, so callers
    that need the original untouched must pass a copy.

    Args:
        chunk: Frame holding a ``preprocessed`` column.
        embedder: Object whose ``embed`` method maps a list of texts to embeddings.

    Returns:
        The same ``chunk`` object with the new column set.
    """
    texts = chunk['preprocessed'].tolist()
    vectors = embedder.embed(texts)
    chunk['dense_embedding'] = list(vectors)
    return chunk

def save_progress(chunk: pd.DataFrame, chunk_id: int, output_dir: str):
    """Pickle ``chunk`` to ``<output_dir>/chunk_<chunk_id>.pkl``.

    Args:
        chunk: Frame to checkpoint.
        chunk_id: Number used in the checkpoint file name.
        output_dir: Existing directory that receives the file.
    """
    checkpoint_name = f"chunk_{chunk_id}.pkl"
    target = os.path.join(output_dir, checkpoint_name)
    with open(target, 'wb') as handle:
        pickle.dump(chunk, handle)

def load_progress(output_dir: str) -> List[pd.DataFrame]:
    """Load every ``chunk_*.pkl`` checkpoint in ``output_dir`` in chunk-id order.

    ``os.listdir`` returns names in arbitrary order, so the checkpoints are
    sorted before loading: numeric ids ascending (``chunk_2`` before
    ``chunk_10``), followed by any non-numeric names in alphabetical order.
    This keeps the row order of a later concatenation deterministic.

    Args:
        output_dir: Directory to scan; it must already exist.

    Returns:
        The unpickled objects, one per checkpoint file; an empty list when
        the directory holds no matching file.
    """
    def _chunk_order(filename: str):
        # Text between the "chunk_" prefix and the ".pkl" suffix.
        stem = filename[len("chunk_"):-len(".pkl")]
        if stem.isdecimal():
            # The raw stem breaks ties such as "1" vs "01" deterministically.
            return (0, int(stem), stem)
        return (1, 0, stem)

    checkpoint_names = sorted(
        (
            name for name in os.listdir(output_dir)
            if name.startswith("chunk_") and name.endswith(".pkl")
        ),
        key=_chunk_order,
    )

    chunks = []
    for filename in checkpoint_names:
        with open(os.path.join(output_dir, filename), 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # directories whose contents you wrote yourself.
            chunks.append(pickle.load(f))
    return chunks

def create_embedding_pipeline(df: pd.DataFrame, chunk_size: int, output_dir: str) -> pd.DataFrame:
    """Embed ``df`` chunk by chunk, checkpointing every chunk under ``output_dir``.

    Checkpoints already present in ``output_dir`` are treated as the leading
    rows of ``df``; only the rows after them are embedded. All checkpoints are
    then concatenated into the returned frame.

    Args:
        df: Frame to embed; ``process_chunk`` decides which column is read.
        chunk_size: Number of rows per checkpoint; must be positive.
        output_dir: Checkpoint directory, created when missing.

    Returns:
        Concatenation of every checkpoint in ``output_dir`` with a fresh
        integer index. When there is nothing to embed and no checkpoint
        exists, a copy of ``df`` with an empty ``dense_embedding`` column.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the rows already
            checkpointed do not end on a ``chunk_size`` boundary (resuming
            would then overwrite an existing checkpoint with different rows).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    os.makedirs(output_dir, exist_ok=True)

    embedder = SentenceTransformerEmbedder('all-MiniLM-L6-v2')

    # Check for existing progress
    existing_chunks = load_progress(output_dir)
    if existing_chunks:
        print(f"Found {len(existing_chunks)} existing chunks. Resuming from the last processed chunk.")
        # Count the rows already embedded WITHOUT rebinding `df`: replacing the
        # input with the checkpoints made the loop range empty, so the
        # remaining rows were silently never processed.
        rows_done = sum(len(chunk) for chunk in existing_chunks if 'dense_embedding' in chunk.columns)
        start_index = min(rows_done, len(df))
    else:
        start_index = 0

    # Chunk ids are derived from the row offset, so a resume point that is not
    # on a chunk boundary would reuse the id of an existing checkpoint.
    if start_index < len(df) and start_index % chunk_size != 0:
        raise ValueError(
            f"Existing checkpoints cover {start_index} rows, which is not a multiple of "
            f"chunk_size={chunk_size}; clear '{output_dir}' or use the original chunk size."
        )

    # Process remaining data in chunks
    for i in tqdm(range(start_index, len(df), chunk_size), desc="Processing chunks"):
        chunk = df.iloc[i:i+chunk_size].copy()
        processed_chunk = process_chunk(chunk, embedder)
        save_progress(processed_chunk, i // chunk_size, output_dir)

    # Merge all processed chunks
    all_chunks = load_progress(output_dir)
    if not all_chunks:
        # Empty input and no checkpoints: pd.concat would raise on an empty list.
        empty_result = df.copy()
        empty_result['dense_embedding'] = pd.Series(dtype=object)
        return empty_result
    final_df = pd.concat(all_chunks, ignore_index=True)

    return final_df

# Usage example
if __name__ == "__main__":
    # Embeds the `df` prepared by the earlier cells (loaded from CSV, with the
    # old embedding columns dropped).
    chunk_size = 1000  # Adjust based on your available memory
    # Directory where create_embedding_pipeline writes one pickle per chunk.
    output_dir = "embedding_progress"

    result_df = create_embedding_pipeline(df, chunk_size, output_dir)

    # Save the final result
    result_df.to_pickle("final_embedded_df.pkl")

    # Sanity checks: report the merged shape and verify the first embedding's length.
    print(f"Final DataFrame shape: {result_df.shape}")
    print(f"Sample embedding shape: {result_df['dense_embedding'].iloc[0].shape}")
    # Only the first row is checked; 384 is the dimension this notebook expects.
    assert result_df['dense_embedding'].iloc[0].shape[0] == 384, "Embedding dimension is not 384"
    print("Embedding dimension verified: 384")

Processing chunks: 100%|██████████| 80/80 [09:09<00:00,  6.87s/it] 


Final DataFrame shape: (79763, 9)
Sample embedding shape: (384,)
Embedding dimension verified: 384


# 0. Clean the original dataset + delete the classified examples

In [None]:
## Already done and saved as 'original_preprocessed_en.csv'

# 1. Create Embeddings for the Original Dataset

In [None]:
import pandas as pd
import numpy as np
from fastembed import TextEmbedder
import torch
from tqdm import tqdm
import os
import pickle
from typing import List, Dict, Any

class FastembedTextEmbedder:
    """Adapter that prepends a fixed prefix to each text before embedding it.

    NOTE(review): the fastembed package documents ``TextEmbedding`` as its text
    embedding class; confirm that ``TextEmbedder`` and the ``dim`` keyword exist
    in the installed version.
    """

    def __init__(self, model: str, prefix: str, dimensions: int = 384):
        """Create the backing embedder and remember the prefix and dimension.

        Args:
            model: Model identifier passed to ``TextEmbedder``.
            prefix: String prepended to every text in ``embed``.
            dimensions: Forwarded to ``TextEmbedder`` as ``dim`` and stored on the instance.
        """
        self.embedder = TextEmbedder(model, max_length=512, dim=dimensions)
        self.prefix = prefix
        self.dimensions = dimensions

    def embed(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` after prefixing each one with ``self.prefix``.

        Args:
            texts: Items to embed; each is formatted into a string with the prefix.

        Returns:
            ``np.array`` built from the embeddings yielded by the backing embedder.
        """
        queries = [f"{self.prefix}{text}" for text in texts]
        # The backing embedder is consumed as an iterable; collect it fully
        # before stacking.
        vectors = [vector for vector in self.embedder.embed(queries)]
        return np.array(vectors)

def process_chunk(chunk: pd.DataFrame, embedder: FastembedTextEmbedder) -> pd.DataFrame:
    """Attach a ``dense_embedding`` column to ``chunk`` and return it.

    The ``preprocessed_text`` column is embedded in one call; the result is
    split into one list element per row. ``chunk`` is modified in place, so
    callers that need the original untouched must pass a copy.

    Args:
        chunk: Frame holding a ``preprocessed_text`` column.
        embedder: Object whose ``embed`` method maps a list of texts to embeddings.

    Returns:
        The same ``chunk`` object with the new column set.
    """
    texts = chunk['preprocessed_text'].tolist()
    vectors = embedder.embed(texts)
    chunk['dense_embedding'] = list(vectors)
    return chunk

def save_progress(chunk: pd.DataFrame, chunk_id: int, output_dir: str):
    """Pickle ``chunk`` to ``<output_dir>/chunk_<chunk_id>.pkl``.

    Args:
        chunk: Frame to checkpoint.
        chunk_id: Number used in the checkpoint file name.
        output_dir: Existing directory that receives the file.
    """
    checkpoint_name = f"chunk_{chunk_id}.pkl"
    target = os.path.join(output_dir, checkpoint_name)
    with open(target, 'wb') as handle:
        pickle.dump(chunk, handle)

def load_progress(output_dir: str) -> List[pd.DataFrame]:
    """Load every ``chunk_*.pkl`` checkpoint in ``output_dir`` in chunk-id order.

    ``os.listdir`` returns names in arbitrary order, so the checkpoints are
    sorted before loading: numeric ids ascending (``chunk_2`` before
    ``chunk_10``), followed by any non-numeric names in alphabetical order.
    This keeps the row order of a later concatenation deterministic.

    Args:
        output_dir: Directory to scan; it must already exist.

    Returns:
        The unpickled objects, one per checkpoint file; an empty list when
        the directory holds no matching file.
    """
    def _chunk_order(filename: str):
        # Text between the "chunk_" prefix and the ".pkl" suffix.
        stem = filename[len("chunk_"):-len(".pkl")]
        if stem.isdecimal():
            # The raw stem breaks ties such as "1" vs "01" deterministically.
            return (0, int(stem), stem)
        return (1, 0, stem)

    checkpoint_names = sorted(
        (
            name for name in os.listdir(output_dir)
            if name.startswith("chunk_") and name.endswith(".pkl")
        ),
        key=_chunk_order,
    )

    chunks = []
    for filename in checkpoint_names:
        with open(os.path.join(output_dir, filename), 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only point this at
            # directories whose contents you wrote yourself.
            chunks.append(pickle.load(f))
    return chunks

def create_embedding_pipeline(df: pd.DataFrame, chunk_size: int, output_dir: str) -> pd.DataFrame:
    """Embed ``df`` chunk by chunk, checkpointing every chunk under ``output_dir``.

    Checkpoints already present in ``output_dir`` are treated as the leading
    rows of ``df``; only the rows after them are embedded. All checkpoints are
    then concatenated into the returned frame.

    Args:
        df: Frame to embed; ``process_chunk`` decides which column is read.
        chunk_size: Number of rows per checkpoint; must be positive.
        output_dir: Checkpoint directory, created when missing.

    Returns:
        Concatenation of every checkpoint in ``output_dir`` with a fresh
        integer index. When there is nothing to embed and no checkpoint
        exists, a copy of ``df`` with an empty ``dense_embedding`` column.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if the rows already
            checkpointed do not end on a ``chunk_size`` boundary (resuming
            would then overwrite an existing checkpoint with different rows).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    os.makedirs(output_dir, exist_ok=True)

    embedder = FastembedTextEmbedder(
        model="BAAI/bge-small-en-v1.5",
        prefix="Represent this sentence for searching relevant passages: ",
        dimensions=384
    )

    # Check for existing progress
    existing_chunks = load_progress(output_dir)
    if existing_chunks:
        print(f"Found {len(existing_chunks)} existing chunks. Resuming from the last processed chunk.")
        # Count the rows already embedded WITHOUT rebinding `df`: replacing the
        # input with the checkpoints made the loop range empty, so the
        # remaining rows were silently never processed.
        rows_done = sum(len(chunk) for chunk in existing_chunks if 'dense_embedding' in chunk.columns)
        start_index = min(rows_done, len(df))
    else:
        start_index = 0

    # Chunk ids are derived from the row offset, so a resume point that is not
    # on a chunk boundary would reuse the id of an existing checkpoint.
    if start_index < len(df) and start_index % chunk_size != 0:
        raise ValueError(
            f"Existing checkpoints cover {start_index} rows, which is not a multiple of "
            f"chunk_size={chunk_size}; clear '{output_dir}' or use the original chunk size."
        )

    # Process remaining data in chunks
    for i in tqdm(range(start_index, len(df), chunk_size), desc="Processing chunks"):
        chunk = df.iloc[i:i+chunk_size].copy()
        processed_chunk = process_chunk(chunk, embedder)
        save_progress(processed_chunk, i // chunk_size, output_dir)

    # Merge all processed chunks
    all_chunks = load_progress(output_dir)
    if not all_chunks:
        # Empty input and no checkpoints: pd.concat would raise on an empty list.
        empty_result = df.copy()
        empty_result['dense_embedding'] = pd.Series(dtype=object)
        return empty_result
    final_df = pd.concat(all_chunks, ignore_index=True)

    return final_df

# Usage example
if __name__ == "__main__":
    # Assuming df_preprocessed is already loaded
    # NOTE(review): `df_preprocessed` is not defined in the visible part of this
    # notebook -- confirm which cell creates it before running.
    chunk_size = 1000  # Adjust based on your available memory
    # NOTE(review): the sentence-transformers cell earlier in this notebook
    # checkpoints into the same "embedding_progress" directory, and the pipeline
    # resumes from whatever chunks it finds there; use a separate directory or
    # clear it first so stale chunks are not merged into this run.
    output_dir = "embedding_progress"

    result_df = create_embedding_pipeline(df_preprocessed, chunk_size, output_dir)

    # Save the final result
    # NOTE(review): same file name as the earlier cell's output, which this overwrites.
    result_df.to_pickle("final_embedded_df.pkl")

    # Sanity checks: report the merged shape and verify the first embedding's length.
    print(f"Final DataFrame shape: {result_df.shape}")
    print(f"Sample embedding shape: {result_df['dense_embedding'].iloc[0].shape}")
    # Only the first row is checked; 384 matches the `dimensions` passed to the embedder.
    assert result_df['dense_embedding'].iloc[0].shape[0] == 384, "Embedding dimension is not 384"
    print("Embedding dimension verified: 384")