### Pre-Requisites

##### Place the following .py files in the `src/` directory (Step 0 adds it to `sys.path`):
- config.py
- main.py
- batch_processing.py
- run_pipeline.py

In [1]:
# import sys
# print(sys.executable)

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to the .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

#### Step 0: Imports

In [3]:
# Step A: Import core Python libraries
import sys
import os
import pandas as pd
import logging
from tqdm.notebook import tqdm  # Import tqdm for progress bar in Jupyter Notebook

# Step B: Add the 'src' directory to the Python path
# The path is built from os.getcwd(), so 'src' is only found when the kernel's
# working directory is the folder that contains it.
sys.path.append(os.path.join(os.getcwd(), 'src'))
from config import CONFIG

# Step B (continued): Import core functions from main.py
# NOTE(review): the output recorded for this cell shows "Loading models..."
# followed by "NameError: name 'load_model' is not defined" — presumably raised
# while importing main; confirm that main.py defines or imports load_model.
from main import analyze_sentiment, detect_sarcasm, generate_gemma_response

2024-10-21 21:26:18,563 - INFO - Loading models...


NameError: name 'load_model' is not defined

In [None]:
# Step C: Load Sarcasm Detection Model
# Both the checkpoint identifier and the target device are read from CONFIG.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

logging.info("Loading sarcasm detection model and tokenizer...")

sarcasm_checkpoint = CONFIG["models"]["sarcasm"]
sarcasm_tokenizer = AutoTokenizer.from_pretrained(sarcasm_checkpoint)

# Load the classifier first, then move it to the configured device.
sarcasm_model = AutoModelForSequenceClassification.from_pretrained(sarcasm_checkpoint)
sarcasm_model = sarcasm_model.to(torch.device(CONFIG["device"]))

logging.info("Sarcasm detection model and tokenizer loaded successfully.")

In [None]:
# Step D: Load Sentiment Analysis Model
# Relies on torch, AutoTokenizer and AutoModelForSequenceClassification having
# been imported by the Step C cell.
logging.info("Loading sentiment analysis model and tokenizer...")

sentiment_checkpoint = CONFIG["models"]["sentiment"]
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)

# Load the classifier first, then move it to the configured device.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)
sentiment_model = sentiment_model.to(torch.device(CONFIG["device"]))

logging.info("Sentiment analysis model and tokenizer loaded successfully.")

In [None]:
# Step E: Load GEMMA Text Generation Model
# Relies on torch having been imported by the Step C cell.
from transformers import pipeline

logging.info("Loading GEMMA text generation model...")

# Model identifier and device are taken from CONFIG and passed as keywords.
generation_settings = {
    "model": CONFIG["models"]["text_generation"],
    "device": torch.device(CONFIG["device"]),
}
generator = pipeline("text-generation", **generation_settings)

logging.info("GEMMA text generation model loaded successfully.")

#### Step 1: Define the dataset path

In [None]:
# Step 1: Define the path to the cleaned dataset
# The file is expected under "datasets" inside the current working directory.
datasets_dir = os.path.join(os.getcwd(), "datasets")
DATASET_PATH = os.path.join(datasets_dir, "UScomments_final_cleaned.csv")

#### Step 2: Load the "better" dataset (UScomments_final_cleaned.csv) 

In [None]:
# Step 2: Load the cleaned dataset
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
data = pd.read_csv(filepath_or_buffer=DATASET_PATH, low_memory=False)
logging.info(f"Loaded dataset from {DATASET_PATH}")

#### Step 3: Extract comments and pre-labeled sentiment from the dataset
#### Step 4: Initialize a list to store sarcasm detection results


In [None]:
# Step 3: Pull the comment text and the pre-labeled sentiment out of the dataset
# Every comment is cast to str before conversion to a plain Python list.
# NOTE(review): with astype(str), missing values presumably become the literal
# string "nan" — confirm whether such rows should be filtered out beforehand.
comment_column = data["cleaned_comment"].astype(str)
comments = comment_column.tolist()
pre_labeled_sentiments = data["sentiment"].tolist()

# Step 4: Start with an empty container for the sarcasm detection results
sarcasm_labels = list()

# Pre Step 5: Import the batching helper used by the sarcasm detection loop
from batch_processing import batch_process

#### Step 5: Recompute sarcasm labels in batches (with checkpointing)

In [None]:
import os
import shutil

# Restore the backup if needed
# The paths are derived from the working directory's "checkpoints" folder (the
# same location the sarcasm detection step writes its checkpoint to) instead of
# a machine-specific absolute path, so the cell works on any checkout.
checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
original_path = os.path.join(checkpoint_dir, "sarcasm_labels.pkl")
backup_path = os.path.join(checkpoint_dir, "sarcasm_labels_backup.pkl")

try:
    # copy2 also preserves file metadata such as timestamps.
    shutil.copy2(backup_path, original_path)
except FileNotFoundError:
    # No backup has been created yet: report it rather than abort the notebook.
    print(f"No backup found at {backup_path}; nothing was restored.")
else:
    print(f"Restored checkpoint from backup: {backup_path} to {original_path}")

In [None]:
# Step 5: Perform sarcasm detection in batches with a progress bar
#
# Labels are checkpointed after every batch. On a re-run the checkpoint is
# loaded and only the comments that have no label yet are processed, so an
# interrupted run does not have to start over.
import pickle  # Import pickle for checkpointing
from tqdm.notebook import tqdm  # Import tqdm for progress bar in Jupyter Notebook
from main import detect_sarcasm  # Import the sarcasm detection function

# Set to False (or delete the checkpoint file) to recompute every label, e.g.
# after the input dataset changed: a checkpoint produced from different
# comments cannot be detected here.
RESUME_FROM_CHECKPOINT = True

# Define checkpoint directory
checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, 'sarcasm_labels.pkl')
checkpoint_tmp_path = checkpoint_path + ".tmp"

# Ensure write permission to the checkpoint directory
if os.access(checkpoint_dir, os.W_OK):
    print(f"Write permission confirmed for checkpoint directory: {checkpoint_dir}")
else:
    print(f"Warning: No write permission for checkpoint directory: {checkpoint_dir}")

batch_size = CONFIG["batch_size"]
# Integer ceiling division: one extra batch for a trailing partial batch.
total_batches = (len(comments) + batch_size - 1) // batch_size

# Start sarcasm detection
sarcasm_labels = []  # List to store sarcasm labels
if RESUME_FROM_CHECKPOINT and os.path.exists(checkpoint_path):
    try:
        # NOTE: pickle must only be used on trusted files; this checkpoint is
        # expected to have been written by this notebook.
        with open(checkpoint_path, 'rb') as f:
            restored_labels = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError) as exc:
        logging.warning(f"Could not read checkpoint '{checkpoint_path}' ({exc}); recomputing all batches.")
    else:
        # Only trust a checkpoint that is either complete or ends exactly on a
        # batch boundary; anything else cannot come from this loop.
        usable = isinstance(restored_labels, list) and (
            len(restored_labels) == len(comments)
            or (len(restored_labels) < len(comments) and len(restored_labels) % batch_size == 0)
        )
        if usable:
            sarcasm_labels = restored_labels
            logging.info(f"Resuming sarcasm detection from checkpoint with {len(sarcasm_labels)} labels.")
        else:
            logging.warning(f"Checkpoint '{checkpoint_path}' does not match the current comments; recomputing all batches.")

completed_batches = (len(sarcasm_labels) + batch_size - 1) // batch_size
# Assumes batch_process yields, in input order, one list of labels per batch
# of comments — TODO confirm against batch_processing.py.
remaining_comments = comments[len(sarcasm_labels):]

if remaining_comments:
    progress = tqdm(
        batch_process(remaining_comments, batch_size, detect_sarcasm),
        total=total_batches,
        initial=completed_batches,
        desc="Performing Sarcasm Detection",
    )
    # Iterate over each remaining batch and generate sarcasm labels
    for batch_number, sarcasm_batch in enumerate(progress, start=completed_batches + 1):
        sarcasm_labels.extend(sarcasm_batch)
        logging.info(f"Processed batch {batch_number}/{total_batches} in sarcasm detection.")

        # Save checkpoint after each batch to avoid data loss. Writing to a
        # temporary file and renaming it keeps the previous checkpoint intact
        # if the run is interrupted in the middle of the dump.
        with open(checkpoint_tmp_path, 'wb') as f:
            pickle.dump(sarcasm_labels, f)
        os.replace(checkpoint_tmp_path, checkpoint_path)

logging.info("Step 5: Sarcasm detection completed and checkpoint saved.")

In [None]:
# import shutil
# import os
# 
# # Define paths for the original and backup
# original_path = os.path.join(os.getcwd(), "checkpoints", "sarcasm_labels.pkl")
# backup_path = os.path.join(os.getcwd(), "checkpoints", "sarcasm_labels_backup.pkl")
# 
# # Create a backup
# try:
#     shutil.copy2(original_path, backup_path)
#     print(f"Backup created at: {backup_path}")
# except FileNotFoundError:
#     print("Checkpoint file not found. Make sure Step 5 has generated a checkpoint before backing up.")

#### Step 6: Generate responses using the Gemma-2b-it model with parallel processing and progress bar


In [None]:
# Step 6: Generate responses using the Gemma-2b-it model (parallelized)
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm.notebook import tqdm  # Progress bar for Jupyter Notebook

# Set the number of workers for parallel processing
# NOTE(review): each worker process presumably loads its own copy of the
# models when it imports main — confirm the memory budget for this value.
max_workers = 4  # Increased from 2 to 4 for better utilization

# One slot per comment. Tasks finish in arbitrary order, so every result is
# written to the index of the comment it belongs to; appending in completion
# order would misalign responses with the other columns. A slot keeps its
# empty-string default when generation fails for that comment.
responses = [""] * len(comments)

# Start parallel processing using ProcessPoolExecutor
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    # Submit tasks to the executor, remembering each task's comment index
    future_to_index = {
        executor.submit(generate_gemma_response, comment): index
        for index, comment in enumerate(comments)
    }

    # Use tqdm to track progress of tasks being completed
    for future in tqdm(as_completed(future_to_index), total=len(comments), desc="Generating GEMMA Responses"):
        index = future_to_index[future]
        try:
            # Retrieve the result of each completed task
            responses[index] = future.result()
        except Exception as e:
            # Log any errors during the process; the slot stays an empty string
            logging.error(f"Error generating GEMMA response in parallel: {e}")

# Log that step 6 is completed
logging.info("Step 6: GEMMA response generation completed.")

# Save the results after Step 6 to track progress
df_results = pd.DataFrame({
    "Comment": comments,
    "Sentiment": pre_labeled_sentiments,
    "Sarcasm": sarcasm_labels,
    "Response": responses
})

# Save the DataFrame to CSV after Step 6
df_results.to_csv("results_after_step_6.csv", index=False)
print("Results after Step 6 have been saved to 'results_after_step_6.csv'.")

#### Step 7: Combine the results into a DataFrame
#### Step 8: Save the results to a CSV file 

In [None]:
# Step 7: Combine all results into a DataFrame
# Validate with an explicit exception instead of assert (asserts are skipped
# when Python runs with -O) and report the lengths to make the mismatch easy
# to trace back to the step that produced it.
column_lengths = {
    "comments": len(comments),
    "pre_labeled_sentiments": len(pre_labeled_sentiments),
    "sarcasm_labels": len(sarcasm_labels),
    "responses": len(responses),
}
if len(set(column_lengths.values())) != 1:
    raise ValueError(
        f"List lengths do not match. Check the previous steps for errors. Lengths: {column_lengths}"
    )

df_results = pd.DataFrame({
    "Comment": comments,
    "Sentiment": pre_labeled_sentiments,  # Using the pre-labeled sentiment
    "Sarcasm": sarcasm_labels,
    "Response": responses
})
logging.info(f"Step 7: Combined results into DataFrame with shape {df_results.shape}.")

# Step 8: Save the results to a CSV file
OUTPUT_PATH = os.path.join(os.getcwd(), "outputs", "Processed_Comments.csv")
# to_csv does not create missing folders, so make sure "outputs" exists first.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
df_results.to_csv(OUTPUT_PATH, index=False)
print(f"Processing complete. Results saved to '{OUTPUT_PATH}'.")
logging.info(f"Step 8: Results saved to '{OUTPUT_PATH}'.")

#### Step 9: Display a sample of the final results

In [None]:
# Step 9: Preview the first five rows of the combined results
df_results.head(n=5)

In [None]:
# Monitor logs in real-time (separate cell)
# This cell blocks while following the log; interrupt the kernel to stop it.
import os
import glob
import subprocess

# Define the path to the logs directory
logs_directory = os.path.join(os.getcwd(), "logs")

# Pick the log file with the greatest ctime in the logs directory
# NOTE(review): on Unix, ctime is the last metadata change, not the creation
# time — confirm this selects the intended file.
log_files = glob.glob(os.path.join(logs_directory, "*.log"))
if log_files:
    latest_log_file = max(log_files, key=os.path.getctime)
    print(f"Monitoring latest log file: {latest_log_file}")

    try:
        # Pass the arguments as a list so no shell is involved: spaces, quotes
        # or other special characters in the path need no manual escaping.
        subprocess.run(["tail", "-f", latest_log_file], check=False)
    except KeyboardInterrupt:
        # Raised when the kernel is interrupted to stop following the log.
        print("Stopped monitoring the log file.")
    except FileNotFoundError:
        # The 'tail' executable itself is missing (e.g. on Windows).
        print("The 'tail' command is not available on this system.")
else:
    print("No log files found in the logs directory.")