# Importing Libraries

In [None]:
# Install necessary libraries using pip
# Uncomment and run these lines in your environment if the libraries are not already installed
# !pip install pandas
# !pip install numpy
# !pip install torch
# !pip install transformers

# Import necessary libraries for data manipulation, processing, and machine learning.
import pandas as pd
import numpy as np
import os
import re
import torch
from transformers import pipeline, DistilBertTokenizer, DistilBertForSequenceClassification, BertTokenizer, BertForSequenceClassification
import gc
import warnings
import time
from torch.nn.functional import softmax

# Suppress specific warnings to avoid cluttering the output
warnings.filterwarnings('ignore', category=UserWarning, module='torch.utils.data.dataloader')

# Data Cleaning & Processing

In [None]:
# Pick the compute device: "cuda" when PyTorch reports a usable GPU, otherwise "cpu"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} for processing.")

# Read the raw dataset and discard the columns that play no part in the text analysis
df = pd.read_csv('packages.csv').drop(
    columns=['first_place', 'winner', 'share_image', 'slug', 'image_id'],
)

# Define a function to clean text data by removing HTML tags, punctuation, and trimming whitespace
def clean_text(text):
    """Return *text* without HTML tags or punctuation and with tidy whitespace.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example
            ``NaN`` or ``None``) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''  # Missing / non-text values become an empty string
    # Replace each tag with a space rather than nothing, so that
    # "<p>a</p><p>b</p>" stays two words; DOTALL lets a tag that wraps
    # across lines match as well.
    text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
    # Drop every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs (including those left by removed tags) and
    # trim both ends in one pass
    return ' '.join(text.split())

# Clean both text columns. clean_text already strips every tag and maps
# non-string (missing) values to '', so no extra '<p>' replacement or NaN
# handling is needed afterwards.
df['headline'] = df['headline'].apply(clean_text)
df['lede'] = df['lede'].apply(clean_text)

# Join headline and lede into one text per row; the final strip removes the
# stray separator space left when either part is empty.
df['cleaned_text'] = (df['headline'] + ' ' + df['lede']).str.strip()
df = df[['cleaned_text']].copy()  # Retain only the cleaned text column

# Topic Classification

In [None]:
# Build the Hugging Face zero-shot classification pipeline from the
# 'facebook/bart-large-mnli' checkpoint. The pipeline is given a device index:
# 0 when `device` is "cuda", -1 otherwise.
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=-1 if device != "cuda" else 0,
)

# Topics the zero-shot classifier chooses between
candidate_labels = [
    "Society and Social Issues",
    "Relationships, Gender and Family",
    "Pop Culture, Media and Entertainment",
    "Authenticity, Lifestyle and Health",
]

# Define a function to classify batches of text using the zero-shot classifier
def classify_batch(texts, labels):
    """Run the zero-shot classifier over *texts* against the candidate *labels*.

    Args:
        texts: Non-empty collection of strings to classify.
        labels: Non-empty collection of candidate label strings.

    Returns:
        Whatever the module-level ``classifier`` pipeline returns for the
        given inputs.

    Raises:
        ValueError: If ``texts`` or ``labels`` is empty.
    """
    if not (texts and labels):
        raise ValueError("Texts and labels must not be empty.")
    # truncation=True is forwarded to the pipeline call unchanged
    return classifier(texts, candidate_labels=labels, truncation=True)

# Classify the DataFrame in segments so that partial results are checkpointed
# to disk while the run is in progress.
segment_size = 1000  # Rows per checkpoint; tune to system capability
total_segments = (len(df) + segment_size - 1) // segment_size  # Ceiling division

# Create the result columns up front, directly on df, so the predictions end up
# in the final CSV (not only in the per-segment files) and each column keeps a
# single dtype: object for the label strings, float for the scores.
df['labels'] = pd.Series(np.nan, index=df.index, dtype=object)
df['scores'] = np.nan

for segment in range(total_segments):
    start = segment * segment_size
    end = min(start + segment_size, len(df))

    # Walk the segment by position; `i` is the row's index label, `offset` its
    # position inside the segment (used only for progress reporting).
    for offset, i in enumerate(df.index[start:end]):
        text = df.at[i, 'cleaned_text']

        # Rows with missing or blank text keep their NaN placeholders
        if isinstance(text, str) and text.strip():
            try:
                result = classify_batch([text], candidate_labels)
                # NOTE(review): some transformers releases appear to return a
                # bare dict for a single input instead of a one-item list;
                # accept both shapes.
                top = result[0] if isinstance(result, list) else result
                # Index 0 is taken as the best match (assumes the pipeline
                # sorts labels by descending score; see pipeline docs).
                df.at[i, 'labels'] = top['labels'][0]
                df.at[i, 'scores'] = top['scores'][0]
            except Exception as e:
                # Best effort: report the failure and leave this row empty so
                # one bad text does not abort the whole run.
                print(f"Error processing index {i}: {e}")
                df.at[i, 'labels'] = np.nan
                df.at[i, 'scores'] = np.nan

        # Optional: Print progress every 100 records
        if offset % 100 == 0:
            print(f"Processed up to index {i}")

    # Save segment results to a CSV file and clear memory
    interim_save_filename = f'packages_topics_segment_{segment}.csv'
    df.iloc[start:end].to_csv(interim_save_filename)
    print(f"Segment {segment} data saved to {interim_save_filename}")
    gc.collect()

# Save the final DataFrame, now including the 'labels' and 'scores' columns
df.to_csv('packages_topics_final.csv')

# Initialize a folder path and a list to store DataFrames for merging
# NOTE(review): the segment CSVs written by the topic-classification loop use
# bare filenames (current working directory), not this folder; presumably they
# are moved into 'packages_topics_folder' by hand before this runs. Confirm.
folder_path = 'packages_topics_folder'
dataframes = []

# Iterate through each file in the folder, read the CSV files, and append them to the list
# NOTE(review): the loop below is cut off mid-statement. `if filename.endswith`
# has no call, no colon and no body, so this cell is not valid Python as it
# stands and nothing is ever appended to `dataframes`. The reading/merging
# logic described above is not present here. TODO restore it from the notebook.
for filename in os.listdir(folder_path):
    if filename.endswith


# Sentiment Classification

In [None]:
# Sentiment checkpoint shared by the model weights and its tokenizer
_SENTIMENT_CHECKPOINT = 'distilbert-base-uncased-finetuned-sst-2-english'

# Load the sequence-classification model, then the matching tokenizer
model = DistilBertForSequenceClassification.from_pretrained(_SENTIMENT_CHECKPOINT)
tokenizer = DistilBertTokenizer.from_pretrained(_SENTIMENT_CHECKPOINT)
def classify_for_sentiment(text):
    """Classify *text* as NEGATIVE or POSITIVE with the module-level model.

    Uses the module-level ``tokenizer`` and ``model`` that are bound at the
    time of the call.

    Args:
        text: The text to score. It is tokenized with truncation enabled and
            ``max_length=512``.

    Returns:
        dict: ``{'label': ..., 'score': ...}`` where ``label`` is
        ``'NEGATIVE'`` or ``'POSITIVE'`` and ``score`` is the largest softmax
        probability, as a Python float.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Inference only, so gradient tracking is switched off
    with torch.no_grad():
        logits = model(**encoded).logits

    # Turn the logits into probabilities over the last dimension
    class_probs = torch.nn.functional.softmax(logits, dim=-1)
    winner = class_probs.argmax().item()

    return {
        'label': ('NEGATIVE', 'POSITIVE')[winner],
        'score': class_probs.max().item(),
    }
    
# Classify every row in fixed-size chunks, reporting progress and writing
# periodic checkpoints.
batch_size = 32  # You can tune this size
total_batches = (len(df) + batch_size - 1) // batch_size
start_time = time.time()

# Create the result columns up front so each keeps a single dtype and the
# writes below can address them by position.
df['sentiment'] = pd.Series(np.nan, index=df.index, dtype=object)
df['sentiment_score'] = np.nan
sentiment_col = df.columns.get_loc('sentiment')
score_col = df.columns.get_loc('sentiment_score')

for batch_number, batch_start in enumerate(range(0, len(df), batch_size)):
    batch_end = batch_start + batch_size
    batch_texts = df['cleaned_text'].iloc[batch_start:batch_end].tolist()

    # Perform classification with truncated texts
    batch_results = [classify_for_sentiment(text) for text in batch_texts]

    # Write the results back by position, the same way the texts were read, so
    # the update stays correct even if df does not have a default 0..n-1 index.
    df.iloc[batch_start:batch_end, sentiment_col] = [r['label'] for r in batch_results]
    df.iloc[batch_start:batch_end, score_col] = [r['score'] for r in batch_results]

    # Time estimation: average seconds per finished batch times batches left
    batches_done = batch_number + 1
    batch_time = (time.time() - start_time) / batches_done
    estimated_time_remaining = batch_time * (total_batches - batches_done)
    print(f"Batch {batches_done}/{total_batches} processed. Estimated time remaining: {estimated_time_remaining/60:.2f} minutes")

    # Save every 100 batches (batch 0 included, i.e. right after the first batch)
    if batch_number % 100 == 0:
        interim_save_filename = f'packages_sentiment_batch_{batch_number}.csv'
        df.to_csv(interim_save_filename)
        print(f"Interim data saved to {interim_save_filename}")

end_time = time.time()
print(f"Total processing time: {end_time - start_time:.2f} seconds")

# Save the updated DataFrame
df.to_csv('packages_sentiment.csv')

# Emotion Classification

In [None]:
# Load the emotion classifier weights, then the 'bert-base-uncased' tokenizer
# (vocabulary) used to encode its inputs.
# Note: this rebinds the module-level `model` and `tokenizer` names that the
# sentiment section assigned earlier.
model = BertForSequenceClassification.from_pretrained('nateraw/bert-base-uncased-emotion')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_text(text, tokenizer, max_length=512):
    """Tokenize *text* and return only its ``input_ids``.

    Args:
        text: The text to encode.
        tokenizer: Object exposing ``encode_plus``; called with special tokens
            enabled, truncation on, no attention mask, and ``'pt'`` tensors.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        The ``'input_ids'`` entry of the tokenizer's ``encode_plus`` result.
    """
    encoding_options = {
        'add_special_tokens': True,  # For BERT: [CLS] and [SEP]
        'max_length': max_length,
        'truncation': True,
        'return_attention_mask': False,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(text, **encoding_options)
    return encoded['input_ids']

def predict_emotion(text, tokenizer, model, max_length=512):
    """Predict the most probable emotion label for *text*.

    Args:
        text: The text to classify.
        tokenizer: Tokenizer handed to ``encode_text``.
        model: Callable model whose output has ``.logits`` and whose
            ``config.id2label`` holds the label names.
        max_length: Maximum sequence length used when encoding.

    Returns:
        tuple: ``(emotion, confidence)``, the label with the highest softmax
        probability and that probability.
    """
    token_ids = encode_text(text, tokenizer, max_length)

    # Forward pass without gradient tracking
    with torch.no_grad():
        raw_scores = model(token_ids).logits

    # Softmax over dim 1, flattened to a 1-D array of probabilities
    probs = softmax(raw_scores, dim=1).numpy().flatten()

    # Pair the labels from the model config with the probabilities in order,
    # and keep the pair with the largest probability
    label_prob_pairs = zip(model.config.id2label.values(), probs)
    return max(label_prob_pairs, key=lambda pair: pair[1])

# Start with empty result columns for the predicted emotion and its confidence
df['emotion'] = None
df['confidence'] = None

# Walk the text column, predicting an emotion for every non-null entry
for index, text in df['cleaned_text'].items():
    print(f'Processing index {index} of {len(df)}')
    # Null texts are skipped and keep their None placeholders
    if pd.isnull(text):
        continue
    emotion, confidence = predict_emotion(text, tokenizer, model)
    df.at[index, 'emotion'] = emotion
    df.at[index, 'confidence'] = confidence

# Write the result to a new CSV file, without the index column
df.to_csv('packages_emotions.csv', index=False)