In [22]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import os
from tqdm.auto import tqdm  # For progress visualization
from math import ceil
import logging

In [23]:
def get_device():
    """
    Pick the PyTorch device to run inference on and announce the choice.

    Backends are probed in order of preference: Apple-Silicon MPS first,
    then CUDA, and finally the CPU as the fallback.

    Returns:
        torch.device: The selected device.
    """
    # Guard clauses: return as soon as the most preferred backend is found.
    if torch.backends.mps.is_available():
        print("MPS (Metal Performance Shaders) is available. Using GPU for inference.")
        return torch.device("mps")

    if torch.cuda.is_available():
        print("CUDA is available. Using GPU for inference.")
        return torch.device("cuda")

    print("No GPU available. Using CPU for inference.")
    return torch.device("cpu")

device = get_device()

MPS (Metal Performance Shaders) is available. Using GPU for inference.


In [25]:
csv_file_path = "./processed_news_turkish_entities_keywords.csv"

# Read the input CSV, failing fast with a clear message when it is missing
if os.path.exists(csv_file_path):
    df = pd.read_csv(csv_file_path)
else:
    raise FileNotFoundError(f"CSV file not found at path: {csv_file_path}")

# Later cells read the 'normalized_text' column, so require it up front
if 'normalized_text' not in df.columns:
    raise ValueError("The 'normalized_text' column does not exist in the provided CSV.")

# Short summary of what was loaded
print("Data loaded successfully.")
print(f"Number of rows: {df.shape[0]}")
print("DataFrame columns:", list(df.columns))

Data loaded successfully.
Number of rows: 1000
DataFrame columns: ['source', 'article_url', 'title', 'date', 'shortened_full_text', 'combined_text', 'sentiment_label', 'sentiment_score', 'title_tr', 'shortened_full_text_tr', 'title_and_text', 'normalized_text', 'entities', 'keywords']


In [26]:
model_name = "savasy/bert-base-turkish-sentiment-cased"

try:
    # Tokenizer for the checkpoint (fast implementation requested)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Sequence-classification model loaded from the same checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
except Exception as load_error:
    # Report which checkpoint failed, then let the error propagate
    print(f"Error loading model '{model_name}': {load_error}")
    raise load_error
else:
    print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


In [28]:
# Initialize the sentiment analysis pipeline on the device selected earlier.
#
# The torch.device object is passed straight through instead of being mapped
# to an integer index: in the pipeline API an index of -1 means "CPU", so the
# previous mapping of MPS to -1 silently ran inference on the CPU even though
# MPS had been detected. Passing the device object lets the pipeline place the
# model on MPS, CUDA, or CPU as appropriate.
#
# truncation/max_length are forwarded to the tokenizer so that any input
# longer than 512 tokens is cut rather than raising inside the model.
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    max_length=512
)

print("Sentiment analysis pipeline initialized with truncation.")

Sentiment analysis pipeline initialized with truncation.


In [31]:
def split_text(text, tokenizer, max_length=512, stride=50):
    """
    Split ``text`` into overlapping chunks that each fit a token budget.

    The text is tokenized *without* special tokens and cut into windows of at
    most ``max_length`` minus the number of special tokens the tokenizer adds
    to a single sequence. Each window is decoded back to text, so that when a
    chunk is tokenized again downstream (with special tokens added) it still
    fits within ``max_length`` instead of being truncated.

    Args:
        text (str): The input text to split.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and
            ``num_special_tokens_to_add`` (e.g. a Hugging Face tokenizer).
        max_length (int): Maximum number of tokens per chunk, special tokens
            included.
        stride (int): Number of tokens shared between consecutive chunks.

    Returns:
        List[str]: The decoded text chunks, in order. Always contains at
        least one element (a single empty string for empty input).

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens, or if
            ``stride`` is negative or not smaller than the content window
            (which would prevent the window from advancing).
    """
    # Reserve room for the special tokens that get added back when a chunk is
    # re-tokenized for the model.
    special_count = tokenizer.num_special_tokens_to_add(pair=False)
    window = max_length - special_count
    if window <= 0:
        raise ValueError(
            f"max_length ({max_length}) must exceed the number of special "
            f"tokens ({special_count})."
        )
    if not 0 <= stride < window:
        raise ValueError(
            f"stride ({stride}) must be in the range [0, {window}) so that "
            "each chunk advances through the text."
        )

    # Content tokens only; special tokens are accounted for via `window`.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if not token_ids:
        # Nothing to split: return a single empty chunk so callers always get
        # at least one entry.
        return [""]

    step = window - stride  # how far the window moves between chunks
    total_length = len(token_ids)
    chunks = []
    for start in range(0, total_length, step):
        end = start + window
        chunks.append(tokenizer.decode(token_ids[start:end], skip_special_tokens=True))
        if end >= total_length:
            # This window already reached the end of the text; a further
            # window would only repeat the overlap.
            break
    return chunks

In [32]:
# Names of the DataFrame columns that receive the aggregated predictions
predicted_label_column = "turkish_sentiment_label"
predicted_score_column = "turkish_sentiment_score"

# Chunking parameters: token limit per chunk and overlap between chunks
MAX_TOKENS = 512
CHUNK_STRIDE = 50

# Values recorded when a row has no usable text or every chunk failed
UNKNOWN_LABEL = "UNKNOWN"
UNKNOWN_SCORE = 0.0

# Configure logging to capture errors
logging.basicConfig(filename='inference_errors.log', level=logging.ERROR)


def _aggregate_chunk_predictions(chunk_results):
    """
    Combine per-chunk predictions into one (label, score) pair for a text.

    The label is chosen by majority vote over the chunks; ties are broken in
    favour of the label with the larger total confidence. The returned score
    is the mean confidence of the chunks that voted for the winning label, so
    it is a confidence for that label rather than a mix of confidences for
    disagreeing labels.

    Args:
        chunk_results (list[tuple[str, float]]): (label, score) pairs from the
            chunks that were classified successfully.

    Returns:
        tuple[str, float]: The aggregated label and score, or
        (UNKNOWN_LABEL, UNKNOWN_SCORE) when ``chunk_results`` is empty.
    """
    if not chunk_results:
        return UNKNOWN_LABEL, UNKNOWN_SCORE

    # Group the confidence scores by predicted label.
    scores_by_label = {}
    for label, score in chunk_results:
        scores_by_label.setdefault(label, []).append(score)

    winner = max(
        scores_by_label,
        key=lambda label: (len(scores_by_label[label]), sum(scores_by_label[label])),
    )
    winning_scores = scores_by_label[winner]
    return winner, sum(winning_scores) / len(winning_scores)


# Initialize lists to store predictions (reset on every run of this cell)
predicted_labels = []
predicted_scores = []

# Iterate over each text entry
for text in tqdm(df["normalized_text"], desc="Performing sentiment analysis"):
    # Missing, non-string or blank text carries no sentiment: record it as
    # unknown instead of asking the model to classify an empty string.
    if not isinstance(text, str) or not text.strip():
        predicted_labels.append(UNKNOWN_LABEL)
        predicted_scores.append(UNKNOWN_SCORE)
        continue

    # Only texts over the token limit are chunked; shorter ones are passed to
    # the model verbatim.
    token_count = len(tokenizer.encode(text, add_special_tokens=True))
    if token_count > MAX_TOKENS:
        chunks = split_text(text, tokenizer, max_length=MAX_TOKENS, stride=CHUNK_STRIDE)
    else:
        chunks = [text]

    # Analyze each chunk; failed chunks are logged and left out of the
    # aggregation so they cannot win the vote or drag the score down.
    chunk_results = []
    for chunk in chunks:
        try:
            # Example result: {'label': 'POSITIVE', 'score': 0.998}
            result = sentiment_analyzer(chunk)[0]
        except Exception as exc:
            # logging.exception also records the traceback in the log file
            logging.exception("Error processing chunk: %s", exc)
            continue
        chunk_results.append((result["label"], result["score"]))

    # Aggregate results for the entire text
    aggregated_label, aggregated_score = _aggregate_chunk_predictions(chunk_results)
    predicted_labels.append(aggregated_label)
    predicted_scores.append(aggregated_score)

# Add the prediction results to the DataFrame
df[predicted_label_column] = predicted_labels
df[predicted_score_column] = predicted_scores

print("Inference completed. Sentiment columns added to the DataFrame.")

Performing sentiment analysis:   0%|          | 0/1000 [00:00<?, ?it/s]

Inference completed. Sentiment columns added to the DataFrame.


In [33]:
# How many rows received each predicted sentiment label
print("Label Distribution:")
print(df[predicted_label_column].value_counts())

# First few predictions (label and score side by side) as a quick sanity check
print("\nSample Predictions:")
print(df.loc[:, [predicted_label_column, predicted_score_column]].head())

Label Distribution:
turkish_sentiment_label
negative    560
positive    440
Name: count, dtype: int64

Sample Predictions:
  turkish_sentiment_label  turkish_sentiment_score
0                positive                 0.979294
1                positive                 0.721456
2                negative                 0.717170
3                positive                 0.991640
4                positive                 0.735413


In [34]:
# Destination file for the DataFrame including the new sentiment columns
output_csv_file_path = "updated_data_with_turkish_sentiment.csv"  # Replace with desired output filename

# Write the DataFrame out without the index column
df.to_csv(path_or_buf=output_csv_file_path, index=False)

print(f"Updated CSV saved to '{output_csv_file_path}' with new sentiment columns.")

Updated CSV saved to 'updated_data_with_turkish_sentiment.csv' with new sentiment columns.
