In [None]:
# Import necessary libraries
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset

In [None]:
def augment_and_extract_metadata(dataset, extractor, topic_labels, file_path, debug=False):
    """
    Add a predicted topic and one 0/1 column per topic label, cached as a CSV file.

    When ``file_path`` already exists, the CSV is loaded and returned; ``dataset``
    and ``extractor`` are not used. Otherwise the ``text`` of every row is passed to
    ``extractor.extract_topic``, the new columns are added to ``dataset`` itself
    (in place) and the result is written to ``file_path``.

    :param dataset: DataFrame with a 'text' column; modified in place on a cache miss.
    :param extractor: Object exposing ``extract_topic(text, topic_labels)``.
    :param topic_labels: Candidate topic labels; each one becomes a 0/1 indicator column.
    :param file_path: Path of the CSV file used as the cache.
    :param debug: If True, print the text, topic and progress for every row.
    :return: The augmented DataFrame (freshly computed, or loaded from the cache).
    """
    if os.path.exists(file_path):
        # Cache hit: reuse the previously augmented dataset
        print(f"Loading augmented dataset from {file_path}")
        return pd.read_csv(file_path)

    print(f"Augmenting dataset and saving to {file_path}")
    total_rows = len(dataset)
    topics = []
    last_reported_step = 0  # highest 5% milestone that has already been printed

    for count, text in enumerate(dataset['text'], start=1):
        topic = extractor.extract_topic(text, topic_labels)
        topics.append(topic)

        percentage_complete = (count / total_rows) * 100
        if debug:
            print(f"Text: {text}")
            print(f"Generated Metadata: Topic - {topic}")
            print(f"Percentage of Completion: {percentage_complete:.2f}%, {count} of {total_rows}")

        # Milestones are tracked with integer arithmetic: a float test such as
        # `percentage % 5 == 0` skips steps whenever the division is inexact.
        current_step = (count * 20) // total_rows
        if current_step > last_reported_step:
            last_reported_step = current_step
            print(f"Percentage of Completion: {percentage_complete:.2f}%, {count} of {total_rows}")

    # Build every indicator column in one pass instead of writing cell by cell
    for label in topic_labels:
        dataset[label] = [1 if topic == label else 0 for topic in topics]
    dataset['topic'] = topics

    dataset.to_csv(file_path, index=False)
    return dataset


def predict_sentiment(dataset, sentiment_analyzer, file_path, debug=False, batch_size=32):
    """
    Add a 'sentiment' column with the analyzer's predictions, cached as a CSV file.

    When ``file_path`` already exists, the CSV is loaded and returned and no
    prediction is run. Otherwise the 'text' column is classified in batches with
    ``sentiment_analyzer.classifier``, each predicted label is converted with
    ``sentiment_analyzer.map_label_to_target``, the column is added to ``dataset``
    itself (in place) and the result is written to ``file_path``.

    :param dataset: DataFrame with a 'text' column; modified in place on a cache miss.
    :param sentiment_analyzer: Object exposing ``classifier(list_of_texts)`` (returning
        one dict with a 'label' key per text) and ``map_label_to_target(label)``.
    :param file_path: Path of the CSV file used as the cache.
    :param debug: If True, print details for every processed batch.
    :param batch_size: Number of texts sent to the classifier per call (must be >= 1).
    :return: The DataFrame with the 'sentiment' column (computed, or loaded from the cache).
    :raises ValueError: If ``batch_size`` is smaller than 1 on a cache miss.
    """
    if os.path.exists(file_path):
        # Cache hit: reuse the previously predicted sentiments
        print(f"Loading sentiment-augmented dataset from {file_path}")
        return pd.read_csv(file_path)

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Predicting sentiment and saving to {file_path}")
    texts = dataset['text'].tolist()
    total_rows = len(texts)
    sentiments = []
    last_reported_step = 0  # highest 5% milestone that has already been printed

    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        batch_results = sentiment_analyzer.classifier(texts[start:end])
        sentiments.extend(
            sentiment_analyzer.map_label_to_target(result['label']) for result in batch_results
        )

        percentage_complete = (end / total_rows) * 100
        if debug:
            print(f"Processed batch {start // batch_size + 1}: {start} to {end}")
            print(f"Percentage of Completion: {percentage_complete:.2f}%, {end} of {total_rows}")

        # Milestones are tracked with integer arithmetic: a float test such as
        # `percentage % 5 == 0` almost never holds at batch boundaries.
        current_step = (end * 20) // total_rows
        if current_step > last_reported_step:
            last_reported_step = current_step
            print(f"Percentage of Completion: {percentage_complete:.2f}%")

    dataset['sentiment'] = sentiments
    dataset.to_csv(file_path, index=False)
    return dataset

In [None]:
class MetadataExtractor:
    """
    Zero-shot topic extractor that keeps one classification pipeline per device
    and hands them out in round-robin order.
    """

    def __init__(self):
        # One pipeline per visible GPU. Without any GPU, fall back to a single CPU
        # pipeline (device -1) so the extractor is still usable instead of ending
        # up with an empty classifier list.
        gpu_count = torch.cuda.device_count()
        self.devices = list(range(gpu_count)) if gpu_count > 0 else [-1]

        # Initialize the zero-shot classification pipelines with a specific model
        self.MODEL = "roberta-large-mnli"
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
        self.models = [
            AutoModelForSequenceClassification.from_pretrained(self.MODEL, ignore_mismatched_sizes=True).to(
                f'cuda:{device}' if device >= 0 else 'cpu'
            )
            for device in self.devices
        ]
        self.classifiers = [
            pipeline("zero-shot-classification", model=model, tokenizer=self.tokenizer, device=device)
            for model, device in zip(self.models, self.devices)
        ]
        # Position of the classifier that will serve the next request
        self.current_device_index = 0

    def _get_next_classifier(self):
        """
        Get the next classifier in a round-robin manner to distribute the workload.

        :return: The classifier at the current position; the position then advances
                 and wraps around at the end of the classifier list.
        """
        classifier = self.classifiers[self.current_device_index]
        # Wrap on the number of classifiers actually available
        self.current_device_index = (self.current_device_index + 1) % len(self.classifiers)
        return classifier

    def extract_attribute(self, text, candidate_labels, hypothesis_template):
        """
        Extracts an attribute from the given text using the zero-shot classification model.

        :param text: The text to classify.
        :param candidate_labels: A list of strings representing candidate labels.
        :param hypothesis_template: A template for the hypothesis.
        :return: The first entry of the classifier's 'labels' result, used here as
                 the label with the highest probability.
        """
        classifier = self._get_next_classifier()
        result = classifier(text, candidate_labels, hypothesis_template=hypothesis_template)
        return result['labels'][0]

    def extract_topic(self, text, candidate_labels):
        """
        Extracts the topic from the given text.

        :param text: The text to classify.
        :param candidate_labels: A list of candidate topic labels.
        :return: The topic label returned by ``extract_attribute``.
        """
        hypothesis_template = "The topic of this text is {}."
        return self.extract_attribute(text, candidate_labels, hypothesis_template)


In [None]:
class DatasetLoad:
    """
    Loads a sentiment dataset from CSV, normalises it to 'text' / 'category'
    columns and splits it into train (70%), validation (15%) and test (15%) sets.
    """

    def __init__(self, dataset_type, base_path, percentage=100.0, debug=False):
        """
        Initialize the DatasetLoad object.

        :param dataset_type: Type of the dataset ('reddit' or 'tweets').
        :param base_path: Base path where dataset files are located.
        :param percentage: Percentage of each split to keep (values below 100 sub-sample).
        :param debug: Debug flag; stored on the instance but not used by this class.
        """
        self.dataset_type = dataset_type
        self.base_path = base_path
        self.percentage = percentage
        self.debug = debug
        self.train_data = None
        self.test_data = None
        self.val_data = None

    def load_data(self, file_path):
        """
        Load a dataset from a comma-separated CSV file.

        :param file_path: Path to the dataset file, relative to ``base_path``.
        :return: DataFrame containing the file's data.
        :raises FileNotFoundError: If the resolved path does not exist.
        """
        full_path = os.path.join(self.base_path, file_path)
        if not os.path.exists(full_path):
            raise FileNotFoundError(f"File not found: {full_path}")
        data = pd.read_csv(full_path, delimiter=',')
        return data

    def load_datasets(self):
        """
        Load the dataset selected by ``dataset_type``, split it and apply percentage
        sampling if needed. The results are stored in ``train_data``, ``val_data``
        and ``test_data``; the first column is 'text' and the second is 'category'.

        :raises ValueError: If ``dataset_type`` is not 'reddit' or 'tweets'.
        :raises FileNotFoundError: If the dataset file is missing.
        """
        if self.dataset_type == 'reddit':
            print("Loading Reddit dataset...")
            data = self.load_data('datasets/Reddit_Data.csv')
            data = data.rename(columns={'clean_comment': 'text'})
            # Truncate texts longer than 512 characters
            data['text'] = data['text'].str.slice(0, 512)
            # NOTE(review): unlike the 'tweets' branch, the category values are not
            # remapped and missing rows are not dropped here - confirm that the
            # file's labels already match what downstream code expects.

        elif self.dataset_type == 'tweets':
            print("Loading Twitter dataset...")
            data = self.load_data('datasets/Twitter_Data.csv')
            # Drop the ID column
            data = data.drop('Id', axis=1)
            # Convert the textual category to 0 (negative), 1 (neutral), 2 (positive)
            data['category'] = data['Category'].map({'negative': 0, 'neutral': 1, 'positive': 2})
            data = data.drop('Category', axis=1)
            # Rename the 'Tweet' column to 'text'
            data = data.rename(columns={'Tweet': 'text'})
            # Remove placeholder rows, then rows with missing values (this also
            # removes rows whose category was not one of the three mapped names)
            data = data[data['text'] != 'Not Available']
            data = data.dropna()

        else:
            # Fail with a clear message instead of an UnboundLocalError further down
            raise ValueError(
                f"Unsupported dataset type: {self.dataset_type!r} (expected 'reddit' or 'tweets')"
            )

        # Ensure the first column is 'text' and the second column is 'category'
        data = data[['text', 'category'] + [col for col in data.columns if col not in ['text', 'category']]]

        # 70% train; the remaining 30% is split evenly into validation and test
        train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
        self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
        self.train_data = train_data

        if self.percentage < 100.0:
            # Keep only the requested fraction of every split (fixed seed)
            self.train_data = self.train_data.sample(frac=self.percentage / 100.0, random_state=42)
            self.val_data = self.val_data.sample(frac=self.percentage / 100.0, random_state=42)
            self.test_data = self.test_data.sample(frac=self.percentage / 100.0, random_state=42)

In [None]:
class SentimentAnalyzer:
    """
    Bundles a RoBERTa sentiment classifier (model, tokenizer and pipeline) with a
    FLAN-T5 generator that produces synthetic tweets for data augmentation, plus
    helpers to fine-tune the classifier.
    """

    # Lower-cased classifier label -> integer target used by the datasets
    LABEL_TO_TARGET = {"negative": 0, "neutral": 1, "positive": 2}

    def __init__(self):
        self.model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        # `device` follows the pipeline convention: GPU index, or -1 for CPU
        self.device = 0 if torch.cuda.is_available() else -1
        # A negative index is not a valid target for `.to()`, so the models are
        # moved with an explicit torch.device instead of the pipeline index.
        self.torch_device = torch.device("cuda:0" if self.device >= 0 else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, ignore_mismatched_sizes=True
        ).to(self.torch_device)
        self.classifier = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=self.device)

        # Initialize FLAN model for synthetic data generation
        self.flan_model_name = "google/flan-t5-small"
        self.flan_tokenizer = AutoTokenizer.from_pretrained(self.flan_model_name)
        self.flan_model = AutoModelForSeq2SeqLM.from_pretrained(self.flan_model_name).to(self.torch_device)

    def analyze_sentiment(self, text):
        """
        Classify a single text.

        :param text: The text to classify.
        :return: The 'label' field of the first result returned by the classifier.
        """
        results = self.classifier(text)
        return results[0]['label']

    def map_label_to_target(self, label):
        """
        Map a sentiment label to its integer target.

        :param label: Sentiment name; matching ignores case and surrounding whitespace.
        :return: 0 for negative, 1 for neutral, 2 for positive, None for anything else.
        """
        if not isinstance(label, str):
            return None
        return self.LABEL_TO_TARGET.get(label.strip().lower())

    def generate_synthetic_data(self, topic, text, sentiment, n_samples):
        """
        Generate synthetic tweets with the FLAN model.

        :param topic: Topic the generated tweets should relate to.
        :param text: Example text the generated tweets should resemble.
        :param sentiment: Sentiment the generated tweets should express.
        :param n_samples: Number of tweets to generate.
        :return: List of ``n_samples`` generated strings (empty if ``n_samples`` <= 0).
        """
        if n_samples <= 0:
            return []
        prompt = f"Generate a tweet related to {topic} that expresses a {sentiment} sentiment similar to: '{text}' "
        inputs = self.flan_tokenizer(prompt, return_tensors="pt").to(self.flan_model.device)
        # Sampling is required here: decoding the same prompt greedily n times
        # would return the identical text n times and add no new training data.
        outputs = self.flan_model.generate(
            **inputs,
            max_length=60,
            do_sample=True,
            top_p=0.95,
            num_return_sequences=n_samples,
        )
        return self.flan_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    def augment_training_data(self, topics, texts, sentiments, n_samples=6):
        """
        Build synthetic training data for every (topic, text, sentiment) triple.

        :param topics: Iterable of topics.
        :param texts: Iterable of example texts, aligned with ``topics``.
        :param sentiments: Iterable of sentiment names, aligned with ``topics``.
        :param n_samples: Number of synthetic texts requested per triple.
        :return: Tuple ``(df, df_with_topics)``; the first has 'text' and 'label'
                 columns, the second additionally carries 'topic'.
        """
        augmented_data_with_topics = {'text': [], 'label': [], 'topic': []}

        for topic, text, sentiment in zip(topics, texts, sentiments):
            synthetic_texts = self.generate_synthetic_data(topic, text, sentiment, n_samples)
            sentiment_label = self.map_label_to_target(sentiment)
            augmented_data_with_topics['text'].extend(synthetic_texts)
            augmented_data_with_topics['label'].extend([sentiment_label] * len(synthetic_texts))
            augmented_data_with_topics['topic'].extend([topic] * len(synthetic_texts))

        augmented_df_with_topics = pd.DataFrame(augmented_data_with_topics)
        # Same rows without the topic column
        augmented_df = augmented_df_with_topics[['text', 'label']].copy()
        return augmented_df, augmented_df_with_topics

    def fine_tune_with_augmented_data(self, topics, texts, sentiments, n_samples=6, epochs=3, batch_size=16,
                                      learning_rate=2e-5):
        """
        Generate synthetic data and fine-tune the classifier on it.

        :return: Tuple ``(evaluation_results, augmented_df_with_topics)``.
        """
        augmented_train_data, augmented_train_data_with_topics = self.augment_training_data(topics, texts, sentiments,
                                                                                            n_samples)
        return self.fine_tune(augmented_train_data, epochs, batch_size, learning_rate), augmented_train_data_with_topics

    def fine_tune(self, df, epochs=3, batch_size=16, learning_rate=2e-5):
        """
        Fine-tune ``self.model`` on a custom dataset.

        :param df: DataFrame with a 'text' column and a 'category' or 'label' column.
        :param epochs: Number of training epochs.
        :param batch_size: Per-device batch size for training and evaluation.
        :param learning_rate: Learning rate passed to the training arguments.
        :return: The dictionary returned by ``Trainer.evaluate()`` on the held-out 20%.
        """
        # `rename` returns a new frame, so the caller's DataFrame is left untouched
        df = df.rename(columns={"text": "text", "category": "label"})
        df['label'] = df['label'].astype(int)   # Ensure the labels are integers
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

        train_dataset = Dataset.from_pandas(train_df)
        test_dataset = Dataset.from_pandas(test_df)

        def tokenize_function(examples):
            return self.tokenizer(examples["text"], padding="max_length", truncation=True)

        train_dataset = train_dataset.map(tokenize_function, batched=True)
        test_dataset = test_dataset.map(tokenize_function, batched=True)

        # The raw text is no longer needed once it has been tokenized
        train_dataset = train_dataset.remove_columns(["text"])
        test_dataset = test_dataset.remove_columns(["text"])

        train_dataset.set_format("torch")
        test_dataset.set_format("torch")

        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        training_args = TrainingArguments(
            output_dir="./results",
            run_name="finetuning_sentiment_classifier",
            eval_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            data_collator=data_collator,
        )

        # Fine-tune, then evaluate on the held-out split
        trainer.train()
        results = trainer.evaluate()
        print(results)
        return results


In [None]:
# Set the base path
base_path = os.getcwd()

# Load the dataset
dataset_loader = DatasetLoad('tweets', base_path, 100.0)
dataset_loader.load_datasets()
original_train_data = dataset_loader.train_data
original_test_data = dataset_loader.test_data
original_val_data = dataset_loader.val_data

# Initialize the sentiment analyzer
sentiment_analyzer = SentimentAnalyzer()

# Fine-tune or load the sentiment model
model_save_path = os.path.join(base_path, 'sentiment_model_tweets_100.pt')
if os.path.exists(model_save_path):
    # SECURITY NOTE: the checkpoint is a pickled model object, and unpickling can
    # execute arbitrary code - only load files that this notebook produced itself.
    # `weights_only=False` is spelled out because the checkpoint is a whole model,
    # not a state dict; `map_location` lets a GPU-saved model load on a CPU host.
    sentiment_analyzer.model = torch.load(
        model_save_path,
        map_location="cuda:0" if torch.cuda.is_available() else "cpu",
        weights_only=False,
    )
    # The pipeline created in __init__ still wraps the original pre-trained model;
    # rebuild it so predictions actually come from the loaded fine-tuned model.
    sentiment_analyzer.classifier = pipeline(
        "sentiment-analysis",
        model=sentiment_analyzer.model,
        tokenizer=sentiment_analyzer.tokenizer,
        device=sentiment_analyzer.device,
    )
else:
    # Training updates the model object the pipeline already holds
    sentiment_analyzer.fine_tune(original_train_data)
    torch.save(sentiment_analyzer.model, model_save_path)

# Predict sentiment for the datasets (each result is cached in its own CSV file)
train_sentiment_file_name = os.path.join(base_path, 'train_sentiment_tweets_100.csv')
test_sentiment_file_name = os.path.join(base_path, 'test_sentiment_tweets_100.csv')
val_sentiment_file_name = os.path.join(base_path, 'val_sentiment_tweets_100.csv')

# Copies are passed because predict_sentiment adds the 'sentiment' column in place
train_data_with_sentiment = predict_sentiment(original_train_data.copy(), sentiment_analyzer, train_sentiment_file_name)
test_data_with_sentiment = predict_sentiment(original_test_data.copy(), sentiment_analyzer, test_sentiment_file_name)
val_data_with_sentiment = predict_sentiment(original_val_data.copy(), sentiment_analyzer, val_sentiment_file_name)

In [None]:
# Gather true and predicted labels for every split, then print one report per split
train_true_labels = original_train_data['category']
train_predicted_labels = train_data_with_sentiment['sentiment']

test_true_labels = original_test_data['category']
test_predicted_labels = test_data_with_sentiment['sentiment']

val_true_labels = original_val_data['category']
val_predicted_labels = val_data_with_sentiment['sentiment']

for report_heading, y_true, y_pred in (
    ("\nTrain Classification Report:", train_true_labels, train_predicted_labels),
    ("\nTest Classification Report:", test_true_labels, test_predicted_labels),
    ("\nValidation Classification Report:", val_true_labels, val_predicted_labels),
):
    print(report_heading)
    # Labels are fixed to the three sentiment targets; undefined ratios count as 0
    print(classification_report(y_true, y_pred, labels=[0, 1, 2], zero_division=0))

In [None]:
# Build the zero-shot extractor and the list of candidate topics
extractor = MetadataExtractor()
topic_labels = [
    "politics",
    "entertainment",
    "sports",
    "technology",
    "health",
    "education",
    "finance",
    "food",
    "other",
]

# Cache files for the topic-augmented splits
train_file_name = os.path.join(base_path, 'train_augmented_tweets_100.csv')
test_file_name = os.path.join(base_path, 'test_augmented_tweets_100.csv')
val_file_name = os.path.join(base_path, 'val_augmented_tweets_100.csv')

# Each split is copied first because the augmentation adds its columns in place
train_data_with_metadata = augment_and_extract_metadata(
    dataset=train_data_with_sentiment.copy(),
    extractor=extractor,
    topic_labels=topic_labels,
    file_path=train_file_name,
)
test_data_with_metadata = augment_and_extract_metadata(
    dataset=test_data_with_sentiment.copy(),
    extractor=extractor,
    topic_labels=topic_labels,
    file_path=test_file_name,
)
val_data_with_metadata = augment_and_extract_metadata(
    dataset=val_data_with_sentiment.copy(),
    extractor=extractor,
    topic_labels=topic_labels,
    file_path=val_file_name,
)

# Function to create subgroups based on metadata
def create_subgroups(dataset, labels=None):
    """
    Split a dataset into one sub-DataFrame per topic.

    :param dataset: DataFrame with a 'topic' column.
    :param labels: Topics to build subgroups for. When omitted, the notebook-level
                   ``topic_labels`` list is used.
    :return: Dict mapping each topic (as a string) to the rows whose 'topic' equals
             it; topics without any row map to an empty DataFrame.
    """
    if labels is None:
        labels = topic_labels
    return {str(topic): dataset[dataset['topic'] == topic] for topic in labels}

# Per-topic subgroups for the train, test and validation splits
train_subgroups, test_subgroups, val_subgroups = (
    create_subgroups(split_frame)
    for split_frame in (train_data_with_metadata, test_data_with_metadata, val_data_with_metadata)
)

In [None]:
# Function to compute metrics for the subgroups
def compute_metrics(subgroups, true_labels_column='category', pred_labels_column='sentiment'):
    """
    Compute accuracy and weighted precision / recall / F1 for every non-empty subgroup.

    :param subgroups: Dict mapping a topic name to its DataFrame.
    :param true_labels_column: Column holding the true labels.
    :param pred_labels_column: Column holding the predicted labels.
    :return: DataFrame with the columns 'topic', 'accuracy', 'precision', 'recall'
             and 'f1-score', one row per non-empty subgroup. The columns are present
             even when no subgroup has rows, so later merges on 'topic' keep working.
    """
    metric_columns = ['topic', 'accuracy', 'precision', 'recall', 'f1-score']
    metrics = []
    for topic, subgroup in subgroups.items():
        if subgroup.empty:
            # Nothing to score for this topic
            continue
        report = classification_report(
            subgroup[true_labels_column],
            subgroup[pred_labels_column],
            output_dict=True,
            zero_division=0,
        )
        weighted = report['weighted avg']
        metrics.append({
            'topic': topic,
            'accuracy': report['accuracy'],
            'precision': weighted['precision'],
            'recall': weighted['recall'],
            'f1-score': weighted['f1-score']
        })
    return pd.DataFrame(metrics, columns=metric_columns)

# Per-topic metrics for each split
train_metrics = compute_metrics(train_subgroups)
test_metrics = compute_metrics(test_subgroups)
val_metrics = compute_metrics(val_subgroups)

for metrics_heading, metrics_table in (
    ("Train Metrics per Topic", train_metrics),
    ("\nTest Metrics per Topic", test_metrics),
    ("\nValidation Metrics per Topic", val_metrics),
):
    print(metrics_heading)
    print(metrics_table)

In [None]:
# Function to analyze disparities in sentiment predictions
def analyze_disparities(subgroups):
    """
    Summarise the predicted-sentiment distribution of every non-empty subgroup.

    :param subgroups: Dict mapping a subgroup name to a DataFrame with a 'sentiment'
                      column holding the targets 0 (negative), 1 (neutral), 2 (positive).
    :return: DataFrame with the columns 'subgroup', 'total' (row count) and
             'negative' / 'neutral' / 'positive' (percentage of rows per target).
             The columns are present even when every subgroup is empty.
    """
    analysis_columns = ['subgroup', 'total', 'negative', 'neutral', 'positive']
    analysis_results = []
    for subgroup_name, subgroup_data in subgroups.items():
        if subgroup_data.empty:
            continue
        # Share of each sentiment target, expressed as a percentage
        sentiment_counts = subgroup_data['sentiment'].value_counts(normalize=True) * 100
        analysis_results.append({
            'subgroup': subgroup_name,
            'total': len(subgroup_data),
            'negative': sentiment_counts.get(0, 0),
            'neutral': sentiment_counts.get(1, 0),
            'positive': sentiment_counts.get(2, 0),
        })
    return pd.DataFrame(analysis_results, columns=analysis_columns)

# Sentiment distribution per topic for each split
train_analysis = analyze_disparities(train_subgroups)
test_analysis = analyze_disparities(test_subgroups)
val_analysis = analyze_disparities(val_subgroups)

for analysis_heading, analysis_table in (
    ("Train Percentage Analysis", train_analysis),
    ("\nTest Percentage Analysis", test_analysis),
    ("\nValidation Percentage Analysis", val_analysis),
):
    print(analysis_heading)
    print(analysis_table)

In [None]:
def weighted_metrics(metrics_df, support_df, metric='accuracy'):
    """
    Join per-topic metrics with their support and add a support-weighted metric.

    :param metrics_df: DataFrame with a 'topic' column and the ``metric`` column.
    :param support_df: DataFrame with 'subgroup' and 'support' columns.
    :param metric: Name of the metric column to weight.
    :return: New DataFrame (the inputs are not modified) holding the joined rows
             plus 'weighted_metric' = ``metric`` * 'support'.
    """
    joined = metrics_df.merge(support_df, left_on='topic', right_on='subgroup')
    return joined.assign(weighted_metric=joined[metric] * joined['support'])

def get_top_lower_topics(test_metrics_df, test_percentage_analysis_df, metric='accuracy'):
    """
    Pick the strongest topics and the weakest below-average topics.

    Topics are ranked by ``metric`` multiplied by the topic's row count ('total').
    The baseline is the plain mean of ``metric`` over all topics.

    :param test_metrics_df: Per-topic metrics with a 'topic' column and ``metric``.
    :param test_percentage_analysis_df: Per-topic analysis with 'subgroup' and 'total'.
    :param metric: Metric column used for the ranking and for the baseline.
    :return: Tuple ``(top_3_topics, bottom_3_topics_below_baseline)``: the three
             highest-ranked topics, and the (up to) three lowest-ranked topics
             among those whose ``metric`` is below the baseline.
    """
    support_df = test_percentage_analysis_df[['subgroup', 'total']].rename(columns={'total': 'support'})
    weighted_metrics_df = weighted_metrics(test_metrics_df, support_df, metric)
    # Use the selected metric consistently (previously 'accuracy' was hard-coded
    # for the baseline even when another metric had been requested)
    baseline = weighted_metrics_df[metric].mean()
    sorted_metrics = weighted_metrics_df.sort_values(by='weighted_metric', ascending=False)
    top_3_topics = sorted_metrics.head(3)['topic'].tolist()
    below_baseline = sorted_metrics[sorted_metrics[metric] < baseline]
    bottom_3_topics_below_baseline = below_baseline.tail(3)['topic'].tolist()
    return top_3_topics, bottom_3_topics_below_baseline

# `topics` is the tuple (top topics, lowest topics below the baseline)
topics = get_top_lower_topics(val_metrics, val_analysis, metric='accuracy')
# Index 0 holds the top-ranked topics and index 1 the lower-scoring ones, so the
# "lower score" line must print index 1.
print(f"Top 3 validation topics: {topics[0]}")
print(f"Top 3 (lower score) validation topics: {topics[1]}")

In [None]:
def generate_and_augment_data(sentiment_analyzer, topics, train_data_with_metadata, n_samples,
                              sentiment='neutral', random_state=42):
    """
    Generate synthetic texts for the given topics from sampled training rows.

    For every topic, ``n_samples`` rows of that topic are drawn with replacement and
    ``sentiment_analyzer.generate_synthetic_data`` is asked for ``n_samples`` texts
    per drawn row.

    :param sentiment_analyzer: Object exposing
        ``generate_synthetic_data(topic, text, sentiment, n_samples)``.
    :param topics: Topic names; nested lists/tuples of names are flattened.
    :param train_data_with_metadata: DataFrame with 'topic' and 'text' columns.
    :param n_samples: Rows drawn per topic and texts generated per drawn row.
    :param sentiment: Sentiment requested from the generator. The default is
        'neutral' because the notebook labels the generated rows with category 1.
    :param random_state: Seed for the row sampling (None for a random draw).
    :return: Flat list of generated texts, in topic order. Topics without any
             training row contribute nothing.
    """
    # Accept both a flat list of names and a collection of lists of names
    flat_topics = []
    for entry in topics:
        if isinstance(entry, (list, tuple)):
            flat_topics.extend(entry)
        else:
            flat_topics.append(entry)

    synthetic_texts = []
    for topic in flat_topics:
        topic_data = train_data_with_metadata[train_data_with_metadata['topic'] == topic]
        if topic_data.empty:
            # Sampling from an empty frame raises; there is nothing to imitate anyway
            continue
        topic_samples = topic_data.sample(n_samples, replace=True, random_state=random_state)
        for _, row in topic_samples.iterrows():
            # The generator takes four arguments; the sentiment must be passed too
            synthetic_texts.extend(
                sentiment_analyzer.generate_synthetic_data(row['topic'], row['text'], sentiment, n_samples)
            )
    return synthetic_texts

# Generate the synthetic tweets one topic at a time so that every generated text
# is stored next to the topic it was generated for. (Building the 'topic' column
# afterwards from `topics[1]` alone does not match the generated texts, which
# cover both lists in `topics`, and fails when `topics[1]` is empty.)
target_topics = [topic_name for topic_group in topics for topic_name in topic_group]
synthetic_texts = []
synthetic_topics = []
for target_topic in target_topics:
    topic_texts = generate_and_augment_data(sentiment_analyzer, [target_topic], train_data_with_metadata, n_samples=10)
    synthetic_texts.extend(topic_texts)
    synthetic_topics.extend([target_topic] * len(topic_texts))

synthetic_df = pd.DataFrame({
    'text': synthetic_texts,
    'category': [1] * len(synthetic_texts),  # every synthetic row is labelled 1 (neutral)
    'topic': synthetic_topics
})

# Fine-tune on the original training data plus the synthetic rows
augmented_train_data = pd.concat([original_train_data, synthetic_df], ignore_index=True)
augmented_fine_tuning_results = sentiment_analyzer.fine_tune(augmented_train_data)
print(f"Fine-tuning results with augmented data: {augmented_fine_tuning_results}")
