In [1]:
import pandas as pd

# Read the raw reviews file; every later cell derives its data from this frame
df_original = pd.read_csv(filepath_or_buffer='Reviews.csv')

In [2]:
# Untouched snapshot of the raw data, kept separate from the working frames
df_backup = df_original.copy()

# The only two columns used downstream
columns_to_keep = ['Text', 'Score']

# Drop everything else (original column order is preserved), then expose
# the 'Score' column under the name 'label'
df_selected = (
    df_original
    .drop(columns=[name for name in df_original.columns if name not in columns_to_keep])
    .rename(columns={'Score': 'label'})
)

In [50]:
from sklearn.base import TransformerMixin, BaseEstimator # Import necessary classes for creating custom transformers

# Define a custom transformer to modify the score column
class modify_score(TransformerMixin, BaseEstimator):
  """
  Stateless transformer that remaps the values of one DataFrame column through a lookup table.

  Typical use is collapsing numeric scores into a smaller set of class ids, but any
  value-to-value mapping works.
  """

  def __init__(self, labels, column):
    """
    Store the configuration of the transformer.

    Args:
      labels: Dictionary of {old value: new value} pairs.
      column: Name of the column whose values are remapped.
    """
    self.labels = labels
    self.column = column

  def fit(self, X, y=None):
    """
    No-op kept for compatibility with the scikit-learn transformer interface.

    Args:
      X: The input data (DataFrame); ignored.
      y: The target labels; ignored.

    Returns:
      self: The transformer instance, unchanged.
    """
    return self

  def transform(self, X, y=None):
    """
    Return a copy of X in which the configured column has been passed through the mapping.

    Args:
      X: The input data (DataFrame); it is not modified.
      y: The target labels; ignored.

    Returns:
      A new DataFrame identical to X except for the remapped column.
    """
    remapped = X[self.column].map(self.labels)  # look every value up in the mapping
    result = X.copy()  # the caller's frame must stay as it was
    result[self.column] = remapped
    return result

In [49]:
from sklearn.base import TransformerMixin, BaseEstimator # Importing necessary classes from scikit-learn

# Define a custom transformer class to equalize the number of samples for each sentiment
class equalize_sentiments(TransformerMixin, BaseEstimator):
  """
  Balances a DataFrame so that every class of a label column contributes the same number of rows.

  Each class is undersampled (without replacement) to `target_count` rows or, when
  `target_count` is None, to the size of the smallest class. The balanced rows are then
  shuffled and re-indexed from 0.
  """

  def __init__(self, random_state, column, target_count=None):
    """
    Initializes the equalize_sentiments transformer.

    Args:
      random_state: Seed used both for the per-class sampling and for the final shuffle.
      column: The name of the column containing the class labels.
      target_count: Number of rows to keep per class. None (the default) means
        "use the size of the smallest class".
    """
    self.random_state = random_state
    self.column = column
    self.target_count = target_count

  def fit(self, X, y=None):
    """
    This method is required by scikit-learn's transformer interface, but it doesn't perform any fitting in this case.

    Args:
      X: The input data (DataFrame).
      y: The target labels (not used here).

    Returns:
      self: Returns the transformer instance.
    """
    return self

  def transform(self, X, y=None):
    """
    Samples the same number of rows from every class and shuffles the result.

    Args:
      X: The input data (DataFrame); it is not modified.
      y: The target labels (not used here).

    Returns:
      df_balanced: A shuffled DataFrame with an equal number of rows per class and a fresh 0..n-1 index.

    Raises:
      ValueError: If `target_count` is larger than the smallest class.
    """
    # Iterate over the groups instead of using groupby.apply: apply operating on the
    # grouping column is deprecated in recent pandas, and once the grouping column is
    # excluded from apply's input the label column would vanish from the result.
    class_frames = [frame for _, frame in X.groupby(self.column)]
    if not class_frames:
      # No labelled rows at all, so there is nothing to balance
      return X.iloc[0:0].reset_index(drop=True)

    smallest_class = min(len(frame) for frame in class_frames)
    per_class = smallest_class if self.target_count is None else self.target_count
    if per_class > smallest_class:
      raise ValueError(
          f"target_count={per_class} exceeds the smallest class in "
          f"'{self.column}', which has only {smallest_class} rows"
      )

    # Every class is sampled with the same seed, in group-key order
    sampled_frames = [
        frame.sample(n=per_class, random_state=self.random_state)
        for frame in class_frames
    ]
    df_balanced = pd.concat(sampled_frames, ignore_index=True)

    # Shuffle so the classes are interleaved rather than stacked one after another
    return df_balanced.sample(frac=1, random_state=self.random_state).reset_index(drop=True)

In [51]:
import re # Import the regular expression library for text manipulation

# Define a class for cleaning text data
class clean_text(TransformerMixin, BaseEstimator):
  """
  This transformer normalises a text column. For every value it:
    1. replaces HTML tags with a space,
    2. removes every character that is not an ASCII letter, a digit or whitespace,
    3. converts the text to lowercase,
    4. strips leading/trailing whitespace and collapses whitespace runs into a single space.
  """

  def __init__(self, column):
    """
    Initializes the clean_text transformer.

    Args:
      column: The name of the column containing text data.
    """
    self.column = column

  def clean_text_fn(self, text):
    """
    Cleans a single text string.

    Args:
      text: The input text string.

    Returns:
      text: The cleaned text string.
    """
    # Substitute a space (not the empty string) for each tag: with an empty replacement,
    # "great.<br />Bad" would end up as the single word "greatbad".
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower() # Remove non-alphanumeric characters and convert to lowercase
    text = text.strip() # Remove leading and trailing whitespace
    text = re.sub(r'\s+', ' ', text) # Collapse whitespace runs, including those left behind by removed tags
    return text

  def fit(self, X, y=None):
    """
    This method is required by scikit-learn's transformer interface but doesn't perform any fitting here.

    Args:
      X: The input data (DataFrame).
      y: The target labels (not used here).

    Returns:
      self: Returns the transformer instance.
    """
    return self

  def transform(self, X, y=None):
    """
    Cleans the text data in the specified column of the DataFrame.

    Args:
      X: The input data (DataFrame); it is not modified.
      y: The target labels (not used here).

    Returns:
      X_copy: A copy of the input DataFrame with the cleaned text column.
    """
    X_copy = X.copy() # Work on a copy so the caller's DataFrame keeps the raw text
    X_copy[self.column] = X_copy[self.column].apply(self.clean_text_fn)
    return X_copy

In [52]:
from sklearn.base import TransformerMixin, BaseEstimator

class shrink_dataset(TransformerMixin, BaseEstimator):
  """
  Transformer that cuts a DataFrame down to a fixed number of randomly chosen rows.
  """

  def __init__(self, n_samples, random_state):
    """
    Store the sampling configuration.

    Args:
      n_samples: How many rows the reduced dataset should contain.
      random_state: Seed that makes the random selection reproducible.
    """
    self.n_samples = n_samples
    self.random_state = random_state

  def fit(self, X, y=None):
    """
    No-op kept for compatibility with the scikit-learn transformer interface.

    Args:
      X: The input data (DataFrame); ignored.
      y: The target labels; ignored.

    Returns:
      self: The transformer instance, unchanged.
    """
    return self

  def transform(self, X, y=None):
    """
    Draw the configured number of rows from X.

    Args:
      X: The input data (DataFrame).
      y: The target labels; ignored.

    Returns:
      A DataFrame holding the sampled rows; they keep the index labels they had in X.
    """
    sampling_options = {'n': self.n_samples, 'random_state': self.random_state}
    return X.sample(**sampling_options)

In [53]:
from sklearn.base import TransformerMixin, BaseEstimator

class filter_large_texts(TransformerMixin, BaseEstimator):
  """
  This transformer filters out texts that exceed a specified token length.

  The token count of every kept row is exposed in a 'num_tokens' column of the returned
  DataFrame. The input DataFrame itself is left untouched.
  """

  def __init__(self, max_length, column, tokenizer):
    """
    Initializes the filter_large_texts transformer.

    Args:
      max_length: The maximum allowed token length (inclusive).
      column: The name of the column containing text data.
      tokenizer: Object whose `encode(text, truncation=False)` returns the token sequence
        of a text; its length is used as the token count.
    """
    self.max_length = max_length
    self.column = column
    self.tokenizer = tokenizer

  def fit(self, X, y=None):
    """
    This method is required by scikit-learn but doesn't perform any fitting here.

    Args:
      X: The input data (DataFrame).
      y: The target labels (not used here).

    Returns:
      self: Returns the transformer instance.
    """
    return self

  def transform(self, X, y=None):
    """
    Filters out texts with token lengths exceeding the specified maximum.

    Args:
      X: The input data (DataFrame); it is not modified.
      y: The target labels (not used here).

    Returns:
      X_filtered: A re-indexed DataFrame containing only the rows within the allowed
        token length, plus a 'num_tokens' column.
    """
    def count_tokens(text):
      # truncation=False so that over-long texts report their real length
      return len(self.tokenizer.encode(text, truncation=False))

    # Work on a copy: adding 'num_tokens' directly to X would silently alter the
    # caller's DataFrame, unlike the other transformers in this notebook.
    X_counted = X.copy()
    X_counted['num_tokens'] = X_counted[self.column].apply(count_tokens)

    within_limit = X_counted['num_tokens'] <= self.max_length
    X_filtered = X_counted[within_limit].reset_index(drop=True)
    return X_filtered

In [54]:
import tensorflow as tf
from sklearn.base import TransformerMixin, BaseEstimator

class personal_tokenizer(TransformerMixin, BaseEstimator):
  """
  This transformer tokenizes a text column and bundles the result with the labels
  into a dictionary of TensorFlow tensors.
  """

  def __init__(self, tokenizer, column, max_length, label_column='label'):
    """
    Initializes the personal_tokenizer transformer.

    Args:
      tokenizer: The tokenizer to use for text tokenization. It is called with a list of
        strings and must return a mapping with 'input_ids' and 'attention_mask'.
      column: The name of the column containing text data.
      max_length: The maximum token length for padding/truncation.
      label_column: The name of the column containing the labels. Defaults to 'label',
        the column name this transformer previously had hard-coded.
    """
    self.tokenizer = tokenizer
    self.column = column
    self.max_length = max_length
    self.label_column = label_column

  def fit(self, X, y=None):
    """
    This method is required by scikit-learn but doesn't perform any fitting here.

    Args:
      X: The input data (DataFrame).
      y: The target labels (not used here).

    Returns:
      self: Returns the transformer instance.
    """
    return self

  def transform(self, X, y=None):
    """
    Tokenizes text data, adds padding/truncation, and creates a dictionary
    containing input IDs, attention masks, and labels.

    Args:
      X: The input data (DataFrame) holding the text column and the label column.
      y: The target labels (not used here; labels are read from X).

    Returns:
      tokenized_data: A dictionary with the keys 'input_ids', 'attention_mask' and 'labels'.
    """
    # Every text is padded or truncated to exactly max_length tokens
    tokenized_texts = self.tokenizer(
        X[self.column].tolist(),
        padding='max_length',
        truncation=True,
        max_length=self.max_length,
        return_tensors='tf'
    )
    tokenized_data = {
        'input_ids': tokenized_texts['input_ids'],
        'attention_mask': tokenized_texts['attention_mask'],
        'labels': tf.convert_to_tensor(X[self.label_column].values)
    }
    return tokenized_data

In [None]:
# Setting up the model and tokenizer
from transformers import TFRobertaForSequenceClassification, RobertaTokenizer

# Tokenizer and classifier are both loaded from the same pre-trained sentiment checkpoint
MODEL_CHECKPOINT = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFRobertaForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

In [59]:
from sklearn.pipeline import Pipeline

# Review score -> sentiment class id
score_to_label = {
    1: 0, 2: 0,  # negative
    3: 1,        # neutral
    4: 2, 5: 2,  # positive
}

# Steps run in this order; each one receives the output of the previous one.
# NOTE(review): 42640 is presumably the size of the smallest class — TODO confirm.
preprocessing_steps = [
    ('modify_score', modify_score(labels=score_to_label, column='label')),
    ('equalize_sentiments', equalize_sentiments(random_state=42, column='label', target_count=42640)),
    ('clean_text', clean_text(column='Text')),
    ('shrink_dataset', shrink_dataset(n_samples=10000, random_state=42)),
    ('filter_large_texts', filter_large_texts(max_length=512, column='Text', tokenizer=tokenizer)),
    ('personal_tokenizer', personal_tokenizer(tokenizer=tokenizer, column='Text', max_length=512)),
]
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

In [60]:
# Run every preprocessing step, in order, on the selected columns.
# NOTE: despite its name, `df_balanced` is not a DataFrame. The last pipeline step
# (personal_tokenizer) returns a dict with the keys 'input_ids', 'attention_mask'
# and 'labels'.
df_balanced = preprocessing_pipeline.fit_transform(df_selected)
df_balanced

{'input_ids': <tf.Tensor: shape=(9930, 512), dtype=int32, numpy=
 array([[    0,   627,  2272, ...,     1,     1,     1],
        [    0,  9226,  4076, ...,     1,     1,     1],
        [    0, 17075, 10928, ...,     1,     1,     1],
        ...,
        [    0,   808,    57, ...,     1,     1,     1],
        [    0, 25252,  2115, ...,     1,     1,     1],
        [    0,   627,   449, ...,     1,     1,     1]], dtype=int32)>,
 'attention_mask': <tf.Tensor: shape=(9930, 512), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'labels': <tf.Tensor: shape=(9930,), dtype=int64, numpy=array([2, 1, 0, ..., 2, 2, 2])>}

In [61]:
# Token ids produced by the tokenizer, one row per text, padded/truncated to max_length
df_balanced['input_ids']

<tf.Tensor: shape=(9930, 512), dtype=int32, numpy=
array([[    0,   627,  2272, ...,     1,     1,     1],
       [    0,  9226,  4076, ...,     1,     1,     1],
       [    0, 17075, 10928, ...,     1,     1,     1],
       ...,
       [    0,   808,    57, ...,     1,     1,     1],
       [    0, 25252,  2115, ...,     1,     1,     1],
       [    0,   627,   449, ...,     1,     1,     1]], dtype=int32)>

In [62]:
# Attention mask returned by the tokenizer alongside the token ids
df_balanced['attention_mask']

<tf.Tensor: shape=(9930, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>

In [63]:
# Values of the 'label' column converted to a tensor, one entry per tokenized text
df_balanced['labels']

<tf.Tensor: shape=(9930,), dtype=int64, numpy=array([2, 1, 0, ..., 2, 2, 2])>

In [44]:
# Stand-alone check of the first pipeline step: remap the values in the 'label'
# column of df_selected through score_to_label, without running the other steps.
# Requires the cell that defines score_to_label to have been executed first.
score_modifier = modify_score(labels=score_to_label, column='label')

# fit() is a no-op for this transformer, so fit_transform simply applies the mapping
df_modified_score = score_modifier.fit_transform(df_selected)

# Show the frame with the remapped 'label' column
print(df_modified_score)
