## Loading the Data and Processing the Data

In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def load_and_filter_data(file_path, text_column):
    """
    Load the dataset from a CSV file and filter rows where the text column contains text.
    
    Args:
    file_path (str): Path to the CSV file.
    text_column (str): Name of the column containing text comments.
    
    Returns:
    pd.DataFrame: Filtered DataFrame with non-empty text comments.
    """
    data = pd.read_csv(file_path)
    filtered_data = data[data[text_column].notna() & (data[text_column] != '')]
    return filtered_data[['text']]  # Return only the text column

def preprocess_text(texts):
    """
    Preprocess the text data.
    
    Args:
    texts (pd.Series): Series containing text data.
    
    Returns:
    pd.Series: Preprocessed text data.
    """
    # Convert to lowercase and remove digits and non-alphabetic characters
    texts = texts.str.lower().str.replace(r'\d+', '', regex=True).str.replace(r'[^a-zA-Z\s]', '', regex=True)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    texts = texts.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words]))
    
    return texts

# Paths to your dataset files
file_path1 = 'dataset_tiktok-comments-scraper_2024-04-28_23-16-10-409.csv'
file_path2 = 'dataset_free-tiktok-scraper_2024-04-28_21-22-00-488.csv'

# Load and filter datasets
dataset1 = load_and_filter_data(file_path1, 'text')
dataset2 = load_and_filter_data(file_path2, 'text')

# Preprocess datasets
dataset1['text'] = preprocess_text(dataset1['text'])
dataset2['text'] = preprocess_text(dataset2['text'])

# Display the first few rows of the preprocessed datasets
print("Preprocessed Dataset 1:")
print(dataset1.head())
print("\nPreprocessed Dataset 2:")
print(dataset2.head())

# Print dataset shapes
print("Number of rows in Dataset 1:", dataset1.shape[0])
print("Number of rows in Dataset 2:", dataset2.shape[0])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ulugsali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ulugsali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  data = pd.read_csv(file_path)


Preprocessed Dataset 1:
                                                text
0                         maybe simpson real cartoon
1                             think designer purpose
2           waiting model back flip like one simpson
3  collab balenciaga yall look thing simpson put ...
4                                      video created

Preprocessed Dataset 2:
                                                text
0                    simpson v balenciaga fyp foryou
1  responder estatudoaquitudoo simpson x model th...
2  balenciaga x simpson balenciaga thesimpsons ca...
3  somebody think shoe balenciaga mudpit pfw fash...
4          balenciaga balenciaga balenciagacancelled
Number of rows in Dataset 1: 21136
Number of rows in Dataset 2: 2285


In [4]:
!pip install --upgrade openai

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
Collecting openai
  Downloading openai-1.24.0-py3-none-any.whl (312 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.12.0
    Uninstalling openai-1.12.0:
      Successfully uninstalled openai-1.12.0
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[33mDEPRECATION: Configuring installation

In [14]:
merged_dataset = pd.read_csv('Merged_Cleaned_Dataset.csv')
merged_dataset.head(30)

Unnamed: 0,text
0,maybe simpson real cartoon
1,think designer purpose
2,waiting model back flip like one simpson
3,collab balenciaga yall look thing simpson put ...
4,video created
5,nobody nitice queen purse ground
6,comment english
7,yes
8,oop
9,actually plan create like plan u believe simps...


In [16]:
import pandas as pd
import requests

# Set up OpenAI API
api_key = "sk-proj-nzxRWSLF2BlxuIZDtD50T3BlbkFJEbLOkIlSA9KlwIJJuCQz"  # Set your OpenAI API key. Don't share this key and don't distribute a notebook that contains your key.

# Load your dataset
merged_dataset = pd.read_csv('Merged_Cleaned_Dataset.csv')

# Define function to perform sentiment analysis using OpenAI API
def perform_sentiment_analysis(text):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "davinci-002",  # Update with a supported model
        "prompt": f"Label the element of brand perception using one of these labels ('product quality', 'reputation & heritage', 'customer service', 'social impact', 'ethical practices', and 'sustainability') you can choose more than one based on this text as: '{text}'",
        "max_tokens": 60,
        "temperature": 0,  # Ensure deterministic output
        "logprobs": 10,    # Include log probabilities for each token
    }
    try:
        response = requests.post("https://api.openai.com/v1/completions", json=data, headers=headers)
        response.raise_for_status()  # Raise exception for HTTP errors
        response_json = response.json()
        print("Response JSON:", response_json)  # Debug print
        # Extract sentiment label from the response
        sentiment_label = response_json['choices'][0]['text'].strip()
        return sentiment_label
    except requests.exceptions.RequestException as e:
        print("Error performing API request:", e)
        return "Error"
    except (KeyError, IndexError) as e:
        print("Error processing API response:", e)
        return "Error"

# Apply sentiment analysis to each text instance in the dataset
merged_dataset['sentiment'] = merged_dataset['text'].apply(perform_sentiment_analysis)

# Save the updated dataset with sentiment labels
merged_dataset.to_csv('Labeled_Dataset_Sentiment.csv', index=False)

Response JSON: {'id': 'cmpl-9JYeRoShZO3kXscRXdtbQrBPGaNb8', 'object': 'text_completion', 'created': 1714448459, 'model': 'davinci-002', 'choices': [{'text': " and 'maybe simpson real cartoon' and 'maybe simpson real cartoon' and 'maybe simpson real cartoon' and 'maybe simpson real cartoon' and 'maybe simpson real cartoon' and 'maybe simpson real cartoon' and 'maybe simpson real cartoon' and 'maybe simp", 'index': 0, 'logprobs': {'tokens': [' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp', 'son', ' real', ' cartoon', "'", ' and', " '", 'maybe', ' simp'], 'token_logprobs': [-2.2934031, -0.44233558, -1.721588, -0.9156443, -

KeyboardInterrupt: 

In [None]:
import pandas as pd
import openai
from openai import OpenAI
client = OpenAI()

# Load your CSV file
df = pd.read_csv('Merged_Cleaned_Dataset.csv')

# Set your API key
openai.api_key = "sk-proj-nzxRWSLF2BlxuIZDtD50T3BlbkFJEbLOkIlSA9KlwIJJuCQz"

def label_data(text):
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            max_tokens = 200,
            messages=[
                        {"role": "system", "content": "You are a helpful label assistant. Label the element of brand perception using one of these labels ('product quality', 'reputation & heritage', 'customer service', 'social impact', 'ethical practices', and 'sustainability') you can choose more than one."},
                        {"role": "user", "content": f"Label it based on this text as: '{text}'"},
                    ],
        )
        print(response)
        choices = response.choices[0]
        text = choices.message.content
    except Exception as e:
        print("An error occurred:", e)
        return None

# Apply the function to your DataFrame
df['label'] = df['text'].apply(label_data)

# Save the updated DataFrame
df.to_csv('Merged_Cleaned_Dataset_Labeled_API.csv', index=False)


In [41]:
import pandas as pd
# Read the CSV file
df_label = pd.read_csv('Merged_Cleaned_Dataset_Labeled_API.csv')

# Convert all values to strings
df_label = df_label.astype(str)

# Save the updated DataFrame to a new CSV file
# df.to_csv('updated_file.csv', index=False)
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23421 entries, 0 to 23420
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    23421 non-null  object
 1   label   23421 non-null  object
dtypes: object(2)
memory usage: 366.1+ KB


In [43]:
# Create separate fields for each label
df_label['product quality'] = 0
df_label['reputation & heritage'] = 0
df_label['customer service'] = 0
df_label['social impact'] = 0
df_label['ethical practices'] = 0
df_label['sustainability'] = 0

# Update the values based on the "label" field
df_label.loc[df_label['label'].str.contains('product quality'), 'product quality'] = 1
df_label.loc[df_label['label'].str.contains('reputation & heritage'), 'reputation & heritage'] = 1
df_label.loc[df_label['label'].str.contains('customer service'), 'customer service'] = 1
df_label.loc[df_label['label'].str.contains('social impact'), 'social impact'] = 1
df_label.loc[df_label['label'].str.contains('ethical practices'), 'ethical practices'] = 1
df_label.loc[df_label['label'].str.contains('sustainability'), 'sustainability'] = 1

df_label[df_label["text"] == "yall know stuff yeah shes shopper people basically maid rich people shop gt time use service"]


Unnamed: 0,text,label,product quality,reputation & heritage,customer service,social impact,ethical practices,sustainability
846,yall know stuff yeah shes shopper people basic...,customer service,0,0,1,0,0,0


In [44]:
df_label.to_csv('Labeled_Df.csv', index=False)

# Training

In [12]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from modules.BrandPerceptionModel import BrandPerceptionModel
from datasets.brand_perception_dataset import BrandPerceptionDataset

# Uploading file:
df_label = pd.read_csv('Labeled_Df.csv')

# Define your training parameters
num_epochs = 5
batch_size = 32
learning_rate = 2e-5



# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")

# Define maximum sequence length
max_length = 128

# Correcting the data preparation for tokenizer input
df_label['text'] = df_label['text'].fillna('').astype(str)
texts = df_label['text'].tolist()
aspect_labels = df_label.loc[:, "product quality":"sustainability"].values.tolist()

# Ensure aspect_labels is correctly formatted as a list of lists

# Create dataset
train_dataset = BrandPerceptionDataset(texts, aspect_labels, tokenizer, max_length)

# Continue with your existing DataLoader and training setup
# Define your training parameters
num_epochs = 5
batch_size = 32
learning_rate = 2e-5


# Example data (replace with your actual data)
# Correcting the data preparation for tokenizer input
texts = df_label["text"].tolist()  # Convert Series to list of strings

# Ensure aspect_labels is correctly formatted as a list of lists
aspect_labels = df_label.loc[:, "product quality":"sustainability"].values.tolist()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")

# Create dataset
train_dataset = BrandPerceptionDataset(texts, aspect_labels, tokenizer, max_length)

# Define maximum sequence length
max_length = 128

# Create dataset
train_dataset = BrandPerceptionDataset(texts, aspect_labels, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define your loss function for aspect identification
criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy Loss for multi-label classification

model = BrandPerceptionModel()

# Define your optimizer
optimizer = AdamW(model.aspect_classifier.parameters(), lr=learning_rate)  # Only optimize parameters of aspect identification layer

# Define your learning rate scheduler
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Set model to training mode
model.train()

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    
    for batch in train_dataloader:
        # Extract input data and labels from batch
        input_ids, attention_mask, aspect_labels = batch
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        aspect_logits = outputs.logits  # Directly use logits from the model output
        
        # Compute loss
        loss = criterion(aspect_logits, aspect_labels.float())
        total_loss += loss.item()
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping to prevent exploding gradients
        optimizer.step()
        scheduler.step()

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    
    # Print epoch loss
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    



AttributeError: 'SequenceClassifierOutput' object has no attribute 'pooler_output'