## Loading and Preprocessing the Data

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK resources this notebook relies on: the stop-word lists and
# the WordNet data used by WordNetLemmatizer.
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

def load_and_filter_data(file_path, text_column):
    """
    Load a CSV file and keep only the rows whose text column holds real text.

    A row is dropped when its text value is missing (NaN) or is empty once
    surrounding whitespace has been stripped.

    Args:
    file_path (str): Path to the CSV file.
    text_column (str): Name of the column containing text comments.

    Returns:
    pd.DataFrame: Independent copy of the rows with non-empty text comments.
    The original row labels are kept (the index is not reset).

    Raises:
    KeyError: If text_column is not a column of the loaded CSV.
    """
    data = pd.read_csv(file_path)
    text = data[text_column]

    # astype(str) keeps the comparison valid even if the column was parsed as
    # a non-string dtype; NaN rows are excluded separately by notna().
    has_text = text.notna() & (text.astype(str).str.strip() != '')

    # Return an explicit copy so callers can add or overwrite columns without
    # pandas complaining about writing into a slice of the full frame.
    return data.loc[has_text].copy()

def preprocess_data(df):
    """
    Clean the comment text, scale the engagement counts and add time features.

    The input frame is not modified: all work happens on a copy, so calling
    this again on the same raw frame produces the same result.

    Steps:
    - 'text': lower-case, remove every character that is not an ASCII letter
      or whitespace, drop English stop words, lemmatize the remaining tokens.
    - 'diggCount' and 'replyCommentTotal': min-max scale each column with its
      own scaler (missing 'replyCommentTotal' values are set to 0 first).
    - 'createTimeISO': parse into 'createTime', then derive 'day_of_week'
      and 'hour' from it.

    Args:
    df (pd.DataFrame): DataFrame to preprocess. Must contain the columns
        'text', 'diggCount', 'replyCommentTotal' and 'createTimeISO'.

    Returns:
    pd.DataFrame: New, preprocessed DataFrame.
    """
    # Work on a copy: the caller's frame (often a filtered slice) must not be
    # mutated, and re-running the cell must not re-scale already scaled data.
    df = df.copy()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _clean_tokens(comment):
        # Drop stop words, then lemmatize whatever is left.
        return ' '.join(
            lemmatizer.lemmatize(word)
            for word in comment.split()
            if word not in stop_words
        )

    # One pattern is enough: digits are not letters, so a separate digit
    # removal pass would not change the result.
    df['text'] = (
        df['text']
        .str.lower()
        .str.replace(r'[^a-zA-Z\s]', '', regex=True)
        .apply(_clean_tokens)
    )

    # Each column gets its own scaler so the fitted ranges stay independent.
    # fit_transform returns shape (n, 1); ravel() flattens it to one column.
    scaler_digg = MinMaxScaler()
    scaler_reply = MinMaxScaler()
    df['diggCount'] = scaler_digg.fit_transform(df[['diggCount']]).ravel()
    df['replyCommentTotal'] = df['replyCommentTotal'].fillna(0)
    df['replyCommentTotal'] = scaler_reply.fit_transform(df[['replyCommentTotal']]).ravel()

    # Convert and extract datetime features
    df['createTime'] = pd.to_datetime(df['createTimeISO'])
    df['day_of_week'] = df['createTime'].dt.dayofweek
    df['hour'] = df['createTime'].dt.hour

    return df

# Source CSV exports (bare file names, so they resolve against the working directory).
file_path1 = 'dataset_tiktok-comments-scraper_2024-04-28_23-16-10-409.csv'
# Second export, currently disabled:
# file_path2 = 'dataset_free-tiktok-scraper_2024-04-28_21-22-00-488.csv'

# Load Dataset 1, keep only rows with a non-empty 'text' comment, then preprocess.
# Dataset 2 would go through the same two calls once enabled
# (update 'text' if its comment column has a different name).
dataset1 = preprocess_data(load_and_filter_data(file_path1, 'text'))

# Quick look at the result: first rows, summary statistics, overall size.
print("Preprocessed Dataset 1:")
print(dataset1.head())
display(dataset1.describe())

rows = dataset1.shape[0]
columns = dataset1.shape[1]

print("Number of rows:", rows)
print("Number of columns:", columns)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ulugsali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ulugsali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed Dataset 1:
                                     avatarThumbnail                  cid  \
0  https://p16-sign-va.tiktokcdn.com/tos-maliva-a...  7247915369984312070   
1  https://p16-sign-va.tiktokcdn.com/tos-maliva-a...  7216521968188392238   
2  https://p16-sign-va.tiktokcdn.com/tos-maliva-a...  7247888310218408710   
3  https://p16-sign-va.tiktokcdn.com/tos-maliva-a...  7216606811569488682   
4  https://p16-sign-va.tiktokcdn.com/tos-maliva-a...  7216822967849566982   

                 createTime             createTimeISO  diggCount  repliesToId  \
0 2023-06-23 16:13:32+00:00  2023-06-23T16:13:32.000Z   0.016535          NaN   
1 2023-03-31 01:51:10+00:00  2023-03-31T01:51:10.000Z   0.020418          NaN   
2 2023-06-23 14:28:30+00:00  2023-06-23T14:28:30.000Z   0.004627          NaN   
3 2023-03-31 07:20:56+00:00  2023-03-31T07:20:56.000Z   0.011495          NaN   
4 2023-03-31 21:19:16+00:00  2023-03-31T21:19:16.000Z   0.008653          NaN   

   replyCommentTotal      

Unnamed: 0,cid,diggCount,repliesToId,replyCommentTotal,uid,day_of_week,hour
count,21136.0,21136.0,0.0,21136.0,21136.0,21136.0,21136.0
mean,7.183996e+18,0.011551,,0.004865,6.489387e+18,3.030564,12.168149
std,1.631011e+17,0.040078,,0.015513,1.607452e+18,1.969297,7.23082
min,1614742000000000.0,0.0,,0.0,97201.0,0.0,0.0
25%,7.098999e+18,1.1e-05,,0.0,6.745077e+18,1.0,5.0
50%,7.212062e+18,0.000359,,0.000418,6.837581e+18,3.0,13.0
75%,7.306264e+18,0.004803,,0.003343,7.003809e+18,5.0,19.0
max,7.363034e+18,1.0,,1.0,7.362077e+18,6.0,23.0


Number of rows: 21136
Number of columns: 14


In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Identifier of the pretrained checkpoint; tokenizer and model must come from
# the same checkpoint, so the name is written once and reused.
MODEL_CHECKPOINT = "SamLowe/roberta-base-go_emotions"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

# Smoke test: encode one sentence as PyTorch tensors and run it through the model.
text = "This is a sample text."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [18]:
from torch.utils.data import Dataset

class BrandPerceptionDataset(Dataset):
    """
    Dataset that pairs each text with its aspect label and tokenizes on access.

    Args:
    texts: Indexable, sized collection of strings (e.g. list or pandas Series).
    aspect_labels: Indexable, sized collection of labels aligned with texts.
    tokenizer: Callable invoked as tokenizer(text, truncation=True,
        padding='max_length', max_length=..., return_tensors='pt'); its result
        must provide 'input_ids' and 'attention_mask' tensors.
    max_length (int): Value forwarded to the tokenizer as max_length.

    Raises:
    ValueError: If texts and aspect_labels differ in length.
    """

    def __init__(self, texts, aspect_labels, tokenizer, max_length):
        # A length mismatch would otherwise surface later as an IndexError
        # mid-iteration, or leave some labels silently unused.
        if len(texts) != len(aspect_labels):
            raise ValueError(
                f"texts and aspect_labels must have the same length "
                f"(got {len(texts)} and {len(aspect_labels)})"
            )
        self.texts = texts
        self.aspect_labels = aspect_labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) examples."""
        return len(self.texts)

    @staticmethod
    def _item_at(values, position):
        """Return the element of values at an integer position."""
        # pandas objects resolve [] by index label, which raises KeyError once
        # rows have been filtered out; .iloc is always positional.
        if hasattr(values, 'iloc'):
            return values.iloc[position]
        return values[position]

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, aspect_label) for example idx."""
        text = self._item_at(self.texts, idx)
        aspect_label = self._item_at(self.aspect_labels, idx)

        # Tokenize text and convert to input_ids and attention_mask
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes only the leading batch axis; a bare squeeze()
        # would also collapse the sequence axis when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return input_ids, attention_mask, aspect_label

# NOTE(review): this call raises TypeError as written. __init__ takes four
# arguments (texts, aspect_labels, tokenizer, max_length) and only one is
# passed here. TODO: supply the text column, the matching aspect labels, a
# tokenizer and a max_length; the label source is not defined in this notebook.
BrandPerceptionDataset(dataset1)

TypeError: BrandPerceptionDataset.__init__() missing 3 required positional arguments: 'aspect_labels', 'tokenizer', and 'max_length'