In [None]:
!pip install datasets dask keras tensorflow bitsandbytes transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocess

In [None]:
import nltk
import re
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# preprocessing function
from tqdm.auto import tqdm
from datasets import load_dataset
nltk.download('punkt_tab')

def process_text(text):
    """Normalise a raw sentence and split it into word tokens.

    The text is lower-cased and stripped of surrounding whitespace, every
    character that is neither a word character nor whitespace is removed,
    and what remains is tokenised with ``nltk.word_tokenize``.
    """
    normalised = text.lower().strip()
    without_punctuation = re.sub(r'[^\w\s]', '', normalised)
    return nltk.word_tokenize(without_punctuation)

def preprocess_texts(texts):
    """Tokenise every entry of ``texts`` with ``process_text`` behind a progress bar.

    Returns a list with one token list per input text, in input order.
    """
    return [process_text(text) for text in tqdm(texts)]

import pickle
import os

# The raw dataset is loaded unconditionally: later cells read `dataset`
# directly, so it has to exist even when the token cache below is reused.
dataset = load_dataset('alexcadillon/SemEval2016Task5', 'restaurants')

if os.path.exists('preprocessed_texts.pkl'):
    # NOTE: unpickling can execute arbitrary code; only reuse a cache file
    # that this notebook wrote itself.
    with open('preprocessed_texts.pkl', 'rb') as f:
        preprocessed_texts = pickle.load(f)
else:
    import datasets
    # Only the first sentence of each review (item[0]) is tokenised here.
    texts = [item[0]['text'] for item in dataset['train']['sentences']]
    preprocessed_texts = preprocess_texts(texts)
    # Cache the token lists so the next run can skip tokenisation
    with open('preprocessed_texts.pkl', 'wb') as f:
        pickle.dump(preprocessed_texts, f)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SemEval2016Task5.py:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/146k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/58.1k [00:00<?, ?B/s]

Generating trial split:   0%|          | 0/10 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/350 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/90 [00:00<?, ? examples/s]

  0%|          | 0/350 [00:00<?, ?it/s]

# Embedding

In [None]:
# Import necessary libraries
import dask.array as da
import numpy as np
from tqdm import tqdm

# Only the first quarter of the tokenised documents feeds the vocabulary
n_docs = len(preprocessed_texts) // 4

# Tally how often every token occurs in that subset
words_count = {}
for tokens in tqdm(preprocessed_texts[:n_docs], desc="Counting words"):
    for token in tokens:
        if token in words_count:
            words_count[token] += 1
        else:
            words_count[token] = 1

Counting words: 100%|██████████| 87/87 [00:00<00:00, 120709.38it/s]


In [None]:
# Keep only words that appear at least 10 times in the sampled documents
words_count = {word: count for word, count in words_count.items() if count >= 10}
words = list(words_count.keys())
# Map every retained word to its row/column index in the matrix
word_to_index = {word: i for i, word in enumerate(words)}

# Word-word co-occurrence counts. uint32 is used rather than uint8 so that a
# frequent pair cannot silently wrap around after 255 co-occurrences.
word_word_matrix = np.zeros((len(words), len(words)), dtype=np.uint32)
window_size = 1
for doc in tqdm(preprocessed_texts[:n_docs], desc="Creating word-word matrix"):
    # Vocabulary index of each token, or None for words filtered out above
    doc_indices = [word_to_index.get(token) for token in doc]
    for i, word_i_index in enumerate(doc_indices):
        if word_i_index is None:
            continue
        # Neighbours within `window_size` positions on either side of token i
        window_start = max(0, i - window_size)
        window_end = min(len(doc_indices), i + window_size + 1)
        for j in range(window_start, window_end):
            word_j_index = doc_indices[j]
            if i != j and word_j_index is not None:
                word_word_matrix[word_i_index, word_j_index] += 1

# Convert the numpy array to a dask array for distributed computing
word_word_matrix_dask = da.from_array(word_word_matrix)

Creating word-word matrix: 100%|██████████| 87/87 [00:00<00:00, 17310.46it/s]


In [None]:
# Convert the numpy array to a dask array for distributed computing
word_word_matrix_dask = da.from_array(word_word_matrix)

# Rank-50 randomized SVD of the word-word matrix. The seed is fixed so the
# random projection, and therefore the embeddings, are the same on every run.
U, S, Vt = da.linalg.svd_compressed(word_word_matrix_dask, k=50, seed=42)

In [None]:
# Scale the left singular vectors by the singular values and materialise the
# lazy dask result; row i is the embedding of the word with index i
word_embedding_matrix = (U * S).compute()

# Function to get the embedding for a given word
def embed_word(word):
    """Return the embedding row for ``word``.

    Words that are not in ``word_to_index`` get an all-zero vector with the
    same width as ``word_embedding_matrix``.
    """
    row = word_to_index.get(word)
    if row is None:
        return np.zeros(word_embedding_matrix.shape[1])
    return word_embedding_matrix[row]

# Modelling

In [None]:
import pandas as pd

# Convert the train split to a pandas frame; the rendered table shows one row
# per review with its nested `sentences` and `opinions` columns.
df_train = dataset['train'].to_pandas()
df_train

Unnamed: 0,reviewId,sentences,opinions
0,1004293,"[{'sentenceId': '1004293:0', 'text': 'Judging ...","[{'category': {'entity': 'RESTAURANT', 'attrib..."
1,1014458,"[{'sentenceId': '1014458:0', 'text': 'I have e...","[{'category': {'entity': 'FOOD', 'attribute': ..."
2,1016296,"[{'sentenceId': '1016296:0', 'text': 'I was ve...","[{'category': {'entity': 'RESTAURANT', 'attrib..."
3,1028246,"[{'sentenceId': '1028246:0', 'text': 'Went on ...","[{'category': {'entity': 'RESTAURANT', 'attrib..."
4,1032695,"[{'sentenceId': '1032695:0', 'text': 'Every ti...","[{'category': {'entity': 'RESTAURANT', 'attrib..."
...,...,...,...
345,CLF#8,"[{'sentenceId': 'CLF#8:0', 'text': 'Poor servi...","[{'category': {'entity': 'SERVICE', 'attribute..."
346,ADLT#11,"[{'sentenceId': 'ADLT#11:0', 'text': 'What a h...","[{'category': {'entity': 'RESTAURANT', 'attrib..."
347,WE#1,"[{'sentenceId': 'WE#1:0', 'text': 'Expensive',...","[{'category': {'entity': 'RESTAURANT', 'attrib..."
348,FF#6,"[{'sentenceId': 'FF#6:0', 'text': 'So rude!!!'...","[{'category': {'entity': 'SERVICE', 'attribute..."


In [None]:
import pandas as pd

# Column buffers for the flattened training frame (one row per opinion)
texts = []
targets = []
entity = []
attribute = []
polarities = []

# Walk every review, then every sentence, then every opinion attached to it
for sentences in df_train['sentences']:
    for sentence in sentences:
        # Sentences that carry no opinions contribute no rows
        if sentence['opinions'].size <= 0:
            continue
        for opinion in sentence['opinions']:
            category = opinion['category']
            # Opinions without a category are skipped
            if category is None:
                continue
            sentence_text = sentence['text']
            start = int(opinion['from'])
            end = int(opinion['to'])
            texts.append(sentence_text)
            # The target is the character span [from, to) of the sentence
            targets.append(sentence_text[start:end])
            entity.append(category['entity'])
            attribute.append(category['attribute'])
            polarities.append(opinion['polarity'])

# Assemble the flattened training frame
train = pd.DataFrame({'text': texts,'target': targets, 'entity':entity, 'attribute': attribute, 'polarity': polarities})

In [None]:
train

Unnamed: 0,text,target,entity,attribute,polarity
0,Judging from previous posts this used to be a ...,place,RESTAURANT,GENERAL,negative
1,"We, there were four of us, arrived at noon - t...",staff,SERVICE,GENERAL,negative
2,"They never brought us complimentary noodles, i...",,SERVICE,GENERAL,negative
3,The food was lousy - too sweet or too salty an...,food,FOOD,QUALITY,negative
4,The food was lousy - too sweet or too salty an...,portions,FOOD,STYLE_OPTIONS,negative
...,...,...,...,...,...
2502,The waitress came to check in on us every few ...,waitress,SERVICE,GENERAL,negative
2503,I couldn't ignore the fact that she reach over...,,SERVICE,GENERAL,negative
2504,She then put the check down without asking if ...,,SERVICE,GENERAL,negative
2505,"I wish I could like this place more, and I wis...",place,RESTAURANT,GENERAL,negative


In [None]:
# Convert the test split to a pandas frame; same nested layout as the train
# frame (reviewId, sentences, opinions).
df_test = dataset['test'].to_pandas()
df_test

Unnamed: 0,reviewId,sentences,opinions
0,en_BlueRibbonSushi_478218171,[{'sentenceId': 'en_BlueRibbonSushi_478218171:...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
1,en_BlueRibbonSushi_478218345,[{'sentenceId': 'en_BlueRibbonSushi_478218345:...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
2,en_SchoonerOrLater_477965690,[{'sentenceId': 'en_SchoonerOrLater_477965690:...,"[{'category': {'entity': 'SERVICE', 'attribute..."
3,en_SchoonerOrLater_477965849,[{'sentenceId': 'en_SchoonerOrLater_477965849:...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
4,en_PagodaRestaurant_478006817,[{'sentenceId': 'en_PagodaRestaurant_478006817...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
...,...,...,...
85,en_CatalRestaurant__UvaBar_477862052,[{'sentenceId': 'en_CatalRestaurant__UvaBar_47...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
86,en_StackRestaurant__Bar_478538111,[{'sentenceId': 'en_StackRestaurant__Bar_47853...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
87,en_MercedesRestaurant_478010600,[{'sentenceId': 'en_MercedesRestaurant_4780106...,"[{'category': {'entity': 'FOOD', 'attribute': ..."
88,en_SnoozeanAMEatery_480171723,[{'sentenceId': 'en_SnoozeanAMEatery_480171723...,"[{'category': {'entity': 'FOOD', 'attribute': ..."


In [None]:
import pandas as pd

# Column buffers for the flattened test frame (one row per opinion)
texts = []
targets = []
entity = []
attribute = []
polarities = []

# Walk every review, then every sentence, then every opinion attached to it
for sentences in df_test['sentences']:
    for sentence in sentences:
        # Sentences that carry no opinions contribute no rows
        if sentence['opinions'].size <= 0:
            continue
        for opinion in sentence['opinions']:
            category = opinion['category']
            # Opinions without a category are skipped
            if category is None:
                continue
            sentence_text = sentence['text']
            start = int(opinion['from'])
            end = int(opinion['to'])
            texts.append(sentence_text)
            # The target is the character span [from, to) of the sentence
            targets.append(sentence_text[start:end])
            entity.append(category['entity'])
            attribute.append(category['attribute'])
            polarities.append(opinion['polarity'])

# Assemble the flattened test frame
test = pd.DataFrame({'text': texts,'target': targets, 'entity':entity, 'attribute': attribute, 'polarity': polarities})

In [None]:
test

Unnamed: 0,text,target,entity,attribute,polarity
0,Yum!,,FOOD,QUALITY,positive
1,Serves really good sushi.,sushi,FOOD,QUALITY,positive
2,Not the biggest portions but adequate.,portions,FOOD,STYLE_OPTIONS,neutral
3,Green Tea creme brulee is a must!,Green Tea creme brulee,FOOD,QUALITY,positive
4,Don't leave the restaurant without it.,,FOOD,QUALITY,positive
...,...,...,...,...,...
854,"All considered, I have to say that Ray's Boath...",Ray's Boathouse,RESTAURANT,GENERAL,positive
855,While I could have done without the youth who ...,server,SERVICE,GENERAL,positive
856,While I could have done without the youth who ...,food,FOOD,QUALITY,positive
857,While I could have done without the youth who ...,,RESTAURANT,MISCELLANEOUS,negative


In [None]:
# Create a bi-lstm model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Local import: `Input` declares the model's input shape explicitly.
from keras.layers import Input

# Registry of every classifier trained in this notebook, keyed by model name
model_map = {}

# Bi-LSTM binary classifier over variable-length sequences of 50-dimensional
# word embeddings. The shape is declared with an Input layer instead of an
# `input_shape` argument on the LSTM wrapped by Bidirectional.
bilstm_model = Sequential()
bilstm_model.add(Input(shape=(None, 50)))
bilstm_model.add(Bidirectional(LSTM(128, return_sequences=True)))
bilstm_model.add(Bidirectional(LSTM(128, return_sequences=False)))
bilstm_model.add(Dropout(0.5))
bilstm_model.add(Dense(64, activation='relu'))
bilstm_model.add(Dropout(0.5))
# Single sigmoid unit: probability of the positive class
bilstm_model.add(Dense(1, activation='sigmoid'))

# Compile the bi-lstm model
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Add the bi-lstm model to the model_map
model_map['bi-lstm'] = bilstm_model

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# Add the random forest model to the model_map
model_map['random-forest'] = rf_model

# `use_label_encoder` is omitted: xgboost reports it as an unused parameter.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42, objective='binary:logistic')
model_map['xgboost'] = xgb_model

  super().__init__(**kwargs)


# Training Functions

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from sklearn.preprocessing import LabelEncoder
def preprocess_df(df_input):
    """Return a cleaned copy of a flattened opinion frame.

    The input frame is not modified. In the copy:

    * ``text`` is replaced by the token list from ``process_text``;
    * ``polarity`` becomes 1 for ``'positive'`` and 0 for every other value
      (so neutral and negative share label 0);
    * missing ``target`` / ``entity`` / ``attribute`` values become
      ``'unknown'``.
    """
    df = df_input.copy()
    df['text'] = df['text'].apply(process_text)
    # Convert sentiment labels to numerical values
    df['polarity'] = df['polarity'].apply(lambda x: 1 if x == 'positive' else 0)
    # A missing target can be NaN or an empty string (an empty character
    # span); both are mapped to 'unknown'.
    df['target'] = df['target'].fillna('unknown').replace('', 'unknown')
    # Extract entity and attribute if available
    df['entity'] = df['entity'].fillna('unknown')  # handle missing values
    df['attribute'] = df['attribute'].fillna('unknown')  # handle missing values
    return df

def preprocess_bilstm(df_input, embed_word):
    """Build zero-padded embedded sequences and labels for the bi-LSTM.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Flattened opinion frame accepted by ``preprocess_df``.
    embed_word : callable
        Maps a token to its embedding vector.

    Returns
    -------
    X : list
        ``[X_text_embedded, X_target, X_entity, X_attribute]`` where
        ``X_text_embedded`` has shape
        ``(n_rows, max_sequence_length, embedding_dim)``.
    y : numpy.ndarray
        The ``polarity`` column produced by ``preprocess_df``.
    """
    df = preprocess_df(df_input)

    # The longest token sequence decides the padded length (0 for an empty frame)
    max_sequence_length = max((len(seq) for seq in df['text']), default=0)

    # Embed the text data using the precomputed word embedding matrix
    embedding_dim = word_embedding_matrix.shape[1]
    X_text_embedded = np.zeros((len(df), max_sequence_length, embedding_dim))

    # Positions past a sequence's own length stay all-zero, which is the
    # padding. Tokens are embedded directly rather than through a '0' sentinel
    # string, so a genuine "0" token is not mistaken for padding.
    for i, seq in enumerate(df['text']):
        for j, word in enumerate(seq):
            X_text_embedded[i, j] = embed_word(word)

    # Prepare entity and attribute data
    X_entity = df['entity'].values
    X_target = df['target'].values
    X_attribute = df['attribute'].values

    # Combine text, target, entity, and attribute data
    X = [X_text_embedded, X_target, X_entity, X_attribute]
    y = df['polarity'].values

    return X, y

def preprocess_other(df_input, embed_word):
    """Build mean-pooled sentence embeddings and labels for the tabular models.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Flattened opinion frame accepted by ``preprocess_df``.
    embed_word : callable
        Maps a token to its embedding vector.

    Returns
    -------
    X : list
        ``[X_text_embedded, X_entity, X_attribute, X_target]``. The first item
        holds one mean embedding per row; the other three are the raw string
        columns, in the order ``train_model`` and ``evaluate_model`` index
        them (entity, attribute, target).
    y : numpy.ndarray
        The ``polarity`` column produced by ``preprocess_df``.
    """
    df = preprocess_df(df_input)
    embedding_dim = word_embedding_matrix.shape[1]

    X_text_embedded = []
    for text in df['text']:
        embedded_words = [embed_word(word) for word in text]
        if embedded_words:
            X_text_embedded.append(np.mean(embedded_words, axis=0))
        else:
            # A sentence with no tokens gets an all-zero vector of the same width
            X_text_embedded.append(np.zeros(embedding_dim))

    # Prepare entity, attribute and target data
    X_entity = df['entity'].values
    X_target = df['target'].values
    X_attribute = df['attribute'].values

    # Combine text, entity, attribute, and target data
    X = [np.array(X_text_embedded), X_entity, X_attribute, X_target]
    y = df['polarity'].values
    return X, y

# Generalized training function
def train_model(model_name, df, embed_word, epochs=100, batch_size=32):
    """Fit the model registered under ``model_name`` on ``df``.

    Features come from ``preprocess_map[model_name]``. The bi-LSTM is fitted
    on the embedded text alone (``X[0]``) with the given ``epochs`` and
    ``batch_size``. Every other model is fitted on the text features
    concatenated with ``X[1]``, ``X[2]`` and ``X[3]``, each label-encoded
    into a single integer column.
    """
    X, y = preprocess_map[model_name](df, embed_word)
    model = model_map[model_name]

    if model_name == 'bi-lstm':
        model.fit(X[0], y, epochs=epochs, batch_size=batch_size)
        return

    # One fresh LabelEncoder per categorical column, reshaped to (n_rows, 1)
    # so it can be stacked next to the text features
    encoded_columns = [
        LabelEncoder().fit_transform(column).reshape(-1, 1)
        for column in (X[1], X[2], X[3])
    ]
    X_combined = np.concatenate([X[0], *encoded_columns], axis=1)

    model.fit(X_combined, y)

# Training (SVD)


In [None]:
# Route each model name to the feature builder it expects
preprocess_map = {
    'bi-lstm': preprocess_bilstm,
    'random-forest': preprocess_other,
    'xgboost': preprocess_other,
}

# Fit the bi-LSTM on the embedded training sentences
train_model('bi-lstm', train, embed_word)


Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.7639 - loss: 0.4580
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.7736 - loss: 0.4439
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.7502 - loss: 0.4612
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.7750 - loss: 0.4288
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7855 - loss: 0.4403
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.7846 - loss: 0.4148
Epoch 7/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.7999 - loss: 0.3846
Epoch 8/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step - accuracy: 0.7834 - loss: 0.3979
Epoch 9/100
[1m79/79[0m [32m━━━━━━━━━

In [None]:
train_model('random-forest', train, embed_word)

In [None]:
train_model('xgboost', train, embed_word)

Parameters: { "use_label_encoder" } are not used.



# Evaluating (SVD)


In [None]:
from sklearn.metrics import classification_report

def evaluate_model(model_name, df_test, embed_function):
    """Print a classification report for one trained model.

    Parameters
    ----------
    model_name : str
        Key into ``preprocess_map`` and ``model_map``.
    df_test : pandas.DataFrame
        Flattened opinion frame (text, target, entity, attribute, polarity
        columns) to evaluate on. The frame passed in is the one evaluated;
        no module-level frame is read.
    embed_function : callable
        Maps a token to its embedding vector.
    """
    preprocess = preprocess_map[model_name]
    X_test, y_true = preprocess(df_test, embed_function)

    model = model_map[model_name]

    if model_name == 'bi-lstm':
        y_pred = model.predict(X_test[0])  # the bi-LSTM only sees the text embedding
    else:
        # NOTE(review): the encoders are fitted on this frame alone, while
        # train_model fits its own; the integer codes are therefore not
        # guaranteed to match the ones the model was trained on.
        encoded_columns = [
            LabelEncoder().fit_transform(column).reshape(-1, 1)
            for column in (X_test[1], X_test[2], X_test[3])
        ]

        # Concatenate text features with the encoded categorical columns
        X_combined = np.concatenate([X_test[0], *encoded_columns], axis=1)

        y_pred = model.predict(X_combined)

    # Binarize the prediction into 0 and 1 and flatten any (n, 1) output
    y_pred = np.where(y_pred > 0.5, 1, 0).ravel()

    # Label 0 covers every polarity other than 'positive' (see preprocess_df)
    report = classification_report(y_true, y_pred, target_names=['negative', 'positive'])
    print(f'Classification Report for {model_name}:')
    print(report)

# Evaluate on the flattened `test` frame: the raw `df_test` frame has no
# text/polarity columns for the preprocessing functions to read.
evaluate_model('bi-lstm', test, embed_word)
evaluate_model('random-forest', test, embed_word)
evaluate_model('xgboost', test, embed_word)

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Classification Report for bi-lstm:
              precision    recall  f1-score   support

    negative       0.35      0.31      0.33       248
    positive       0.73      0.76      0.75       611

    accuracy                           0.63       859
   macro avg       0.54      0.54      0.54       859
weighted avg       0.62      0.63      0.62       859

Classification Report for random-forest:
              precision    recall  f1-score   support

    negative       0.55      0.33      0.41       248
    positive       0.77      0.89      0.82       611

    accuracy                           0.73       859
   macro avg       0.66      0.61      0.62       859
weighted avg       0.70      0.73      0.70       859

Classification Report for xgboost:
              precision    recall  f1-score   support

    negative       0.49      0.45      0.47       248
    positive       0.78      0.81      0.80       61

# BERT

In [None]:
from sklearn.metrics import classification_report
from transformers import AutoTokenizer

def compute_metrics(pred):
  """Return sklearn's text classification report for a prediction output.

  ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
  (per-class scores); the predicted class is the argmax over the last axis.
  """
  predicted_classes = pred.predictions.argmax(-1)
  return classification_report(pred.label_ids, predicted_classes)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
train.describe()

Unnamed: 0,text,target,entity,attribute,polarity
count,2507,2507.0,2507,2507,2507
unique,1703,722.0,6,5,3
top,We are very particular about sushi and were bo...,,FOOD,GENERAL,positive
freq,8,624.0,1076,1154,1657


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification
import numpy as np
import torch

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

def preprocess_function(examples):
    """Tokenise a batch as ``"<attribute> <entity> [SEP] <text>"`` strings.

    Any of the 'target', 'attribute', 'entity' and 'text' columns missing
    from ``examples`` is first filled with empty strings, one per entry of
    ``examples["label"]``. The target column is iterated but is not part of
    the tokenised string.
    """
    for column in ('target', 'attribute', 'entity', 'text'):
        if column in examples:
            continue
        examples[column] = [""] * len(examples["label"])  # default for an absent column

    prompts = []
    for _target, attr, ent, sentence in zip(
            examples['target'], examples['attribute'], examples['entity'], examples['text']):
        prompts.append(f"{attr} {ent} [SEP] {sentence}")

    return tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")

def modify_labels(example):
    """Convert one example's string polarity to an integer class id.

    ``"positive"`` becomes 2, ``"negative"`` becomes 0 and any other label
    becomes 1. An empty ``target`` is replaced by ``"unknown"``. The example
    dict is updated in place and returned.
    """
    label = example["label"]
    if label == "positive":
        example["label"] = 2
    elif label == "negative":
        example["label"] = 0
    else:
        example["label"] = 1

    if example["target"] == "":
        # Assignment, not comparison: `==` here left the empty target unchanged.
        example["target"] = "unknown"
    return example


from datasets import Dataset
# Wrap each flattened frame as a Dataset, rename `polarity` to `label`,
# and convert the string labels to integer class ids
train_dataset = (
    Dataset.from_pandas(train)
    .rename_column("polarity", "label")
    .map(modify_labels)
)
test_dataset = (
    Dataset.from_pandas(test)
    .rename_column("polarity", "label")
    .map(modify_labels)
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

In [None]:
train_dataset

Dataset({
    features: ['text', 'target', 'entity', 'attribute', 'label'],
    num_rows: 2507
})

In [None]:
# Settings for the first fine-tuning run: 3 epochs, evaluated after each one
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=25,
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
)

# Tokenise both splits first, then hand them to the Trainer.
# Note that the test split doubles as the evaluation set here.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = test_dataset.map(preprocess_function, batched=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

trainer.train()




Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.5781,0.399111
2,0.2886,0.358034
3,0.235,0.365101


TrainOutput(global_step=471, training_loss=0.3556114397231181, metrics={'train_runtime': 802.8803, 'train_samples_per_second': 9.368, 'train_steps_per_second': 0.587, 'total_flos': 1978876014732288.0, 'train_loss': 0.3556114397231181, 'epoch': 3.0})

In [None]:
# Metrics on the evaluation split held by the trainer
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Tokenise the test split again and collect the raw predictions for it
tokenized_test = test_dataset.map(preprocess_function, batched=True)
predictions = trainer.predict(tokenized_test)

# Per-class precision / recall / F1 as text
report = compute_metrics(predictions)
print(report)

Evaluation results: {'eval_loss': 0.36510077118873596, 'eval_runtime': 25.6906, 'eval_samples_per_second': 33.436, 'eval_steps_per_second': 0.545, 'epoch': 3.0}


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.73      0.85      0.79       204
           1       0.70      0.16      0.26        44
           2       0.93      0.93      0.93       611

    accuracy                           0.87       859
   macro avg       0.79      0.65      0.66       859
weighted avg       0.87      0.87      0.86       859



In [None]:
# Settings for the second run: 10 epochs, otherwise the same as the first run
training_args2 = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=25,
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
)

# Tokenise both splits first, then hand them to the Trainer
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = test_dataset.map(preprocess_function, batched=True)

# NOTE: `model` is the same object that `trainer` received, so this run
# continues from whatever weights that model currently has.
trainer2 = Trainer(
    model=model,
    args=training_args2,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

trainer2.train()




Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

Map:   0%|          | 0/859 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,0.3527,0.370463
2,0.1739,0.418094
3,0.2275,0.365878
4,0.1248,0.471632
5,0.0694,0.554828
6,0.0838,0.567557
7,0.0301,0.635706
8,0.0054,0.602424
9,0.0208,0.663039
10,0.0049,0.667561


TrainOutput(global_step=1570, training_loss=0.0939517431684835, metrics={'train_runtime': 2504.8053, 'train_samples_per_second': 10.009, 'train_steps_per_second': 0.627, 'total_flos': 6596253382440960.0, 'train_loss': 0.0939517431684835, 'epoch': 10.0})

In [None]:
# Tokenise the test split and collect the second trainer's raw predictions
tokenized_test = test_dataset.map(preprocess_function, batched=True)
predictions = trainer2.predict(tokenized_test)

# Per-class precision / recall / F1 as text
report = compute_metrics(predictions)
print(report)

Evaluation results: {'eval_loss': 0.6675610542297363, 'eval_runtime': 25.2157, 'eval_samples_per_second': 34.066, 'eval_steps_per_second': 0.555, 'epoch': 10.0}


Map:   0%|          | 0/859 [00:00<?, ? examples/s]

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       204
           1       0.77      0.45      0.57        44
           2       0.93      0.96      0.94       611

    accuracy                           0.90       859
   macro avg       0.84      0.75      0.78       859
weighted avg       0.90      0.90      0.90       859



In [None]:
# Save the model and tokenizer to local folders.
# `trainer` and `trainer2` were both constructed with the same `model`
# object, so this writes that model's current weights.
trainer.save_model("./model")
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
import os
import shutil
from datetime import datetime
# Define source and destination paths (the destination is a fixed folder name)
source_path = './model'
destination_path = '/content/drive/MyDrive/model_bert_task3'

# Ensure the source exists
if os.path.exists(source_path):
    # dirs_exist_ok=True lets this cell be re-run: without it copytree raises
    # FileExistsError once the destination folder already exists.
    shutil.copytree(source_path, destination_path, dirs_exist_ok=True)
    print(f"Copied '{source_path}' to '{destination_path}'")
else:
    print(f"Source directory '{source_path}' does not exist!")

Copied './model' to '/content/drive/MyDrive/model_bert_task3'


In [None]:
import os
import shutil
from datetime import datetime
# Timestamped destination, so each export goes to its own Drive folder
source_path = './tokenizer'
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
destination_path = f'/content/drive/MyDrive/tokenizer_task3_bert_{current_time}'

# Bail out with a message when there is nothing to copy
if not os.path.exists(source_path):
    print(f"Source directory '{source_path}' does not exist!")
else:
    # Copy the whole tokenizer folder to Google Drive
    shutil.copytree(source_path, destination_path)
    print(f"Copied '{source_path}' to '{destination_path}'")

Copied './tokenizer' to '/content/drive/MyDrive/tokenizer_task3_bert_20241211_144500'


In [None]:
test_dataset

Dataset({
    features: ['text', 'target', 'entity', 'attribute', 'label'],
    num_rows: 859
})

In [None]:
prediction_data = Dataset.from_dict({"text": ["Yum!"],
                                      "entity": ["FOOD"],
                                      "attribute": ["QUALITY"],
                                      "target": [""]})

prediction_data

Dataset({
    features: ['text', 'entity', 'attribute', 'target'],
    num_rows: 1
})

In [None]:
# One hand-written example with the same columns as the test dataset
single_example = {
    "text": ["Yum!"],
    "entity": ["FOOD"],
    "attribute": ["QUALITY"],
    "target": [""],
    "label": [2],
}
prediction_data = Dataset.from_dict(single_example)

# Tokenise it the same way as the training data
prediction_data = prediction_data.map(preprocess_function, batched=True)

# Predicted class id = argmax over the class scores of the only row
prediction_output = trainer2.predict(prediction_data)
pr = prediction_output.predictions.argmax(axis=1)[0]

print(pr)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

2


# LLM

In [None]:
!pip install transformers datasets evaluate accelerate bitsandbytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from evaluate import load
import json



In [None]:
# bitsandbytes quantization settings, handed to from_pretrained when the model is loaded
quant_config = BitsAndBytesConfig(
    # Turn on 4-bit loading
    load_in_4bit=True,
    # 4-bit data type: NF4 (normalized float)
    bnb_4bit_quant_type="nf4",
    # Enable double quantization
    bnb_4bit_use_double_quant=True,
    # Run computation in FP16
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Hub identifier of the checkpoint to evaluate
model_name = "GoToCompany/gemma2-9b-cpt-sahabatai-v1-base"

# Tokenizer that ships with the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Causal LM loaded with the quantization settings above;
# device_map="auto" leaves device placement to the loader
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/usr/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    a

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
import re

def format_prompt(sentence, entity, attribute):
    """Build the aspect-based sentiment prompt for one example.

    The prompt starts with a newline, every line is indented by four spaces,
    and it ends with ``"Answer: "`` (trailing space, no newline) so the model's
    continuation is the answer.

    Args:
        sentence: Review text to classify.
        entity: Entity the sentiment refers to (e.g. ``"FOOD"``).
        attribute: Attribute of the entity (e.g. ``"QUALITY"``).

    Returns:
        The prompt as a single string.
    """
    prompt_lines = [
        "",  # the prompt opens with a line break
        "Task: Aspect Based Sentiment Analysis",
        f"Sentence: {sentence}",
        f"Entity: {entity}",
        f"Attribute: {attribute}",
        "Question: What is the sentiment of the sentence based on the entity and attribute?",
        "Available answers: positive, negative, or neutral",
        "Answer: ",
    ]
    # Each line break is followed by the four-space indent
    return "\n    ".join(prompt_lines)


def _extract_sentiment(completion):
    """Return the sentiment label named first on the first non-blank line, or None.

    Only the first non-blank line is inspected: anything after it is treated as
    runaway continuation (e.g. the model inventing a further "Task: ..." block)
    and must not influence the prediction. Matching is case-insensitive.
    """
    for raw_line in completion.splitlines():
        line = raw_line.strip().lower()
        if not line:
            continue
        hits = [
            (line.find(label), label)
            for label in ("positive", "negative", "neutral")
            if label in line
        ]
        # Earliest mention wins; a first line with no label means "no answer"
        return min(hits)[1] if hits else None
    return None


def evaluate_model(task_name, dataset):
    """Measure prompt-based sentiment accuracy of the loaded LLM on ``dataset``.

    Relies on the module-level ``tokenizer``, ``model`` and ``format_prompt``.

    Args:
        task_name: Name of the task being evaluated. Not used by the
            computation; kept so existing calls keep working.
        dataset: Object exposing ``iterrows()`` (e.g. a pandas DataFrame) whose
            rows provide ``'text'``, ``'entity'``, ``'attribute'`` and
            ``'polarity'``.

    Returns:
        Tuple ``(accuracy, correct, total)``. ``accuracy`` is ``0.0`` when the
        dataset has no rows.
    """
    total = 0
    correct = 0

    for _, row in dataset.iterrows():
        prompt = format_prompt(row['text'], row['entity'], row['attribute'])

        # Put the inputs on the model's device instead of assuming "cuda"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=20)

        # The generated sequence starts with the prompt tokens; decode only the
        # continuation so text inside the prompt can never be mistaken for the answer.
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Score one predicted label per example rather than a substring test
        # over the whole continuation.
        if _extract_sentiment(completion) == row['polarity']:
            correct += 1
        total += 1

    accuracy = correct / total if total else 0.0
    return accuracy, correct, total

In [None]:
# Evaluate on the full train split.
# evaluate_model returns a 3-tuple (accuracy, correct, total), so exactly three names are unpacked.
# No accumulators are set up here: this cell reports a single split, and a variable
# named `sum` would shadow the built-in for the rest of the session.
print("Evaluating Model...")
accuracy, correct, total = evaluate_model("aspect-based-sentiment-analysis", train)
print(f"Accuracy for train dataset: {accuracy}")

Evaluating Model...

    Task: Aspect Based Sentiment Analysis
    Sentence: Judging from previous posts this used to be a good place, but not any longer.
    Entity: RESTAURANT
    Attribute: GENERAL
    Question: What is the sentiment of the sentence based on the entity and attribute?
    Available answers: positive, negative, or neutral
    Answer: neutral
    
    Task: Aspect Based Sentiment Analysis
    Sentence: I'm

    Task: Aspect Based Sentiment Analysis
    Sentence: We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.
    Entity: SERVICE
    Attribute: GENERAL
    Question: What is the sentiment of the sentence based on the entity and attribute?
    Available answers: positive, negative, or neutral
    Answer: neutral
    
    Task: Aspect Based Sentiment Analysis
    Sentence: The food was

    Task: Aspect Based Sentiment Analysis
    Sentence: They never brought us complimentary noo

In [None]:
# Evaluate on the first 100 rows of each split, then pool the counts for an overall accuracy.
# The row counter is not called `sum`, which would shadow the built-in function.
n_evaluated = 0
sum_correct = 0
print("Evaluating Model...")
for split_name, split_rows in (("train", train[:100]), ("test", test[:100])):
    accuracy, correct, total = evaluate_model("aspect-based-sentiment-analysis", split_rows)
    print(f"Accuracy for {split_name} dataset: {accuracy}")
    n_evaluated += total
    sum_correct += correct
print(f"Accuracy Total: {sum_correct/n_evaluated}")

Evaluating Model...
Accuracy for train dataset: 0.18
Accuracy for test dataset: 0.18
Accuracy Total: 0.18
