<a href="https://colab.research.google.com/github/lmassaron/ml4dummies_3ed/blob/main/ML4D3E_17_scoring_opinions_and_sentiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
# Install a global "ignore" filter so that no warnings are shown from here on.
warnings.filterwarnings("ignore")

In [None]:
# Three toy sentences that make up the miniature corpus for the examples.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "My dog is quick and can jump over fences.",
    "Your dog is so lazy that it sleeps all the day.",
]
# Keep the individual sentences reachable under their own names as well.
text_1, text_2, text_3 = corpus

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary, case-preserving bag of words: learn the vocabulary from the corpus,
# then encode the same sentences against it.
vectorizer = CountVectorizer(binary=True, lowercase=False).fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# The result is sparse; densify it only for display.
print(vectorized_text.todense())

In [None]:
# Show the vocabulary learned by the vectorizer (token -> column index).
print(vectorizer.vocabulary_)

In [None]:
# Grow the corpus by one sentence and refit a vectorizer with default
# settings on all four sentences.
text_4 = "A black dog just passed by but my dog is brown."
corpus.append(text_4)
vectorizer = CountVectorizer().fit(corpus)
vectorized_text = vectorizer.transform(corpus)
# The last row is the encoding of the sentence that was just added.
print(vectorized_text.todense()[-1])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF weights computed from the count matrix, L1-normalised per sentence.
tfidf = TfidfTransformer(norm="l1")
tfidf_mtx = tfidf.fit_transform(vectorized_text)

phrase = 3 # choose a number from 0 to 3

# Densify the matrix once and keep only the selected row, instead of
# rebuilding the whole dense array for every word in the vocabulary.
phrase_weights = tfidf_mtx.toarray()[phrase]

total = 0
for word, pos in vectorizer.vocabulary_.items():
    value = phrase_weights[pos]
    # Only report the words that actually occur in the selected sentence.
    if value != 0.0:
        print(f"{word:7s}: {value:0.3f}")
        total += value
print(f"\nSummed values of a phrase: {total:0.1f}")

In [None]:
# Vocabulary made exclusively of two-word sequences (ngram_range=(2, 2)).
bigrams = CountVectorizer(ngram_range=(2, 2))
bigrams.fit(corpus)
print(bigrams.vocabulary_)

In [None]:
import nltk
# Download the NLTK resources needed by the following cell: "punkt_tab"
# (presumably the data behind word_tokenize) and the stop-word lists.
nltk.download("punkt_tab")
nltk.download("stopwords")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Module-level helpers read by tokenize(): a single Porter stemmer instance
# and NLTK's list of English stop words.
stemmer = PorterStemmer()
stop_words = stopwords.words("english")

def stem_tokens(tokens, stemmer):
    """Return a new list holding the stem of every token, in order.

    Args:
        tokens: Iterable of word tokens.
        stemmer: Any object exposing a ``stem(token)`` method.

    Returns:
        A list with ``stemmer.stem(token)`` for each input token.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split *text* into word tokens, drop English stop words, stem the rest.

    Stop words are matched case-insensitively: the NLTK stop-word list is
    lowercase, so a capitalised token such as "The" would otherwise slip
    through whenever this function receives text that has not been
    lowercased beforehand.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with the stems of the remaining tokens, in their original
        order.
    """
    tokens = word_tokenize(text)
    kept = [token for token in tokens if token.lower() not in stop_words]
    return stem_tokens(kept, stemmer)

# Learn the vocabulary from a single sentence, using tokenize() (stop-word
# removal plus stemming) instead of the vectorizer's default tokenization.
docs = ["Sam loves swimming so he swims all the time"]
vect = CountVectorizer(tokenizer=tokenize)
vec = vect.fit(docs)

# Encode a new sentence against the vocabulary learned above.
sentence1 = vec.transform(["George loves swimming too! "])

# Print the learned feature names, then the count vector of the new sentence.
print(vec.get_feature_names_out())
print(sentence1.toarray())

In [None]:
import pandas as pd

# Assemble the URL of the reviews CSV published among the repository's
# release assets, then load it straight into a DataFrame.
repository = "https://github.com/lmassaron/ml4dummies_3ed/"
release = "releases/download/v1.0/"
filename = f"{repository}{release}imdb_50k.csv"
reviews = pd.read_csv(filename)

In [None]:
# Count how many reviews carry each sentiment label.
reviews.sentiment.value_counts()

In [None]:
# Print the text of one randomly sampled review.
print(reviews.review.sample(1).values[0])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the reviews, then cut that hold-out in half to obtain the
# validation and test sets (60/20/20 overall). Both splits use a fixed seed.
train, temp = train_test_split(reviews, test_size=0.4, random_state=0)
valid, test = train_test_split(temp, test_size=0.5, random_state=0)

# Report the number of rows in each split.
print(f"Train size: {len(train)}")
print(f"Validation size: {len(valid)}")
print(f"Test size: {len(test)}")

In [None]:
import os
# Select JAX as the Keras backend; this is set before `keras` is imported.
os.environ["KERAS_BACKEND"] = "jax"

In [None]:
import keras

# Length of every encoded review and upper bound on the vocabulary size.
maxlen = 256
vocab_size_limit = 10000

# Layer that turns raw review strings into integer token-id sequences of
# length `maxlen`.
# NOTE(review): pad_to_max_tokens appears to matter only for the non-"int"
# output modes -- confirm against the Keras documentation.
text_vectorization = keras.layers.TextVectorization(
    max_tokens=vocab_size_limit,
    output_mode='int',
    output_sequence_length=maxlen,
    pad_to_max_tokens=True)

# Build the vocabulary from the training reviews only.
text_vectorization.adapt(train.review.values)

def vectorize_text_data(df, vectorizer):
    """Encode the reviews of *df* and pair them with their labels.

    Args:
        df: Table exposing ``review`` and ``sentiment`` columns.
        vectorizer: Callable applied to the array of review texts.

    Returns:
        A ``(sequences, labels)`` tuple: the vectorizer output for the
        reviews and the raw values of the ``sentiment`` column.
    """
    return vectorizer(df.review.values), df.sentiment.values

# Encode the train, validation and test splits with the same vectorizer.
(X, y), (Xv, yv), (Xt, yt) = (
    vectorize_text_data(split, text_vectorization)
    for split in (train, valid, test)
)

In [None]:
# Fix the random seed before any layer is created.
keras.utils.set_random_seed(0)

vocab_size = text_vectorization.vocabulary_size()
embedding_dim = 64

# Embedding -> two stacked bidirectional LSTMs -> dropout -> sigmoid unit.
# The first LSTM returns the full sequence so the second one can consume it.
model = keras.models.Sequential([
    keras.layers.Input(shape=(maxlen,)),
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(32, return_sequences=False)),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train for two epochs in mini-batches of 8, evaluating on the validation
# split; the object returned by fit() is kept in `history`.
history = model.fit(X, y, epochs=2, batch_size=8,
                    validation_data=(Xv, yv))

In [None]:
from sklearn.metrics import accuracy_score

# Threshold the model outputs at 0.5 to obtain hard 0/1 predictions, then
# compare them with the test labels.
probabilities = model.predict(Xt)
predictions = (probabilities >= 0.5).astype(int)
test_accuracy = accuracy_score(yt, predictions)
print(f"Accuracy on test set: {test_accuracy}")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Hugging Face checkpoint used for both the tokenizer and the classifier.
model_name = "answerdotai/ModernBERT-base"

# Tokenizer matching the checkpoint; read as a global by tokenize_function().
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of *examples* with the global tokenizer.

    Args:
        examples: Mapping whose ``"text"`` key holds the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts, padded to
        ``max_length`` and truncated at 256 tokens.
    """
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["text"], **options)

def tokenize_dataset(data):
    """Turn a reviews table into a tokenized ``Dataset``.

    Args:
        data: Table with ``review`` and ``sentiment`` columns.

    Returns:
        A ``Dataset`` with ``text`` and ``labels`` columns, mapped in batches
        through ``tokenize_function``.
    """
    columns = {
        'text': data['review'].values,
        'labels': data['sentiment'].values,
    }
    return Dataset.from_dict(columns).map(tokenize_function, batched=True)

# Tokenize the train, validation and test splits in that order.
tokenized_train_dataset, tokenized_valid_dataset, tokenized_test_dataset = map(
    tokenize_dataset, (train, valid, test)
)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint as a two-label sequence classifier.
# Note: this rebinds `model`, replacing the Keras network built earlier.
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=2)

In [None]:
# Add up the element count of every parameter tensor of the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters in {model_name}: {total_params:,}")

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: one epoch, batches of 8 per device, no external
# experiment reporting, and evaluation triggered by step count.
# NOTE(review): no eval_steps is given, so the evaluation interval is
# whatever the library defaults to -- confirm it suits the dataset size.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none",
    eval_strategy="steps"
)

# Train on the tokenized training split, evaluating on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset
)

# Run the fine-tuning; the returned summary object is kept in `train_result`.
train_result = trainer.train()

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Score the test split; `predictions.predictions` presumably holds one score
# per class for each review, so argmax along axis 1 picks the predicted label.
predictions = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
# Compare with the true sentiment labels of the test split.
test_accuracy = accuracy_score(test['sentiment'].values, predicted_labels)
print(f"Accuracy on test set: {test_accuracy}")