<a href="https://colab.research.google.com/github/ncodexz/Amazon-review-insights/blob/main/Block1_Sentiment_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SET UP & DRIVE
# Mount Google Drive so the dataset and model paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

import json
import random
import pandas as pd
import numpy as np
import torch

# Reproducibility: seed Python's, NumPy's and PyTorch's random number generators
# with the same value. SEED is also reused for the train/test split.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dataset path: a JSON Lines file (one JSON review object per line).
DATASET_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV.jsonl"

print("Setup OK")

Mounted at /content/drive
Setup OK


In [None]:
# FILTER OUT REVIEWS THAT DO NOT DESCRIBE A VIEWING EXPERIENCE

# Lower-case substrings (written with ASCII apostrophes) that mark a review as
# not describing an actual viewing experience. Built once instead of on every call.
# "this was a gift" is already covered by "was a gift"; it is kept for readability.
_NO_EXPERIENCE_PATTERNS = (
    "did not watch",
    "didn't watch",
    "never watched",
    "haven't watched",
    "nothing to say",
    "no comment",
    "no comments",
    "this was a gift",
    "was a gift",
    "item was canceled",
    "order was canceled",
    "arrived on time",
    "just as described",
)

# Reviews often contain typographic apostrophes (e.g. "didn\u2019t"); map them to
# the ASCII apostrophe so the contraction patterns above still match.
_APOSTROPHE_TABLE = str.maketrans({
    "\u2019": "'",  # right single quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u02bc": "'",  # modifier letter apostrophe
    "\u00b4": "'",  # acute accent used as an apostrophe
    "\u0060": "'",  # backtick used as an apostrophe
})


def is_no_experience(text):
    """Return True when `text` contains a phrase signalling no viewing experience.

    Matching is a case-insensitive substring test against
    `_NO_EXPERIENCE_PATTERNS`, performed after normalising typographic
    apostrophes to the ASCII one.

    Args:
        text: Review text (str).

    Returns:
        bool: True if any pattern occurs in the normalised text.
    """
    normalized = text.lower().translate(_APOSTROPHE_TABLE)
    return any(pattern in normalized for pattern in _NO_EXPERIENCE_PATTERNS)


In [None]:
# MAP STAR RATING TO SENTIMENT LABEL

def rating_to_label(rating):
    """Translate a star rating into a sentiment label.

    A rating equal to 3 is "neutral", a rating of 2 or lower is "negative",
    and every other rating is "positive".
    """
    if rating == 3:
        return "neutral"
    return "negative" if rating <= 2 else "positive"



In [None]:
#DATASET LOAD

# Number of usable reviews to collect (taken in file order, not randomly sampled).
SAMPLE_SIZE = 20_000
data = []
# Raw lines consumed from the file, including skipped ones. Any held-out
# analysis that reads the same file must skip at least this many lines to avoid
# overlapping with the reviews collected here.
lines_read = 0

# Explicit UTF-8 so decoding does not depend on the runtime's default locale.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    for lines_read, line in enumerate(f, start=1):
        review = json.loads(line)

        raw_text = review.get("text")
        rating = review.get("rating")

        # Skip records whose text or rating is missing, null, or of the wrong
        # type; they would otherwise crash .strip() or the rating comparison.
        if not isinstance(raw_text, str) or not isinstance(rating, (int, float)):
            continue

        text = raw_text.strip()
        if not text:
            continue

        # Drop reviews that do not describe a viewing experience.
        if is_no_experience(text):
            continue

        data.append({
            "text": text,
            "label": rating_to_label(rating)
        })

        if len(data) >= SAMPLE_SIZE:
            break

df = pd.DataFrame(data)

print(df["label"].value_counts())
print("Total:", len(df))
print("Raw lines read from file:", lines_read)


label
positive    16149
negative     2016
neutral      1835
Name: count, dtype: int64
Total: 20000


In [None]:
#LABEL ENCODING + SPLIT
from sklearn.model_selection import train_test_split

# Integer ids are assigned in the order negative (0), neutral (1), positive (2).
label2id = {name: idx for idx, name in enumerate(("negative", "neutral", "positive"))}
# Inverse mapping: id -> label name.
id2label = dict(zip(label2id.values(), label2id.keys()))

df["label_id"] = df["label"].map(label2id)

# 80/20 split, stratified on the label id so both splits keep the class ratio.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["label_id"], random_state=SEED
)

print("Train distribution:")
print(train_df["label"].value_counts(), "\n")

print("Test distribution:")
print(test_df["label"].value_counts(), "\n")

print("Train size:", len(train_df))
print("Test size:", len(test_df))


Train distribution:
label
positive    12919
negative     1613
neutral      1468
Name: count, dtype: int64 

Test distribution:
label
positive    3230
negative     403
neutral      367
Name: count, dtype: int64 

Train size: 16000
Test size: 4000


In [None]:
#TOKENIZER +DATASET
from transformers import AutoTokenizer
from datasets import Dataset

# Pretrained checkpoint name; used here for the tokenizer and later for the model.
MODEL_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Wrap the pandas splits as `datasets.Dataset` objects, keeping only the text
# and integer label columns; the pandas index is reset and dropped first.
train_ds = Dataset.from_pandas(
    train_df[["text", "label_id"]].reset_index(drop=True)
)
test_ds = Dataset.from_pandas(
    test_df[["text", "label_id"]].reset_index(drop=True)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
#TOKENIZATION

def tokenize(batch):
    """Tokenize the "text" field of a batch, truncating and padding to 256 tokens."""
    encode_kwargs = {"truncation": True, "padding": "max_length", "max_length": 256}
    return tokenizer(batch["text"], **encode_kwargs)


# Apply the tokenizer to both splits in batched mode (train first, then test).
train_ds, test_ds = [split.map(tokenize, batched=True) for split in (train_ds, test_ds)]


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
#RENAME & TORCH

def _as_torch_dataset(ds):
    """Rename `label_id` to `labels` (if not done yet) and expose torch tensors.

    The rename is guarded so that re-running this cell does not fail once the
    column has already been renamed.
    """
    if "label_id" in ds.column_names:
        ds = ds.rename_column("label_id", "labels")
    # Only these columns are returned as torch tensors when indexing the dataset.
    ds.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )
    return ds


train_ds = _as_torch_dataset(train_ds)
test_ds = _as_torch_dataset(test_ds)

print(train_ds)
print(test_ds)


Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 16000
})
Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 4000
})


In [None]:
#MODEL

from transformers import AutoModelForSequenceClassification

# Load the pretrained encoder with a new classification head. The number of
# output classes is derived from the label mapping rather than hard-coded.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Use the GPU when one is available; fall back to CPU instead of crashing.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#CLASS WEIGHTS

# Per-class loss weights in label-id order [negative, neutral, positive],
# created directly on the device the model lives on.
class_weights = torch.tensor([1.5, 1.0, 0.8], dtype=torch.float, device=model.device)


In [None]:
# TRAINER
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy.

    The weights are read from the notebook-level `class_weights` tensor.
    """

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None  # <-- key: kept in the signature, not used in the loss below
    ):
        """Compute the weighted cross-entropy loss for one batch.

        Removes "labels" from `inputs`, runs the model on the remaining
        entries and returns the loss, or `(loss, outputs)` when
        `return_outputs` is true.
        """
        targets = inputs.pop("labels")
        model_outputs = model(**inputs)

        weighted_loss = torch.nn.functional.cross_entropy(
            model_outputs.logits, targets, weight=class_weights
        )

        if return_outputs:
            return weighted_loss, model_outputs
        return weighted_loss


In [None]:
#METRICS
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        dict with keys "accuracy", "macro_f1", "macro_precision", "macro_recall".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    # zero_division=0 scores a class with no predicted samples as 0.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )

    metrics = {"accuracy": accuracy_score(labels, predicted_ids)}
    metrics["macro_f1"] = macro_f1
    metrics["macro_precision"] = macro_precision
    metrics["macro_recall"] = macro_recall
    return metrics


In [None]:
#TRAINING ARGUMENTS
# TrainingArguments is imported here because no earlier cell imports it;
# without this line the cell raises NameError on a fresh kernel.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # checkpoints are written here
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,                # log the training loss every 100 steps
    save_strategy="epoch",            # save a checkpoint at the end of each epoch
    report_to="none"                  # do not report to external experiment trackers
)


In [None]:
#TRAINING
# Build the trainer with the class-weighted loss. Metrics are computed on
# `test_ds`, which is passed as the evaluation dataset.
# The tokenizer is passed as `processing_class`; the `tokenizer=` keyword is
# deprecated in the installed transformers version and emits a warning.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = WeightedTrainer(


Step,Training Loss
100,0.667
200,0.4922
300,0.4726
400,0.4263
500,0.4766
600,0.3985
700,0.4075
800,0.4381
900,0.3902
1000,0.4026


TrainOutput(global_step=3000, training_loss=0.32965040651957195, metrics={'train_runtime': 1004.2188, 'train_samples_per_second': 47.798, 'train_steps_per_second': 2.987, 'total_flos': 3179274264576000.0, 'train_loss': 0.32965040651957195, 'epoch': 3.0})

In [None]:
# Evaluate on the trainer's evaluation dataset and print the returned metrics dict.
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.4644463062286377, 'eval_accuracy': 0.8725, 'eval_macro_f1': 0.6926489080016506, 'eval_macro_precision': 0.6984129565084869, 'eval_macro_recall': 0.6871641344000302, 'eval_runtime': 26.957, 'eval_samples_per_second': 148.384, 'eval_steps_per_second': 9.274, 'epoch': 3.0}


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the test split; the predicted class is the argmax of the logits.
preds_output = trainer.predict(test_ds)
y_true = preds_output.label_ids
y_pred = preds_output.predictions.argmax(axis=1)

# Take the label order from the id2label mapping instead of a hard-coded list,
# and pass it explicitly so rows/columns stay aligned with the encoding even if
# a class happens to be absent from the predictions or the test split.
label_ids = sorted(id2label)
label_names = [id2label[i] for i in label_ids]

print(classification_report(y_true, y_pred, labels=label_ids, target_names=label_names))
print(confusion_matrix(y_true, y_pred, labels=label_ids))


              precision    recall  f1-score   support

    negative       0.73      0.71      0.72       403
     neutral       0.42      0.41      0.42       367
    positive       0.94      0.95      0.94      3230

    accuracy                           0.87      4000
   macro avg       0.70      0.69      0.69      4000
weighted avg       0.87      0.87      0.87      4000

[[ 286   64   53]
 [  67  149  151]
 [  37  138 3055]]


In [None]:
#“Negative Sentiment Probability Analysis”
import numpy as np
import torch
from scipy.special import softmax

# Run the model over the test split and keep the raw logits and true label ids.
# NOTE(review): the classification-report cell already calls
# trainer.predict(test_ds); this call repeats the same inference.
preds_output = trainer.predict(test_ds)

logits = preds_output.predictions
y_true = preds_output.label_ids

# Convert logits to per-class probabilities (softmax along the class axis).
probs = softmax(logits, axis=1)

# Probability of the NEGATIVE class (column 0, since label2id["negative"] == 0)
p_neg = probs[:, 0]


In [None]:
import pandas as pd

# One row per test example: predicted negative probability and true label id.
df_probs = pd.DataFrame({"p_negative": p_neg, "true_label": y_true})

# Only the truly negative examples (true label id 0)
df_probs.loc[df_probs["true_label"] == 0, "p_negative"].describe()


Unnamed: 0,p_negative
count,403.0
mean,0.680068
std,0.389651
min,0.001067
25%,0.259836
50%,0.921622
75%,0.984149
max,0.992183


In [None]:
# Negative-class probability for examples whose true label is NOT negative
df_probs.loc[df_probs["true_label"] != 0, "p_negative"].describe()


Unnamed: 0,p_negative
count,3597.0
mean,0.032781
std,0.141831
min,0.000735
25%,0.00096
50%,0.00126
75%,0.002999
max,0.991131


In [None]:
# Directory on Drive where the fine-tuned model and its tokenizer are stored.
SAVE_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1"

# Save the model through the trainer and the tokenizer files into the same directory.
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)


('/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1/tokenizer_config.json',
 '/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1/special_tokens_map.json',
 '/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1/vocab.txt',
 '/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1/added_tokens.json',
 '/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Directory the fine-tuned model and tokenizer were saved to.
SAVE_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Models/sentiment_v1"

# Reload both from disk so the inference cells below use the saved artefacts.
tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
model = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH)
# Use the GPU when one is available; fall back to CPU instead of crashing.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
# Switch to evaluation mode for inference.
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Minimal prediction function
def predict_sentiment(text, max_length=256):
    """Classify a single review text with the notebook-level model.

    Args:
        text: Review text to classify (a single string).
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        dict with:
            "label": predicted label name taken from `model.config.id2label`.
            "probabilities": list of class probabilities in label-id order.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    )

    # Send the inputs to whatever device the model is on rather than assuming
    # "cuda", so the function also works on a CPU-only runtime.
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    probs = torch.softmax(logits, dim=1)
    pred_id = torch.argmax(probs, dim=1).item()

    return {
        "label": model.config.id2label[pred_id],
        "probabilities": probs.squeeze().cpu().tolist()
    }


In [None]:

# EXPLORATORY BLOCK — NEUTRAL REVIEWS BEHAVIOR ANALYSIS
# Inspect how the trained sentiment model behaves on reviews whose star rating
# is exactly 3 (the "neutral" class under the rating-to-label mapping).

import json

# --- Parameters ---
DATASET_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV.jsonl"
# NOTE(review): the dataset-load cell reads lines until 20,000 reviews pass its
# filters, so it may consume more than 20,000 raw lines. Skipping exactly 20000
# lines here may therefore overlap with the train/test data — confirm.
SKIP_LINES = 20000
N_NEUTRALS = 10

# --- Collect neutral examples based on star ratings ---
neutral_texts = []

with open(DATASET_PATH, "r") as f:
    for i, line in enumerate(f):
        # Skip the leading part of the file.
        if i < SKIP_LINES:
            continue

        review = json.loads(line)

        if "text" not in review or "rating" not in review:
            continue

        # Keep only 3-star reviews.
        if review["rating"] != 3:
            continue

        text = review["text"].strip()
        if not text:
            continue

        neutral_texts.append(text)

        if len(neutral_texts) >= N_NEUTRALS:
            break

print(f"Collected {len(neutral_texts)} neutral reviews.")

# --- Run model predictions on neutral reviews ---
neutral_results = []

for idx, text in enumerate(neutral_texts, 1):
    pred = predict_sentiment(text)

    # Probabilities are indexed in label-id order: 0 negative, 1 neutral, 2 positive.
    result = {
        "example_id": idx,
        "text": text,
        "predicted_label": pred["label"],
        "p_negative": pred["probabilities"][0],
        "p_neutral": pred["probabilities"][1],
        "p_positive": pred["probabilities"][2],
    }

    neutral_results.append(result)

    print(f"\n--- Neutral Example {idx} ---")
    print("Text:", text)
    print("Prediction:", pred)

# --- Summary table for later inspection ---
import pandas as pd

neutral_df = pd.DataFrame(neutral_results)

print("\nSummary of neutral predictions:")
display(neutral_df)




Collected 10 neutral reviews.

--- Neutral Example 1 ---
Text: Some of the video was very telling. If shorter it would be better
Prediction: {'label': 'neutral', 'probabilities': [0.021752430126070976, 0.7807475924491882, 0.19749994575977325]}

--- Neutral Example 2 ---
Text: Not a great movie but there are a few good laughs that make the rental worthwhile.
Prediction: {'label': 'neutral', 'probabilities': [0.0326407328248024, 0.9304335117340088, 0.036925770342350006]}

--- Neutral Example 3 ---
Text: It's OK.
Prediction: {'label': 'neutral', 'probabilities': [0.04760714992880821, 0.8800344467163086, 0.07235845923423767]}

--- Neutral Example 4 ---
Text: Saw this when it first came out, yes that long ago. Great at the time but clearly dated
Prediction: {'label': 'neutral', 'probabilities': [0.1667356789112091, 0.7918797731399536, 0.041384533047676086]}

--- Neutral Example 5 ---
Text: It's an ok movie.
Prediction: {'label': 'neutral', 'probabilities': [0.03179808706045151, 0.8649008870

Unnamed: 0,example_id,text,predicted_label,p_negative,p_neutral,p_positive
0,1,Some of the video was very telling. If shorter...,neutral,0.021752,0.780748,0.1975
1,2,Not a great movie but there are a few good lau...,neutral,0.032641,0.930434,0.036926
2,3,It's OK.,neutral,0.047607,0.880034,0.072358
3,4,"Saw this when it first came out, yes that long...",neutral,0.166736,0.79188,0.041385
4,5,It's an ok movie.,neutral,0.031798,0.864901,0.103301
5,6,As expected expect,neutral,0.089926,0.817298,0.092776
6,7,As expected expect,neutral,0.089926,0.817298,0.092776
7,8,Very slow! Good story.,neutral,0.15154,0.815706,0.032753
8,9,Boring for older kids. Would be good for young...,neutral,0.043598,0.905827,0.050575
9,10,I like it.,positive,0.001766,0.005912,0.992323


In [None]:

# EXPLORATORY BLOCK — NEGATIVE REVIEWS BEHAVIOR ANALYSIS
# Inspect how the trained sentiment model behaves on reviews whose star rating
# is 2 or lower (the "negative" class under the rating-to-label mapping).


import json
import pandas as pd

# --- Parameters ---
DATASET_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV.jsonl"
# NOTE(review): the dataset-load cell reads lines until 20,000 reviews pass its
# filters, so it may consume more than 20,000 raw lines. Skipping exactly 20000
# lines here may therefore overlap with the train/test data — confirm.
SKIP_LINES = 20000
N_NEGATIVES = 10

# --- Collect negative examples based on star ratings ---
negative_texts = []

with open(DATASET_PATH, "r") as f:
    for i, line in enumerate(f):
        # Skip the leading part of the file.
        if i < SKIP_LINES:
            continue

        review = json.loads(line)

        if "text" not in review or "rating" not in review:
            continue

        # Keep only reviews rated 2 stars or lower.
        if review["rating"] > 2:
            continue

        text = review["text"].strip()
        if not text:
            continue

        negative_texts.append(text)

        if len(negative_texts) >= N_NEGATIVES:
            break

print(f"Collected {len(negative_texts)} negative reviews.")

# --- Run model predictions on negative reviews ---
negative_results = []

for idx, text in enumerate(negative_texts, 1):
    pred = predict_sentiment(text)

    # Probabilities are indexed in label-id order: 0 negative, 1 neutral, 2 positive.
    result = {
        "example_id": idx,
        "text": text,
        "predicted_label": pred["label"],
        "p_negative": pred["probabilities"][0],
        "p_neutral": pred["probabilities"][1],
        "p_positive": pred["probabilities"][2],
    }

    negative_results.append(result)

    print(f"\n--- Negative Example {idx} ---")
    print("Text:", text)
    print("Prediction:", pred)

# --- Summary table ---
negative_df = pd.DataFrame(negative_results)

print("\nSummary of negative predictions:")
display(negative_df)




Collected 10 negative reviews.

--- Negative Example 1 ---
Text: I was so hoping the critics were wrong! I knew it was a “remake” or “reimagining” of the classic 80’s film “Dirty Rotten Scoundrels” with female leads. I do like Rebel Wilson and Anne Hathaway in other roles, but this was a sad attempt at remaking a really good film.<br />If you must watch it, wait until it hits 99 cent rental status or premium cable.
Prediction: {'label': 'negative', 'probabilities': [0.9701244831085205, 0.026717888191342354, 0.0031577013432979584]}

--- Negative Example 2 ---
Text: Watch the previous version instead. Yuk.
Prediction: {'label': 'negative', 'probabilities': [0.8690094947814941, 0.11588061600923538, 0.015109943225979805]}

--- Negative Example 3 ---
Text: Okay introduction
Prediction: {'label': 'neutral', 'probabilities': [0.09109912812709808, 0.8446558117866516, 0.06424500048160553]}

--- Negative Example 4 ---
Text: So disappointed it’s supposed to be a fan fav totally weird and imo stup

Unnamed: 0,example_id,text,predicted_label,p_negative,p_neutral,p_positive
0,1,I was so hoping the critics were wrong! I knew...,negative,0.970124,0.026718,0.003158
1,2,Watch the previous version instead. Yuk.,negative,0.869009,0.115881,0.01511
2,3,Okay introduction,neutral,0.091099,0.844656,0.064245
3,4,So disappointed it’s supposed to be a fan fav ...,negative,0.988362,0.009102,0.002535
4,5,"not good, dull, had to glue myself in the chai...",negative,0.98991,0.00759,0.002501
5,6,awful,negative,0.990434,0.00633,0.003236
6,7,stupid,negative,0.990884,0.004955,0.004161
7,8,It's not a good move,negative,0.971626,0.024569,0.003805
8,9,They are not good moves,negative,0.982314,0.0143,0.003386
9,10,Do not work,negative,0.989657,0.00677,0.003572


In [None]:
# `examples` maps a true label to the review texts to re-check. No cell in the
# notebook defines it, so it is built here from the lists collected by the
# exploratory cells; this lets the cell run on a fresh kernel.
# NOTE(review): two texts per label matches the recorded output for the negative
# and neutral sections; extend the dict if other labels were intended.
examples = {
    "negative": negative_texts[:2],
    "neutral": neutral_texts[:2],
}

for label, texts in examples.items():
    print(f"\n=== TRUE LABEL: {label.upper()} ===\n")
    for t in texts:
        result = predict_sentiment(t)
        # Show only the first 120 characters of each text.
        print("TEXT:", t[:120], "...")
        print("PRED:", result)
        print("-" * 60)



=== TRUE LABEL: NEGATIVE ===

TEXT: I was so hoping the critics were wrong! I knew it was a “remake” or “reimagining” of the classic 80’s film “Dirty Rotten ...
PRED: {'label': 'negative', 'probabilities': [0.9701244831085205, 0.026717888191342354, 0.0031577013432979584]}
------------------------------------------------------------
TEXT: Watch the previous version instead. Yuk. ...
PRED: {'label': 'negative', 'probabilities': [0.8690094947814941, 0.11588061600923538, 0.015109943225979805]}
------------------------------------------------------------

=== TRUE LABEL: NEUTRAL ===

TEXT: Some of the video was very telling. If shorter it would be better ...
PRED: {'label': 'neutral', 'probabilities': [0.021752430126070976, 0.7807475924491882, 0.19749994575977325]}
------------------------------------------------------------
TEXT: Not a great movie but there are a few good laughs that make the rental worthwhile. ...
PRED: {'label': 'neutral', 'probabilities': [0.0326407328248024, 0.9304335

In [None]:
# ============================================================
# EXPLORATORY ANALYSIS — UNCERTAINTY ON VALID REVIEWS ONLY
# Purpose:
# Identify reviews where the sentiment model is uncertain,
# AFTER applying the same filtering rules used in production.
# This analysis reflects the real system behavior.
# ============================================================

import json
import torch
import numpy as np
from scipy.special import softmax

DATASET_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV.jsonl"

# A review counts as "uncertain" when its highest class probability is below this value.
UNCERTAINTY_THRESHOLD = 0.5
# Number of leading file lines to skip.
# NOTE(review): the dataset-load cell may consume more than 20,000 raw lines
# (it skips filtered reviews), so this offset may overlap training data — confirm.
SKIP_LINES = 20_000
# Stop after this many uncertain reviews have been found.
MAX_RESULTS = 5

def find_uncertain_valid_reviews(path, skip_lines=None, threshold=None, max_results=None):
    """Scan a JSONL review file for reviews the model is uncertain about.

    A review is "uncertain" when its highest class probability is below
    `threshold`. Reviews with empty text, or matching `is_no_experience`, are
    skipped before inference.

    Args:
        path: Path to the JSON Lines review file.
        skip_lines: Leading lines to skip; defaults to the notebook-level SKIP_LINES.
        threshold: Uncertainty cut-off; defaults to UNCERTAINTY_THRESHOLD.
        max_results: Stop once this many reviews are collected; defaults to MAX_RESULTS.

    Returns:
        list of dicts with keys "text", "p_negative", "p_neutral", "p_positive".
    """
    # Fall back to the notebook-level constants when no override is given.
    skip_lines = SKIP_LINES if skip_lines is None else skip_lines
    threshold = UNCERTAINTY_THRESHOLD if threshold is None else threshold
    max_results = MAX_RESULTS if max_results is None else max_results

    uncertain_reviews = []
    # Run on whatever device the model is on instead of assuming "cuda".
    device = model.device

    # Explicit UTF-8 so decoding does not depend on the runtime's default locale.
    with open(path, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx < skip_lines:
                continue

            review = json.loads(line)

            # A missing or null text would crash .strip(); treat it as unusable.
            raw_text = review.get("text")
            if not isinstance(raw_text, str):
                continue

            text = raw_text.strip()
            if not text:
                continue

            # --- Apply production filter ---
            if is_no_experience(text):
                continue

            # --- Tokenization ---
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=256
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # --- Model inference ---
            with torch.no_grad():
                logits = model(**inputs).logits

            # Single-review batch: take row 0 of the softmax output.
            probs = softmax(logits.cpu().numpy(), axis=1)[0]

            # --- Uncertainty condition ---
            if probs.max() < threshold:
                uncertain_reviews.append({
                    "text": text,
                    "p_negative": probs[0],
                    "p_neutral": probs[1],
                    "p_positive": probs[2],
                })

            if len(uncertain_reviews) >= max_results:
                break

    return uncertain_reviews


# --- Run uncertainty analysis (valid reviews only) ---
uncertain_examples = find_uncertain_valid_reviews(DATASET_PATH)

# Print each uncertain review with its three class probabilities (3 decimals).
for i, ex in enumerate(uncertain_examples, 1):
    print(f"\n--- Uncertain VALID Example {i} ---")
    print("Text:", ex["text"])
    print(
        f"Probabilities -> "
        f"Negative: {ex['p_negative']:.3f}, "
        f"Neutral: {ex['p_neutral']:.3f}, "
        f"Positive: {ex['p_positive']:.3f}"
    )

# ============================================================
# END OF EXPLORATORY ANALYSIS
# ============================================================



--- Uncertain VALID Example 1 ---
Text: Eh
Probabilities -> Negative: 0.277, Neutral: 0.428, Positive: 0.295

--- Uncertain VALID Example 2 ---
Text: I don’t think I knew of one person who actually watched Nahnatchka Khan’s Don’t Trust the B---- in Apartment 23 (streaming on Netflix), and man, y’all don’t know what you were missing.  This show is flat-out hysterical.  Who knew James Van der Beek was so good at deadpan humor?  Part of the reason why he is so funny is because he’s playing (a caricature of?) himself.  But the show isn’t really about him.  It’s about, of course, the bitch in apartment 23 and her roommate-for-the-moment, June.<br /><br />So, when you first meet Chloe (a.k.a. “The Bitch”, a.k.a. Krysten Ritter [Breaking Bad]), she’s interviewing candidates to become her new roommate.  She settles on June (Dreama Walker - Gran Torino), a girl from the midwest who comes to New York to blah, blah, blah - it’s not important.  What is important, however, are two things: a) June 

In [None]:
def find_wrong_predictions(path, skip_lines=20000, max_found=5):
    """Collect reviews whose predicted sentiment differs from the star-rating label.

    Args:
        path: Path to the JSON Lines review file.
        skip_lines: Number of leading lines to skip.
        max_found: Stop once this many mismatches have been collected.

    Returns:
        list of `(text, true_label, prediction)` tuples, where `prediction` is
        the dict returned by `predict_sentiment`.
    """
    wrong = []

    # Explicit UTF-8 so decoding does not depend on the runtime's default locale.
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i < skip_lines:
                continue

            review = json.loads(line)
            raw_text = review.get("text")
            rating = review.get("rating")

            # Skip records whose text or rating is missing, null, or of the
            # wrong type; they would crash .strip() or the rating comparison.
            if not isinstance(raw_text, str) or not isinstance(rating, (int, float)):
                continue

            text = raw_text.strip()
            if not text:
                continue

            # Reuse the notebook's rating mapping so the labels here cannot
            # drift from the ones the model was trained on.
            true_label = rating_to_label(rating)

            pred = predict_sentiment(text)

            if pred["label"] != true_label:
                wrong.append((text, true_label, pred))

            if len(wrong) >= max_found:
                break

    return wrong


In [None]:
# Collect misclassified reviews with the function's default limits and display them.
wrong_examples = find_wrong_predictions(DATASET_PATH)
wrong_examples


[('I could not play it on my 3D player because it is not blu ray.  Was a little disapointed but got over it.',
  'positive',
  {'label': 'negative',
   'probabilities': [0.7531252503395081,
    0.234567329287529,
    0.012307458557188511]}),
 ('It was nice to see where the characters are at and how life is going for them. Probably time to wrap it up for them',
  'positive',
  {'label': 'neutral',
   'probabilities': [0.022136490792036057,
    0.6235362887382507,
    0.35432717204093933]}),
 ('Season 6 got a little boring.',
  'positive',
  {'label': 'neutral',
   'probabilities': [0.1296066790819168,
    0.8330605030059814,
    0.037332840263843536]}),
 ('I like it.',
  'neutral',
  {'label': 'positive',
   'probabilities': [0.0017655787523835897,
    0.005911807995289564,
    0.9923226237297058]}),
 ('Content of CD is excellent but quality of disc is inferior,  I could only use successfully one time; second time it skipped, paused and everything in between.  Disappointed in Virginia!!

In [None]:
# Parameters for the bulk prediction run
INPUT_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV.jsonl"
OUTPUT_PATH = "/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV_sentiment_v1.jsonl"

# Maximum number of reviews to classify and write.
MAX_REVIEWS = 50_000
START_LINE = 30_000   # first line to process; intended to lie past the lines used for training. NOTE(review): confirm the dataset-load cell read fewer than 30,000 raw lines.


In [None]:
#The Core of the process

import json
from tqdm import tqdm

def build_sentiment_dataset(
    input_path: str,
    output_path: str,
    start_line: int,
    max_reviews: int
):
    """Classify reviews from a JSONL file and write the results as JSONL.

    Lines before `start_line` are skipped. Records without an "asin" key, or
    without a non-empty string "text", are ignored. Each written record holds
    the asin, the stripped text, the predicted sentiment label and the three
    class probabilities.

    Args:
        input_path: Source JSON Lines file of reviews.
        output_path: Destination JSON Lines file (overwritten if it exists).
        start_line: Zero-based index of the first input line to consider.
        max_reviews: Maximum number of classified reviews to write.
    """
    processed = 0

    # Explicit UTF-8 on both files so behaviour does not depend on the locale.
    with open(input_path, "r", encoding="utf-8") as fin, \
            open(output_path, "w", encoding="utf-8") as fout:
        for i, line in enumerate(tqdm(fin)):
            if i < start_line:
                continue

            if processed >= max_reviews:
                break

            review = json.loads(line)

            if "asin" not in review:
                continue

            # A missing or null text would crash .strip(); skip such records.
            raw_text = review.get("text")
            if not isinstance(raw_text, str):
                continue

            text = raw_text.strip()
            if not text:
                continue

            prediction = predict_sentiment(text)

            # Probabilities are in label-id order: 0 negative, 1 neutral, 2 positive.
            record = {
                "asin": review["asin"],
                "text": text,
                "sentiment": prediction["label"],
                "probs": {
                    "negative": prediction["probabilities"][0],
                    "neutral": prediction["probabilities"][1],
                    "positive": prediction["probabilities"][2]
                }
            }

            fout.write(json.dumps(record) + "\n")
            processed += 1

    print(f"Saved {processed} classified reviews to:")
    print(output_path)


In [None]:
# Execution: classify the configured slice of the dataset and write the output file.
build_sentiment_dataset(
    input_path=INPUT_PATH,
    output_path=OUTPUT_PATH,
    start_line=START_LINE,
    max_reviews=MAX_REVIEWS
)


80001it [04:10, 319.41it/s]

Saved 50000 classified reviews to:
/content/drive/MyDrive/amazon_reviews_2025/Data/Movies_and_TV_sentiment_v1.jsonl



