In [1]:
!pip install datasets transformers sentence-transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cu

### **Data Acquisition**

<img src="https://pentagram-production.imgix.net/b33a5493-679c-421a-9970-2dfea45a6162/emo_rottentomatoes_01.jpg?rect=120%2C0%2C2700%2C1688&w=1500&fit=crop&fm=jpg&q=70&auto=format&h=935" alt="Icona" width="375" height="225">

In [2]:
from datasets import load_dataset
# Load our data
data = load_dataset("rotten_tomatoes")
data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [3]:
len(data["train"])

8530

In [4]:
# Print the first 5 examples in the training set
print("\Training Set Examples:")
for i in range(5):
  print(data["train"][i])
  print("-" * 20) # Separator between examples

# Print some examples from the validation set
print("\nValidation Set Examples:")
for i in range(2): # Print two examples
  print(data["validation"][i])
  print("-" * 20)

# Access and print specific fields of an example
example_index = 10
print(f"\nExample {example_index} from training set:")
print(f"Text: {data['train'][example_index]['text']}")
print(f"Label: {data['train'][example_index]['label']}")

\Training Set Examples:
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
--------------------
{'text': 'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .', 'label': 1}
--------------------
{'text': 'effective but too-tepid biopic', 'label': 1}
--------------------
{'text': 'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .', 'label': 1}
--------------------
{'text': "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .", 'label': 1}
--------------------

Validation Set Examples:
{'text': 'compassionately explores the seemingly irreconcilable situation between c

### **Text Preprocessing**

In [8]:
# Download required NLTK data
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
italian_stopwords = stopwords.words('italian')
print("Some italian stopwords:", italian_stopwords[:10])
english_stopwords = stopwords.words('english')
print("\nSome english stopwords:", english_stopwords[:10])

Some italian stopwords: ['ad', 'al', 'allo', 'ai', 'agli', 'all', 'agl', 'alla', 'alle', 'con']

Some english stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


#### Importing libraries





In [11]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.stem import PorterStemmer
from sklearn.utils import shuffle
import re
import pandas as pd

In [12]:
def preprocess_text(text):
    """
    Clean and preprocess the input text by applying several NLP techniques.
    Steps:
    1. Convert to lowercase
    2. Remove special characters and numbers
    3. Remove stopwords
    4. Apply stemming
    """
    # Convert to lowercase (standardizes the text)
    text = text.lower()

    # Remove special characters and numbers (keeps only letters and spaces)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stopwords (common words like 'the', 'is', 'at' that don't carry much meaning)
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]

    # Apply stemming (reduce words to their root/stem form)
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into a single string
    return ' '.join(tokens)

#### example

In [13]:
# Example usage of preprocess_text function
example_texts = [
    "This is an EXAMPLE sentence with some special characters like !@#$%^&*() and numbers 123.",
    "Another example, this time with more STOPWORDS and punctuation.",
    "A third example, showing the EFFECT of stemming on words like running, runs, process, processing, processed.",
    "This is a sentence with mixed case and some common words.",
    "Final example:  Testing with different types of punctuation and capitalization."
]


for text in example_texts:
    processed_text = preprocess_text(text)
    print(f"Original text: {text}")
    print(f"Processed text: {processed_text}")
    print("-" * 20)

Original text: This is an EXAMPLE sentence with some special characters like !@#$%^&*() and numbers 123.
Processed text: exampl sentenc special charact like number
--------------------
Original text: Another example, this time with more STOPWORDS and punctuation.
Processed text: anoth exampl time stopword punctuat
--------------------
Original text: A third example, showing the EFFECT of stemming on words like running, runs, process, processing, processed.
Processed text: third exampl show effect stem word like run run process process process
--------------------
Original text: This is a sentence with mixed case and some common words.
Processed text: sentenc mix case common word
--------------------
Original text: Final example:  Testing with different types of punctuation and capitalization.
Processed text: final exampl test differ type punctuat capit
--------------------


#### Preprocess training and test data

In [14]:
train_df = pd.DataFrame({
    'text': data['train']['text'],
    'label': data['train']['label']
})

test_df = pd.DataFrame({
    'text': data['test']['text'],
    'label': data['test']['label']
})

# Shuffle both datasets
train_df = shuffle(train_df, random_state=42)
test_df = shuffle(test_df, random_state=42)

In [15]:
print("Preprocessing training data...")
X_train = [preprocess_text(text) for text in train_df['text']]
y_train = train_df['label'].values

Preprocessing training data...


In [16]:
print("\nFirst 5 elements of data['train']['text']:")
for i in range(5):
    print(data['train']['text'][i])
    print("-" * 20)

print("First 5 elements of X_train:")
for i in range(5):
    print(X_train[i])
    print("-" * 20)



First 5 elements of data['train']['text']:
the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .
--------------------
the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .
--------------------
effective but too-tepid biopic
--------------------
if you sometimes like to go to the movies to have fun , wasabi is a good place to start .
--------------------
emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .
--------------------
First 5 elements of X_train:
would take complet moron foul screen adapt oscar wild classic satir
--------------------
got ten littl indian meet friday th way clean sober film set carpent thing load actor your like

In [17]:
# Preprocess test data
print("Preprocessing test data...")
X_test = [preprocess_text(text) for text in test_df['text']]
y_test = test_df['label'].values

Preprocessing test data...


### **Text Preprocessing**

In [18]:
print("Converting text to count vectors...")
vectorizer = CountVectorizer(max_features=5000)
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

Converting text to count vectors...


In [19]:
from pprint import pprint
# Print the vocabulary (mapping of words to indices)
vocabulary = vectorizer.vocabulary_
print("\nVocabulary (word -> index mapping):")
pprint(dict(list(sorted(vocabulary.items()))[:20]))


Vocabulary (word -> index mapping):
{'abandon': 0,
 'abc': 1,
 'abil': 2,
 'abl': 3,
 'abli': 4,
 'aboveaverag': 5,
 'absenc': 6,
 'absolut': 7,
 'absorb': 8,
 'abstract': 9,
 'absurd': 10,
 'absurdist': 11,
 'absurdli': 12,
 'abund': 13,
 'abus': 14,
 'academi': 15,
 'accent': 16,
 'accept': 17,
 'access': 18,
 'accompani': 19}


### **Feature Engineering**

#### Convert count vectors to TF-IDF representation


In [20]:
print("Converting to TF-IDF representation...")
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

Converting to TF-IDF representation...


In [21]:
# Esempio di dati (aggiungi il tuo dataset qui)
documents = [
    "The cat sat on the mat.",
    "The dog barked at the postman.",
    "The quick brown fox jumped over the lazy dog."
]

# Step 1: Create a CountVectorizer and fit-transform the data to get word counts
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(documents)

# Step 2: Transform word counts to TF-IDF representation
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Step 3: Show vocabulary
vocabulary = count_vectorizer.get_feature_names_out()
print("\nVocabulary:")
print(vocabulary)

# Step 4: Print an example document and its transformations
doc_index = 0  # Change to 1 or 2 to see other examples
print(f"\nOriginal Document: {documents[doc_index]}")

# Word counts
print("\nWord Counts (Document-Term Matrix row):")
pprint(dict(zip(vocabulary, X_counts[doc_index].toarray().flatten())))

# TF-IDF representation
print("\nTF-IDF Representation (Document-Term Matrix row):")
pprint(dict(zip(vocabulary, X_tfidf[doc_index].toarray().flatten())))


Vocabulary:
['at' 'barked' 'brown' 'cat' 'dog' 'fox' 'jumped' 'lazy' 'mat' 'on' 'over'
 'postman' 'quick' 'sat' 'the']

Original Document: The cat sat on the mat.

Word Counts (Document-Term Matrix row):
{'at': 0,
 'barked': 0,
 'brown': 0,
 'cat': 1,
 'dog': 0,
 'fox': 0,
 'jumped': 0,
 'lazy': 0,
 'mat': 1,
 'on': 1,
 'over': 0,
 'postman': 0,
 'quick': 0,
 'sat': 1,
 'the': 2}

TF-IDF Representation (Document-Term Matrix row):
{'at': 0.0,
 'barked': 0.0,
 'brown': 0.0,
 'cat': 0.4305184979719882,
 'dog': 0.0,
 'fox': 0.0,
 'jumped': 0.0,
 'lazy': 0.0,
 'mat': 0.4305184979719882,
 'on': 0.4305184979719882,
 'over': 0.0,
 'postman': 0.0,
 'quick': 0.0,
 'sat': 0.4305184979719882,
 'the': 0.5085423203783267}


### **Modelling & Evaluation**




In [22]:
print("Training the Naive Bayes classifier...")
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

Training the Naive Bayes classifier...


#### Inference


In [23]:
print("Making predictions...")
y_pred = classifier.predict(X_test_tfidf)

Making predictions...


In [24]:
results_df = pd.DataFrame({
    'Text': data['test']['text'][:10],
    'True Label': ['positive' if label == 1 else 'negative' for label in y_test[:10]],
    'Predicted Label': ['positive' if label == 1 else 'negative' for label in y_pred[:10]],
    'Correct?': y_test[:10] == y_pred[:10]
})
# Display the table with better formatting
pd.set_option('display.max_colwidth', 50)  # Truncate long text for better display
print("\10 Predictions:")
print(results_df.to_string(index=False))

 Predictions:
                                                                                                                                                             Text True Label Predicted Label  Correct?
                             lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .   positive        negative     False
                                                                                                                            consistently clever and suspenseful .   negative        positive     False
                           it's like a " big chill " reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .   positive        positive      True
                            the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .   negative        positive 

#### Evaluate the model

In [25]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.77      0.77       533
           1       0.77      0.77      0.77       533

    accuracy                           0.77      1066
   macro avg       0.77      0.77      0.77      1066
weighted avg       0.77      0.77      0.77      1066

