In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')

In [2]:
pip install pandas scikit-learn transformers datasets torch accelerate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
# Load the raw reviews CSV into a DataFrame.
# NOTE(review): the recorded run raised FileNotFoundError for this path; it is
# presumably a Kaggle input mount — confirm the dataset is attached and the path is right.
df = pd.read_csv('/kaggle/input/amazon-product-reviews/Reviews.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/amazon-product-reviews/Reviews.csv'

In [None]:
# List the columns available in the loaded frame.
df.columns

In [None]:
# Count the rows for each distinct 'Score' value before any re-sampling.
df['Score'].value_counts()

In [None]:
# Down-sample each 'Score' group to a fixed size: 8500 rows for scores 3, 4 and 5,
# 12500 rows for every other score.
#
# The groups are iterated explicitly instead of going through `groupby().apply()`:
# recent pandas releases deprecate passing the grouping column to the applied
# function, and once it is excluded 'Score' disappears from the result, which
# breaks every later cell that reads that column.
sampled_groups = []
for score, group in df.groupby('Score'):
    target_rows = 8500 if score in [3, 4, 5] else 12500
    sampled_groups.append(
        group.sample(
            # Never request more rows than the group holds (sample() would raise).
            n=min(target_rows, len(group)),
            random_state=42,  # Ensures reproducibility
        )
    )

# Stitch the per-score samples together (in score order) with a fresh 0..n-1 index.
df = pd.concat(sampled_groups, ignore_index=True)

# Check the new distribution
print(df['Score'].value_counts())

In [None]:
def label_sentiments(rating):
    """Map a review score to a binary sentiment label.

    Scores equal to 3, 4 or 5 are labelled 1; every other value is labelled 0.
    """
    positive_scores = (3, 4, 5)
    return 1 if rating in positive_scores else 0

# Derive the binary 'Sentiment' target from each row's 'Score'.
df["Sentiment"] = df["Score"].apply(label_sentiments)

In [None]:
# Check how many rows fall into each of the two sentiment classes.
df['Sentiment'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets:
# 20% of the rows are held out for testing, with a fixed seed for reproducibility.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Confirm which columns the training split carries.
train_df.columns

In [None]:
from datasets import Dataset
# Convert the pandas splits to Hugging Face Datasets, keeping only the model input
# ('Text') and the target ('Sentiment').
# preserve_index=False keeps the shuffled pandas index left by train_test_split from
# being exported as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df[['Text', 'Sentiment']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['Text', 'Sentiment']], preserve_index=False)


In [None]:
# Peek at the first five training examples.
train_dataset[:5]

In [None]:
!pip install transformers

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

In [None]:
from transformers import DistilBertTokenizerFast

In [None]:
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint.
# `num_labels` is a model option, not a tokenizer option, so it is not passed here;
# it is supplied where the classification model itself is loaded.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


In [None]:
# Tokenize a sentence (quick sanity check of the tokenizer on one example)
input_text = "Transformers are amazing!"
# return_tensors="pt" asks for PyTorch tensors in the returned encoding.
encoded_input = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")

# Print the result
print(encoded_input)

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'Text' field of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=256,
    )
    return tokenizer(examples['Text'], **tokenizer_options)

In [None]:
# Tokenize the datasets
# batched=True hands tokenize_function a batch of examples per call instead of one row.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Encode labels
def encode_labels(examples):
    """Normalise the 'Sentiment' values of a batch to plain Python ints.

    The 'Sentiment' column already holds the 0/1 labels produced by
    `label_sentiments`, so no lookup table is needed; the values are only cast to
    `int`.

    Args:
        examples: mapping (batch) that contains a 'Sentiment' sequence.

    Returns:
        The same mapping, with 'Sentiment' replaced by a list of ints.
    """
    # The column is named 'Sentiment' (capital S); reading 'sentiment' raised KeyError.
    examples['Sentiment'] = [int(sentiment) for sentiment in examples['Sentiment']]
    return examples

# Run encode_labels over both tokenized splits, one batch at a time.
tokenized_train = tokenized_train.map(encode_labels, batched=True)
tokenized_test = tokenized_test.map(encode_labels, batched=True)


In [None]:
# Display the tokenized training split to check its columns and size.
tokenized_train

In [None]:
# Add the 'Sentiment' column as 'labels'
# NOTE: re-running this cell without re-creating tokenized_train is expected to fail,
# because the 'labels' column then already exists.
tokenized_train = tokenized_train.add_column('labels', tokenized_train['Sentiment'])

In [None]:
# Add the 'Sentiment' column as 'labels' (same step as for the training split)
tokenized_test = tokenized_test.add_column('labels', tokenized_test['Sentiment'])

In [None]:
# Remove unnecessary columns: the raw text and the original 'Sentiment' column are
# dropped now that the tokenizer outputs and the 'labels' copy exist.
tokenized_train = tokenized_train.remove_columns(['Text', 'Sentiment'])
tokenized_test = tokenized_test.remove_columns(['Text', 'Sentiment'])

In [None]:
from transformers import DistilBertForSequenceClassification
# Load the pre-trained DistilBERT model with a 2-class sequence-classification head
# (num_labels=2 matches the binary 0/1 'labels' column).
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


In [None]:
# Define Training Arguments
from transformers import Trainer, TrainingArguments
# Fine-tuning configuration: outputs go to ./results and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs'
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (reference labels) and `predictions`
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": weighted_f1,
    }

In [None]:
# Create a Trainer
# Wires together the model, the training configuration, the metric callback and both
# tokenized splits; tokenized_test is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

In [None]:
# Train the model (fine-tunes `model` according to training_args)
trainer.train()

In [None]:
# Save the model to ./DistilBert_Model, the directory the inference cells below load from
model.save_pretrained('./DistilBert_Model')


In [None]:
# Save the tokenizer next to the model so both can be reloaded from one directory.
tokenizer.save_pretrained('./DistilBert_Model')

In [None]:
import os

# Check the current working directory (relative paths such as './DistilBert_Model'
# are resolved against it)
print(os.getcwd())

In [None]:
# List files in the working directory
# NOTE(review): the path is hard-coded to '/kaggle/working'; this only matches the
# current working directory when the notebook runs from there — confirm.
print(os.listdir('/kaggle/working'))

In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load the saved model and tokenizer
model_path = './DistilBert_Model'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Function to prepare input data for the model
def prepare_input(review):
    """Tokenize `review` into PyTorch tensors, truncated to at most 256 tokens."""
    return tokenizer(review, padding=True, truncation=True, max_length=256, return_tensors='pt')

# Sentiment mapping (class index predicted by the model -> human-readable label)
sentiment_mapping = {0: 'negative', 1: 'positive'}

# Interactive loop for user input
print("Welcome to the Sentiment Analysis App!")
print("Type your review and press Enter to get the sentiment.")
print("Type 'exit()' to quit the application.")

while True:
    # Surrounding whitespace carries no meaning for either the command or the review.
    user_input = input("\nEnter review: ").strip()

    # Exit condition
    if user_input.lower() == 'exit()':
        print("Exiting the application.")
        break

    # Blank input: there is nothing to classify, so do not run the model on it.
    if not user_input:
        print("Please enter a review before predicting.")
        continue

    # Prepare input data
    encodings = prepare_input(user_input)

    # Make prediction (no gradients are needed for inference)
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits

    # Get predicted label
    prediction = torch.argmax(logits, dim=1).item()
    predicted_sentiment = sentiment_mapping[prediction]

    # Display the result
    print(f"Predicted Sentiment: {predicted_sentiment}")

In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load the saved model and tokenizer
model_path = './DistilBert_Model'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Function to prepare input data for the model
def prepare_input(reviews):
    """Tokenize a list of reviews into one padded batch of PyTorch tensors.

    Each review is truncated to at most 256 tokens; shorter ones are padded to the
    longest review in the batch.
    """
    return tokenizer(reviews, padding=True, truncation=True, max_length=256, return_tensors='pt')

# Sentiment mapping (class index predicted by the model -> human-readable label)
sentiment_mapping = {0: 'negative', 1: 'positive'}

# Interactive loop for user input
print("Welcome to the Sentiment Analysis App!")
print("Type your reviews separated by commas and press Enter to get the sentiments.")
print("Type 'exit()' to quit the application.")

while True:
    user_input = input("\nEnter reviews (comma separated): ")

    # Exit condition (surrounding whitespace is ignored)
    if user_input.strip().lower() == 'exit()':
        print("Exiting the application.")
        break

    # Split user input into a list of reviews, dropping empty segments such as the
    # ones produced by a trailing comma or a blank line.
    # NOTE: a review that itself contains a comma is still split into several reviews.
    reviews = [review.strip() for review in user_input.split(',') if review.strip()]

    # Nothing to classify: skip the model call instead of scoring empty strings.
    if not reviews:
        print("Please enter a review before predicting.")
        continue

    # Prepare input data
    encodings = prepare_input(reviews)

    # Make predictions (no gradients are needed for inference)
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits

    # Get predicted labels for all reviews
    predictions = torch.argmax(logits, dim=1).tolist()
    predicted_sentiments = [sentiment_mapping[prediction] for prediction in predictions]

    # Display the results
    for review, sentiment in zip(reviews, predicted_sentiments):
        print(f"Review: '{review}' -> Predicted Sentiment: {sentiment}")


In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch.nn.functional as F

# Load the saved (fine-tuned) model and tokenizer
model_path = './DistilBert_Model'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Example reviews (replace with your actual test data)
test_reviews = [
    "I love this product!",     # True label: positive
    "This is the worst service ever!",  # True label: negative
    "Absolutely fantastic experience!",  # True label: positive
    "Not what I expected."     # True label: negative
]

# Function to prepare input data for the model
def prepare_input(reviews):
    """Tokenize a list of reviews into one padded batch of PyTorch tensors (max 256 tokens each)."""
    return tokenizer(reviews, padding=True, truncation=True, max_length=256, return_tensors='pt')

# Prepare input data for the test reviews
encodings = prepare_input(test_reviews)

# Make predictions (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**encodings)
    logits = outputs.logits

# Calculate probabilities using softmax over the class dimension (dim=1)
probabilities = F.softmax(logits, dim=1)

# Display the probabilities for each review
for review, prob in zip(test_reviews, probabilities):
    print(f"Review: '{review}'")
    print(f"Probabilities: {prob.tolist()}")  # List of probabilities for each class
    print(f"Predicted Sentiment: {torch.argmax(prob).item()}")  # Predicted class index (not the label name)
    print("------")

In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score

# Load the saved (fine-tuned) model and tokenizer
model_path = './DistilBert_Model'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Example test dataset (replace with your actual test data)
test_reviews = [
    "I love this product!",        # True label: positive
    "This is the worst service ever!",  # True label: negative
    "Absolutely fantastic experience!",  # True label: positive
    "Not what I expected."          # True label: negative
]
true_labels = [1, 0, 1, 0]  # Corresponding true labels (1 for positive, 0 for negative)

# Function to prepare input data for the model
def prepare_input(reviews):
    """Tokenize a list of reviews into one padded batch of PyTorch tensors (max 256 tokens each)."""
    return tokenizer(reviews, padding=True, truncation=True, max_length=256, return_tensors='pt')

# Prepare input data for the test reviews
encodings = prepare_input(test_reviews)

# Make predictions (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**encodings)
    logits = outputs.logits

# Get predicted labels for all reviews
predictions = torch.argmax(logits, dim=1).tolist()

# Calculate accuracy over the four hand-written examples above
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch.nn.functional as F

# Load the base 'distilbert-base-uncased' checkpoint instead of the fine-tuned model
# saved in './DistilBert_Model' — presumably as a non-fine-tuned baseline for
# comparison (NOTE(review): confirm this is intentional).
#model_path = './DistilBert_Model'
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Example reviews (replace with your actual test data)
test_reviews = [
    "I love this product!",     # True label: positive
    "This is the worst service ever!",  # True label: negative
    "Absolutely fantastic experience!",  # True label: positive
    "Not what I expected."     # True label: negative
]

# Function to prepare input data for the model
def prepare_input(reviews):
    """Tokenize a list of reviews into one padded batch of PyTorch tensors (max 256 tokens each)."""
    return tokenizer(reviews, padding=True, truncation=True, max_length=256, return_tensors='pt')

# Prepare input data for the test reviews
encodings = prepare_input(test_reviews)

# Make predictions (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**encodings)
    logits = outputs.logits

# Calculate probabilities using softmax over the class dimension (dim=1)
probabilities = F.softmax(logits, dim=1)

# Display the probabilities for each review
for review, prob in zip(test_reviews, probabilities):
    print(f"Review: '{review}'")
    print(f"Probabilities: {prob.tolist()}")  # List of probabilities for each class
    print(f"Predicted Sentiment: {torch.argmax(prob).item()}")  # Predicted class index (not the label name)
    print("------")

In [None]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score

# Load the base 'distilbert-base-uncased' checkpoint instead of the fine-tuned model
# saved in './DistilBert_Model' — presumably as a non-fine-tuned baseline for
# comparison (NOTE(review): confirm this is intentional).
#model_path = './DistilBert_Model'
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Example test dataset (replace with your actual test data)
test_reviews = [
    "I love this product!",        # True label: positive
    "This is the worst service ever!",  # True label: negative
    "Absolutely fantastic experience!",  # True label: positive
    "Not what I expected."          # True label: negative
]
true_labels = [1, 0, 1, 0]  # Corresponding true labels (1 for positive, 0 for negative)

# Function to prepare input data for the model
def prepare_input(reviews):
    """Tokenize a list of reviews into one padded batch of PyTorch tensors (max 256 tokens each)."""
    return tokenizer(reviews, padding=True, truncation=True, max_length=256, return_tensors='pt')

# Prepare input data for the test reviews
encodings = prepare_input(test_reviews)

# Make predictions (no gradients are needed for inference)
with torch.no_grad():
    outputs = model(**encodings)
    logits = outputs.logits

# Get predicted labels for all reviews
predictions = torch.argmax(logits, dim=1).tolist()

# Calculate accuracy over the four hand-written examples above
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


In [None]:
import streamlit as st
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch.nn.functional as F

# Load the saved model and tokenizer
model_path = './DistilBert_Model'  # Adjust the path as needed
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Set up the Streamlit app
st.title("Sentiment Analysis App")
st.write("Enter a review below and click 'Predict' to get the sentiment.")

# Input field for user review
user_input = st.text_area("Enter your review:")

# Button to make a prediction
if st.button("Predict"):
    if user_input:
        # Prepare input data
        encodings = tokenizer(user_input, padding=True, truncation=True, max_length=256, return_tensors='pt')

        # Make prediction
        with torch.no_grad():
            outputs = model(**encodings)
            logits = outputs.logits

        # Calculate probabilities and get predicted label
        probabilities = F.softmax(logits, dim=1)
        prediction = torch.argmax(probabilities).item()

        # Sentiment mapping
        sentiment_mapping = {0: 'negative', 1: 'positive'}
        predicted_sentiment = sentiment_mapping[prediction]

        # Display the result
        st.write(f"Predicted Sentiment: **{predicted_sentiment}**")
        st.write(f"Probabilities: {probabilities.tolist()}")
    else:
        st.write("Please enter a review before predicting.")

# Run the app using `streamlit run app.py` in the terminal


In [None]:
!streamlit run /opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py