### Python for Data Science and AI Final Project

Product Review Classification for E-commerce

In [None]:
! pip install transformers datasets
! pip install nltk

In [None]:
import re
from functools import reduce

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from datasets import load_dataset
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from transformers import pipeline

# Fetch the 'punkt_tab' NLTK resource. Presumably this is the tokenizer data that
# word_tokenize (used in preprocess_text) needs — confirm for the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
# Pull the "train" split of the Amazon shoe review dataset from the Hugging Face Hub
ds = load_dataset(
    path="SzilvasiPeter/amazon-shoe-review",
    split="train",
)
# Peek at the first five records
ds[:5]

About the dataset

In [None]:
# Build the zero-shot classification pipeline backed by the BART-large MNLI checkpoint
pipe = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

Exploratory Data Analysis

In [None]:
# EDA: read the train split's parquet shard straight into a pandas DataFrame
splits = {
    'train': 'data/train-00000-of-00001-b73d46e3c7526716.parquet',
    'test': 'data/test-00000-of-00001-036b4d091fdd5ccd.parquet',
}
train_parquet_url = "hf://datasets/SzilvasiPeter/amazon-shoe-review/" + splits["train"]
eda_df = pd.read_parquet(train_parquet_url)

# Number of missing values in each column
eda_df.isna().sum()

In [None]:
# Show, in order: the frame itself, its summary statistics, and the count of each label
display(
    eda_df,
    eda_df.describe(),
    eda_df['labels'].value_counts(),
)

In [None]:
# Rating score distribution
# `plt` is matplotlib.pyplot from the notebook's import cell; it must be imported
# before this cell runs, otherwise a fresh-kernel "Run All" stops here with a NameError.

x = eda_df['labels'].unique()
y = eda_df['labels'].value_counts()[x]  # counts re-ordered to line up with x

fig, ax = plt.subplots()
ax.bar(x, y, color='teal')

ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.set_title('Rating Distribution')
ax.set_xticks(sorted(x))  # one tick per rating value that actually occurs

plt.show()

In [None]:
# Rating score vs. average character length
# `plt` is matplotlib.pyplot from the notebook's import cell; it must be imported
# before this cell runs, otherwise a fresh-kernel "Run All" stops here with a NameError.

# Character count of every review (adds a 'text_length' column to eda_df)
eda_df['text_length'] = eda_df['text'].str.len()
avg_text_length = eda_df.groupby('labels')['text_length'].mean()

fig, ax = plt.subplots()
ax.plot(avg_text_length.index, avg_text_length.values, marker='o', linestyle='-', color='teal')

ax.set_xlabel('Rating Score')
ax.set_ylabel('Average Number of Characters in Text')
ax.set_title('Average Character Length per Rating Score')
ax.set_xticks(avg_text_length.index)  # one tick per rating value

ax.grid()
plt.show()

About the model

In [None]:
# Text preprocessing
# Lowercasing and removing special characters

def preprocess_text(text):
    """Normalize a single review string.

    Steps, in order: lowercase the text, replace HTML tags (such as ``<br>``)
    with a space, tokenize with ``word_tokenize``, Porter-stem every token,
    drop each character that is neither alphanumeric nor a space, and collapse
    runs of whitespace.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()  # Convert to lowercase
    # Replace tags with a space instead of deleting them outright, so the words on
    # either side of a tag ("shoes.<br />fit") are not fused into a single token.
    text = re.sub(r"<.*?>", " ", text)
    words = word_tokenize(text)  # Tokenize text
    stemmer = PorterStemmer()
    text = " ".join(stemmer.stem(w) for w in words)  # Stem words
    # Remove special characters: keep only alphanumerics and spaces
    text = "".join(ch for ch in text if ch.isalnum() or ch == " ")
    # Punctuation-only tokens vanish in the step above and leave double spaces
    # behind; squeeze them and trim both ends.
    return " ".join(text.split())

# Run every record's 'text' field through preprocess_text (if needed)
dataset = ds.map(
    lambda example: {'text': preprocess_text(example['text'])}
)
# Peek at the first five preprocessed records
dataset[:5]

Model

In [None]:
# Work on a 200-record window (rows 1000 up to, but not including, 1200)
sample_ds = dataset[slice(1000, 1200)]
sample_ds

In [None]:
# Score every sampled review against the three sentiment labels
# NOTE(review): multi_label=True scores each label independently; only the top label
# is used downstream, so confirm this is preferred over multi_label=False here.
candidate_labels = ['positive', 'negative', 'neutral']
results = pipe(
    sample_ds['text'],
    candidate_labels=candidate_labels,
    multi_label=True,
)
# Peek at the first five predictions
results[:5]

Evaluating the model

In [None]:
# Ground-truth labels of the sampled reviews
true_labels = sample_ds['labels']

# Collapse the numeric labels into sentiment classes: >3 positive, ==3 neutral, otherwise negative
# NOTE(review): assumes 'labels' holds 1-5 star ratings — confirm the dataset does not encode them as 0-4.
true_labels_mapped = [
    'positive' if rating > 3 else ('neutral' if rating == 3 else 'negative')
    for rating in true_labels
]

# The first entry of each result's 'labels' / 'scores' is taken as the prediction and its confidence
predicted_labels = [prediction['labels'][0] for prediction in results]
top_scores = [prediction['scores'][0] for prediction in results]

# Side-by-side view of true vs. predicted sentiment
comparison_df = pd.DataFrame(
    {
        'True Label': true_labels_mapped,
        'Predicted Label': predicted_labels,
        'Confidence Score': top_scores,
    }
)
comparison_df


In [None]:
# Accuracy: share of sampled reviews whose predicted sentiment equals the mapped true label
from sklearn.metrics import accuracy_score, confusion_matrix

accuracy = accuracy_score(
    y_true=true_labels_mapped,
    y_pred=predicted_labels,
)
f"Accuracy: {accuracy * 100:.2f}%"

In [None]:
# Confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Compute the matrix with an explicit class order. Without `labels=`, confusion_matrix
# sorts the classes alphabetically (negative, neutral, positive), which would not
# match the tick labels drawn on the heatmap.
class_order = ['positive', 'negative', 'neutral']
cm = confusion_matrix(true_labels_mapped, predicted_labels, labels=class_order)
tick_labels = [name.upper() for name in class_order]

# Rows are true classes, columns are predicted classes
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

Model : https://huggingface.co/facebook/bart-large-mnli

Dataset : https://huggingface.co/datasets/SzilvasiPeter/amazon-shoe-review

About Zero-Shot Learning : https://joeddav.github.io/blog/2020/05/29/ZSL.html

Hugging Face Zero-Shot Classification: https://huggingface.co/tasks/zero-shot-classification

