<a href="https://colab.research.google.com/github/machiwao/CCMACLRL_PROJECT_COM222/blob/main/project_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis of Shopee Computer Product Reviews

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import seaborn as sns
import re
import os, types

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download('wordnet')
nltk.download('stopwords')

!pip install stanza
import stanza

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting stanza
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.9.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.0-py3-none-any.whl (586 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.14.0 stanza-1.9.2


## Data Loading

In [2]:
df = pd.read_csv("hf://datasets/magixxixx/shopee-product-reviews-on-computer-category/combined-dataset-shuffled-cased-40K.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df.head()

Unnamed: 0,text,sentiment
0,mali yung dineliver 2l order ko naging 1l bulok!,0
1,xl order ko dumating large.nkalagay sa sticker...,0
2,"sobrang bilis ma lowbat, hindi masyado nakaka ...",0
3,first time ko umorder dito. di naman gumagana ...,0
4,"not well packed, wala itong box. hindi sya ma ...",0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       40000 non-null  object
 1   sentiment  40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [5]:
df.isnull().sum()

Unnamed: 0,0
text,0
sentiment,0


In [6]:
df.duplicated().sum()

1

In [7]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
0,20000
1,20000


## Data Preprocessing

In [9]:
df.drop_duplicates(inplace = True)

In [11]:
df['text'] = df['text'].str.lower()

In [12]:
df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ')
df['text'] = df['text'].str.replace('http\S+|www.\S+', '', case=False)
df['text'] = df['text'].str.replace(r'\d+', '')
df['text'] = df['text'].str.replace(r'[^\w\s]+', '')
df['text'] = df['text'].str.strip()
df['text'] = df['text'].str.replace(r'\s+', ' ')
df['text'] = df['text'].str.replace(r'^\s+', '')
df['text'] = df['text'].str.replace(r'\s+$', '')

In [14]:
filipino_stopwords = set([
        'akin', 'aking', 'ako', 'alin', 'am', 'amin', 'aming', 'ang', 'ano',
        'anumang', 'apat', 'at', 'atin', 'ating', 'ay', 'ba', 'bababa', 'bago',
        'bakit', 'bawat', 'bilang', 'dahil', 'dalawa', 'dapat', 'din', 'dito',
        'doon', 'eh', 'gagawin', 'gayunman', 'ginagawa', 'ginawa', 'ginawang',
        'gumawa', 'gusto', 'ha', 'habang', 'hanggang', 'hindi', 'huwag', 'hoy',
        'iba', 'ibaba', 'ibabaw', 'ibig', 'ikaw', 'ilagay', 'ilalim', 'ilan',
        'inyong', 'isa', 'isang', 'itaas', 'ito', 'iyo', 'iyon', 'iyang','iyong', 'ka',
        'kahit', 'kailangan', 'kailanman', 'kami', 'kanila', 'kanilang',
        'kanino', 'kanya', 'kanyang', 'kapag', 'kapwa', 'karamihan',
        'katiyakan', 'katulad', 'kaya', 'kaysa', 'ko', 'kong', 'kulang',
        'kumuha', 'kung', 'laban', 'lahat', 'lamang', 'likod', 'lima', 'maaari',
        'maaaring', 'maging', 'mahusay', 'makita', 'marami', 'marapat',
        'masyado', 'may', 'mayroon', 'mga', 'minsan', 'mismo', 'mula', 'muli', 'na',
        'nabanggit', 'naging', 'nagkaroon', 'nais', 'nakita', 'namin', 'nang',
        'napaka', 'narito', 'nasaan', 'ng', 'ngayon', 'ngunit', 'ni', 'nila', 'nilang',
        'nito', 'niya', 'niyang', 'noon', 'o', 'pa', 'paano', 'pababa',
        'paggawa', 'pagitan', 'pagkakaroon', 'pagkatapos', 'pala', 'palabas',
        'pamamagitan', 'panahon', 'pangalawa', 'para', 'paraan', 'pareho',
        'pataas', 'pero', 'po', 'pumunta', 'pumupunta', 'sa', 'saan', 'sabi',
        'sabihin', 'sarili', 'sila', 'sino', 'siya',
        'tatlo', 'tayo', 'tulad', 'tungkol', 'una', 'walang', 'yan', 'yung', 'username'
    ])

english_stopwords = set(stopwords.words('english'))
combined_stopwords = filipino_stopwords.union(english_stopwords)

df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (combined_stopwords)]))

In [15]:
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
'''porter = PorterStemmer()
df['text'] = df['text'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))'''

"porter = PorterStemmer()\ndf['text'] = df['text'].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))"

## Training the Model

In [16]:
X = df['text']
y = df['sentiment']

In [18]:
X_train, y_train, X_test, y_test = train_test_split(test_size=0.2, random_state=42)

ValueError: not enough values to unpack (expected 4, got 2)