In [2]:
import numpy as np
import re
import pandas as pd
import os
from bs4 import BeautifulSoup               # Dùng để xóa HTML
import nltk
from nltk.corpus import stopwords     # Danh sách stopwords có sẵn
# TF-IDF Vectorizer của sklearn
train = pd.read_csv("labeledTrainData.tsv", header=0, \
                    delimiter="\t", quoting=3)
test =  pd.read_csv("testData.tsv", header=0, \
                    delimiter="\t", quoting=3)
#Here, "header=0" indicates that the first line of the file contains column names,
#"delimiter=\t" indicates that the fields are separated by tabs, and quoting=3 tells Python to ignore doubled quotes, otherwise you may encounter errors trying to read the file.
train.shape

(25000, 3)

In [3]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [4]:
print(train["review"][0])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [5]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [10]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [None]:
#############CLEANING DATA##################

# Nếu chưa tải stopwords thì mở dòng này 1 lần:
nltk.download('stopwords')
# ========= 1. SAMPLE RAW DATA =========
# ========= 2. HÀM CLEANING =========
def clean_text(text):
    # 2.1 Xóa HTML bằng BeautifulSoup (chính xác hơn Regex)
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2.2 Xóa HTML còn sót lại bằng Regex (dự phòng)
    text = re.sub(r'<.*?>', '', text)

    # 2.3 Chuyển toàn bộ về chữ thường (lowercase) để tránh phân biệt "Học" vs "học"
    text = text.lower()

    # 2.4 Xóa ký tự đặc biệt, số, dấu câu — chỉ giữ lại chữ cái (có hỗ trợ tiếng Việt)
    text = re.sub(r'[^a-zA-Zà-ỹÀ-Ỹ\s]', '', text)

    # 2.5 Xóa khoảng trắng dư thừa
    text = re.sub(r'\s+', ' ', text).strip()

    # 2.6 Xóa stopwords (bao gồm cả tiếng Anh + tiếng Việt tự thêm vào)
    stop_words = set(stopwords.words('english')) | {"là", "và", "của", "đang", "rất", "một", "hơn"}
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

# Áp dụng cleaning cho toàn bộ documents
clean_review = clean_text( train["review"][0] )


stuff going moment mj ive started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mjs feeling towards press also obvious message drugs bad mkayvisually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice himthe actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pescis character ranted wanted people know supplying drugs etc dunno maybe hates mjs musiclots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Initialize an empty list to hold the clean reviews
clean_train_reviews = []

# Loop over each review; create an index i that goes from 0 to the length
# of the movie review list 
for i in range( 0, num_reviews ):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_train_reviews.append( clean_text( train["review"][i] ) )

In [None]:
################## VECTORIZE ################################

In [12]:
train["review"] = clean_train_reviews
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,stuff going moment mj ive started listening mu...
1,"""2381_9""",1,classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,film starts manager nicholas bell giving welco...
3,"""3630_4""",0,must assumed praised film greatest filmed oper...
4,"""9495_8""",1,superbly trashy wondrously unpretentious explo...
