<a href="https://colab.research.google.com/github/sangcamap/reviews_prediction/blob/main/reviews_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Mount Google Drive inside the Colab runtime so project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder; later cells use relative paths
# ('./train.tsv', './output.tsv').
%cd /content/drive/My Drive/AI/reviews_prediction

Mounted at /content/drive
/content/drive/My Drive/AI/reviews_prediction


In [4]:
# Load the tab-separated review dataset.
# quoting = 3 is csv.QUOTE_NONE: quote characters inside a review are treated as
# ordinary text instead of field delimiters.
data = pd.read_csv('./train.tsv', sep = '\t', quoting = 3)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Clean text data

## Loại bỏ stopword ra khỏi câu

In [5]:
import nltk
import re

In [6]:
# Download the NLTK stopword corpus; nltk.corpus.stopwords needs it on disk.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.corpus import stopwords 

In [8]:
# Show the first raw review; the next cells use it as a worked example of each cleaning step.
data['Review'][0]

'Wow... Loved this place.'

In [9]:
# Replace every character that is not an ASCII letter (punctuation, digits, ...) with a space.
review = re.sub('[^a-zA-Z]', ' ', data['Review'][0])

In [10]:
# Display the text after non-letter characters were replaced by spaces.
review 

'Wow    Loved this place '

In [11]:
# Tokenize on whitespace (runs of spaces collapse, so no empty tokens are produced).
# NOTE: `review` is rebound from str to list, so this cell is not idempotent —
# running it a second time raises AttributeError because lists have no .split().
review = review.split()

In [12]:
# Display the token list.
review

['Wow', 'Loved', 'this', 'place']

In [13]:
# Drop English stopwords from the token list (explicit-loop version).
# The stopword list is built once and stored in a set: calling
# stopwords.words('english') inside the loop rebuilds the whole list for every
# token and then scans it linearly.
# NOTE(review): the comparison is case-sensitive and the tokens have not been
# lowercased at this point; the NLTK list appears to be lowercase, so a
# capitalized stopword such as "This" would presumably survive — confirm.
english_stopwords = set(stopwords.words('english'))
preview = []
for word in review:
  if word not in english_stopwords:
    preview.append(word)

In [14]:
# Display the tokens that remain after stopword removal.
preview

['Wow', 'Loved', 'place']

In [15]:
# Shorter form: the same stopword filter written as a list comprehension.
# The stopword list is loaded once into a set instead of being rebuilt by
# stopwords.words('english') for every token.
english_stopwords = set(stopwords.words('english'))
review = [word for word in review if word not in english_stopwords]
review

['Wow', 'Loved', 'place']

## Stemmer 

In [16]:
# Porter stemmer instance, reused by the stemming cells below.
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

In [17]:
# Reduce each token to its Porter stem (the recorded output shows the stems
# also come back lowercased, e.g. 'Loved' -> 'love').
review = list(map(p_stemmer.stem, review))
review

['wow', 'love', 'place']

In [18]:
# Re-assemble the stemmed tokens into a single space-separated string,
# the form collected into `corpus` and passed to CountVectorizer later on.
review  = " ".join(review)
review

'wow love place'

## Clean all reviews

In [19]:
# Apply the cleaning pipeline demonstrated above to every review:
# letters only -> lowercase -> tokenize -> drop stopwords -> stem -> re-join.
#
# The stopword list is loaded once into a set; calling stopwords.words('english')
# inside the comprehension rebuilds the list for every word of every review.
#
# NOTE(review): stopword removal also drops negations — the recorded output shows
# "Crust is not good." reduced to "crust good". That likely hurts sentiment
# accuracy; consider keeping words such as "not" in a follow-up change.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of range(len(data)) with a
# label lookup, which only works while the index is the default 0..n-1.
for raw_review in data['Review']:
  tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
  stems = [p_stemmer.stem(word) for word in tokens if word not in english_stopwords]
  corpus.append(" ".join(stems))

In [20]:
# Spot-check the first five cleaned reviews.
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

# Mô hình Bag of Words

In [21]:
from sklearn.feature_extraction.text import CountVectorizer 

In [22]:
# Bag-of-words vectorizer whose vocabulary is capped at 1000 terms.
cv = CountVectorizer(max_features= 1000)

In [23]:
# Learn the vocabulary from the whole corpus and build the dense document-term
# matrix (one row per review, one column per vocabulary term).
# NOTE(review): the vocabulary is fitted before the train/test split further down,
# so the held-out reviews also contribute to it.
x = cv.fit_transform(corpus).toarray()
x.shape

(1000, 1000)

In [24]:
# Target labels: the second column of `data`, selected by position
# (the 0/1 `Liked` values, as the output below shows).
y = data.iloc[:,1].values
y[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1])

# Naive Bayes

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# Sanity-check the sizes of the two partitions.
x_train.shape, x_test.shape

((800, 1000), (200, 1000))

In [50]:
# Classifier selection: exactly one option is active at a time. The commented-out
# alternatives were swapped in by hand; the accuracy obtained with each one is
# noted in a comment cell further down.
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()

# from sklearn.naive_bayes import MultinomialNB
# classifier = MultinomialNB()

# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors=3)

# Active choice: support-vector classifier with scikit-learn's default settings.
from sklearn.svm import SVC
classifier = SVC()

In [51]:
# Train the selected classifier on the training partition only.
classifier.fit(x_train, y_train)

SVC()

In [64]:
# Predict labels for the held-out test partition.
y_pred = classifier.predict(x_test)

In [65]:
from sklearn.metrics import accuracy_score 

# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.73

In [49]:
# Accuracy noted for each classifier tried in the selection cell above
# (presumably on the same 200-review test split — the SVC figure matches the
# accuracy_score output):
# GaussianNB: 73%
# MultinomialNB: 76%
# KNeighborsClassifier: 61%
# SVC: 73%

# Phân loại

In [66]:
# Predict a label for every review and store it next to the original columns.
# The training rows are included, so agreement between `pred_label` and `Liked`
# in this table is not a measure of generalization.
all_predictions = classifier.predict(x)
data['pred_label'] = all_predictions.tolist()
data.head()

Unnamed: 0,Review,Liked,pred_label
0,Wow... Loved this place.,1,1
1,Crust is not good.,0,1
2,Not tasty and the texture was just nasty.,0,0
3,Stopped by during the late May bank holiday of...,1,1
4,The selection on the menu was great and so wer...,1,1


In [68]:
# Write the reviews together with their predicted labels to a tab-separated file,
# without the DataFrame index.
data.to_csv("./output.tsv", sep = "\t", encoding = "UTF-8", index = False)