## ตัวอย่างการจำแนกความรู้สึกที่แสดงออกทางข้อความภาษาไทย 
(Thai Sentiment Analysis based on Classification Concept)  




ติดตั้ง Library สำหรับจัดการข้อมูลประเภทข้อความภาษาไทย

In [None]:
 # install library for Thai NLP
!pip install pandas matplotlib pythainlp

In [2]:
import re
import string
from pythainlp import word_tokenize
from pythainlp.corpus.common import thai_stopwords

**ขอสิทธิ์การเข้าใช้งาน Colab Notebook ซึ่งเก็บฟอนต์ภาษาไทยที่ใช้แสดงผลไว้**

In [None]:
import os

from google.colab import drive

# Mount Google Drive into the Colab VM, then work from the notebook folder so
# that files referenced by relative path are looked up there.
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/My Drive/Colab Notebooks")

อ่านข้อมูลจากไฟล์ตัวอย่างที่เก็บไว้ใน Github

In [None]:
import pandas as pd

# Tab-separated file read without a header row; the two columns are named
# 'review' and 'sentiment' directly at load time.
DATASET_URL = 'https://github.com/PyThaiNLP/thai-sentiment-analysis-dataset/raw/master/review_shopping.csv'
data = pd.read_csv(
    DATASET_URL,
    sep='\t',
    header=None,
    names=['review', 'sentiment'],
)
print(data)

**ทดสอบให้พิมพ์ stopword ภาษาไทย**

In [None]:
# `thai_stopwords` is imported as a function and rebound here to the list of
# stop words it returns. The callable() guard keeps the cell safe to re-run:
# without it a second run would try to call the list and raise TypeError.
if callable(thai_stopwords):
    thai_stopwords = list(thai_stopwords())
thai_stopwords

# Data Preprocessing  สร้างเป็นฟังก์ชันเพื่อเรียกใช้ซ้ำ

In [6]:
# Patterns are compiled once here instead of on every token of every review.
_URL_PATTERN = re.compile(r"https?://[A-Za-z0-9./]+")
# A run of Arabic or Thai digits (with embedded '.' / ','), optionally followed
# by a percent sign.
_NUMBER_PATTERN = re.compile(r"[0-9\u0E50-\u0E59][0-9\u0E50-\u0E59.,]*%?")
_LATIN_PATTERN = re.compile(r"[A-Za-z]+")
_PUNCTUATION = frozenset(string.punctuation)


def data_preprocessing(cleaned_data, tokenizer=None, stopwords=None):
    """Clean one review and return it as space-separated tokens.

    Steps, in order: strip URLs, tokenize and drop stop words, drop tokens
    that are a single ASCII punctuation character, strip numbers, strip
    Latin-alphabet text. Tokens left empty by the stripping steps are
    discarded so the result has no leading, trailing or doubled spaces.

    Args:
        cleaned_data: Raw review text. Any non-string value (for example
            ``None`` or a NaN produced for a missing cell) yields ``""``.
        tokenizer: Callable mapping a string to an iterable of tokens.
            Defaults to the ``word_tokenize`` imported by this notebook.
        stopwords: Iterable of tokens to drop. Defaults to the notebook's
            ``thai_stopwords``, whether it is still the imported function
            or has already been rebound to a list of words.

    Returns:
        The cleaned review as a single string of tokens joined by one space.
    """
    if not isinstance(cleaned_data, str):
        return ""

    if tokenizer is None:
        tokenizer = word_tokenize
    if stopwords is None:
        # Another cell rebinds `thai_stopwords` from the loader function to a
        # list; accept either so this function does not depend on cell order.
        stopwords = thai_stopwords() if callable(thai_stopwords) else thai_stopwords
    if not isinstance(stopwords, (set, frozenset)):
        # Hash-based membership instead of scanning a list for every token.
        stopwords = frozenset(stopwords)

    # Remove URLs (done before tokenizing so a URL is not broken into pieces).
    text = " ".join(_URL_PATTERN.sub("", w) for w in cleaned_data.split())

    # Remove stop words; the join/split also discards whitespace-only tokens.
    tokens = " ".join(w for w in tokenizer(text) if w not in stopwords).split()

    # Remove punctuation tokens.
    tokens = [w for w in tokens if w not in _PUNCTUATION]

    # Remove numbers. The '%' is optional: by this point a separate '%' token
    # has already been dropped as punctuation, so requiring it would leave
    # plain numbers in the text.
    tokens = [_NUMBER_PATTERN.sub("", w) for w in tokens]

    # Remove English (Latin-alphabet) text.
    tokens = [_LATIN_PATTERN.sub("", w) for w in tokens]

    return " ".join(w for w in tokens if w)


เรียกใช้งานฟังก์ชัน data_preprocessing

In [None]:
# Clean every review and write the result back over the raw text.
cleaned_reviews = data['review'].apply(data_preprocessing)
data['review'] = cleaned_reviews

# Show the data after preprocessing.
print(data)

สร้าง Word Cloud สำหรับ Positive และ Negative รีวิว

In [8]:
from wordcloud import WordCloud
import nltk
import matplotlib.pyplot as plt
# Font file handed to WordCloud as `font_path` in the word-cloud cells. The
# path is relative, so it is resolved against the current working directory.
# NOTE(review): presumably the Thai font kept in the mounted notebook folder
# -- confirm the file exists there.
fp = 'THSarabunNew.ttf'

Word Cloud สำหรับ Positive รีวิว

In [None]:
# Keep only the reviews labelled positive.
data_positive = data[data['sentiment'] == 'pos']

# Count how often each token occurs across the positive reviews.
# word_tokenize() also returns the whitespace between words as tokens; those
# are skipped so that the separator " " is not counted as a word.
frequency_positive_dist = nltk.FreqDist(
    w
    for message in data_positive['review']
    for w in word_tokenize(message)
    if w.strip()
)

# Render the frequencies with the font configured in `fp`.
positive_wcloud = WordCloud(
    background_color='white',
    max_words=2000,
    height=2000,
    width=4000,
    font_path=fp,
).generate_from_frequencies(frequency_positive_dist)

plt.figure(figsize=(12, 8))
plt.imshow(positive_wcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Word Cloud สำหรับ  Negative รีวิว

In [None]:
# Keep only the reviews labelled negative.
data_negative = data[data['sentiment'] == 'neg']

# Count how often each token occurs across the negative reviews.
# word_tokenize() also returns the whitespace between words as tokens; those
# are skipped so that the separator " " is not counted as a word.
frequency_negative_dist = nltk.FreqDist(
    w
    for message in data_negative['review']
    for w in word_tokenize(message)
    if w.strip()
)

# Render the frequencies with the font configured in `fp`.
negative_wcloud = WordCloud(
    background_color='white',
    max_words=2000,
    height=2000,
    width=4000,
    font_path=fp,
).generate_from_frequencies(frequency_negative_dist)

plt.figure(figsize=(12, 8))
# Same interpolation as the positive word cloud so the two plots match.
plt.imshow(negative_wcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# สร้าง TF-IDF vectors

In [11]:
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Encode the sentiment labels as integers; `encoder` is kept so predictions
# can be mapped back to the original labels later.
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(data['sentiment'])

# The reviews are already tokenized and joined with single spaces, so split on
# whitespace instead of using the default token pattern (r"\b\w\w+\b"). Thai
# vowel and tone marks are combining characters that \w does not match, so the
# default pattern cuts Thai words apart or drops them entirely.
tfidf_vect = TfidfVectorizer(analyzer=str.split, max_features=5000)
x_tfidf = tfidf_vect.fit_transform(data['review'])


สร้าง Training และ Testing set

In [12]:
# Split the TF-IDF matrix and the encoded labels into a training set and a
# held-out test set. The fixed random_state makes the split, and therefore
# any accuracy measured on it, reproducible from run to run.
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x_tfidf, y, random_state=42
)


In [None]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Model building and evaluation
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers, each constructed with the library's default settings.
models = [
    ('NB', MultinomialNB()),
    ('Log-reg', LogisticRegression()),
    ('SVC', SVC()),
    ('RandomForest', RandomForestClassifier()),
]

# Fit each model on the training split and record its accuracy on the test
# split; `names` and `results` stay index-aligned for plotting.
results = []
names = []
for name, model in models:
    model.fit(X_train, y_train)
    predicted_class = model.predict(X_test)
    accuracy = accuracy_score(y_test, predicted_class)
    results.append(accuracy)
    names.append(name)
    print("%s: %f " % (name, accuracy))

# Bar chart comparing the test accuracy of the algorithms.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.bar(names, results)

plt.show()

# Apply the best model

In [None]:
# Train a fresh multinomial Naive Bayes classifier on the training split;
# fit() returns the estimator itself, so it can be bound in one step.
model = MultinomialNB().fit(X_train, y_train)
model

Application form for running sentiment analysis on a Thai review, using the trained classification model

In [None]:
#@title Enter Review
review = " \u0E40\u0E22\u0E35\u0E48\u0E22\u0E21\u0E21\u0E32\u0E01" #@param {type:"string"}
# Clean the entered text the same way as the training reviews, vectorize it
# with the already-fitted TF-IDF vectorizer, and classify it.
review = data_preprocessing(review)
xtest_tfidf = tfidf_vect.transform([review])
# Map the predicted integer class back to its original sentiment label.
result = encoder.inverse_transform(model.predict(xtest_tfidf))
# Happy face for a positive prediction, unamused face for anything else.
print("\U0001F600" if result[0] == 'pos' else "\U0001F612")
