In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_colwidth', -1)

import os
import re
import emoji
import html
import numpy as np

# __Load Dataset__

In [2]:
directory = 'dataset/wisesight-sentiment-master/'
negative_path = os.path.join(directory, 'neg.txt')
neutral_path = os.path.join(directory, 'neu.txt')
positive_path = os.path.join(directory, 'pos.txt')
question_path = os.path.join(directory, 'q.txt')

In [3]:
with open(negative_path) as file:
  negative_texts = file.read().splitlines(False)
  
with open(neutral_path) as file:
  neutral_texts = file.read().splitlines(False)
  
with open(positive_path) as file:
  positive_texts = file.read().splitlines(False)
  
with open(question_path) as file:
  question_texts = file.read().splitlines(False)

In [4]:
cols = ['text', 'label']
sentiment_df = pd.DataFrame(columns=cols)
 
all_texts = (positive_texts, neutral_texts, negative_texts)
all_labels = ('positive', 'neutral', 'negative')

for texts, label in zip(all_texts, all_labels):
  tmp = pd.DataFrame({"text": texts, 'label': label})
  sentiment_df = sentiment_df.append(tmp, ignore_index=True)

# __Data Exploration__

In [5]:
values = sentiment_df['label'].value_counts()

print(f"Positive Sentences: {values['positive']} sentences")
print(f"Neutral Sentences: {values['neutral']} sentences")
print(f"Negative Sentences: {values['negative']} sentences")

Positive Sentences: 4778 sentences
Neutral Sentences: 14573 sentences
Negative Sentences: 6824 sentences


# __Data Preprocessing__

In [6]:
from preprocess import normalize_thai_number, normalize_number, remove_markup_tag, normalize_link, normalize_mention 
from preprocess import normalize_email, normalize_laugh, unescape_html, normalize_emoji, extract_hashtag
from preprocess import normalize_hashtag, replace_with_actual_hashtag, tokenize

In [7]:
sentiment_df['preprocessed'] = sentiment_df['text'].apply(lambda text: text.lower())
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(normalize_thai_number)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(unescape_html)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(remove_markup_tag)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(normalize_link)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(normalize_mention)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(normalize_email)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(normalize_laugh)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(lambda text: normalize_number(text, place_holder=''))
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(normalize_emoji)
sentiment_df['hashtags'] = sentiment_df['preprocessed'].apply(extract_hashtag)
sentiment_df['preprocessed'] = sentiment_df['preprocessed'].apply(lambda text: normalize_hashtag(text, place_holder=''))

In [8]:
from pythainlp.corpus import thai_stopwords
from string import punctuation

stopwords = thai_stopwords()
punctuation += '“” ️'

In [9]:
sentiment_df['tokens'] = sentiment_df['preprocessed'].apply(lambda text: tokenize(text, punctuation=punctuation))
sentiment_df['tokens'] = sentiment_df.apply(lambda row: replace_with_actual_hashtag(row['tokens'], row['hashtags']), axis=1) 

In [10]:
sentiment_df.sample(5)

Unnamed: 0,text,label,preprocessed,hashtags,tokens
7264,อันนี้ดีก่าสา,neutral,อันนี้ดีก่าสา,[],"[อันนี้, ดีก่า, สา]"
9199,สนใจไหมช่วงเมษาไปกัน,neutral,สนใจไหมช่วงเมษาไปกัน,[],"[สนใจ, ไหม, ช่วง, เมษา, ไป, กัน]"
3187,เจ้ๆ เห็นว่าใช้กานิเย่ #กระเป๋าสวยดี 😂😂,positive,เจ้ๆ เห็นว่าใช้กานิเย่ 😂 😂,[#กระเป๋าสวยดี],"[เจ้, ๆ, เห็น, ว่า, ใช้, กา, นิ, เย่, 😂, 😂]"
25525,ถ้าแบบนั้นคงต้องดูยอดขายของ CR-V ตัวเองว่าดีกว่ามั้ย แต่จริง ๆ ผมชอบโฉมนี้มากกว่า แคมรี่ใหม่นะ แต่ถ้าเป็นในเมืองไทย รถราคาประมาณล้านครึ่งถึงล้านปลายแบบแอคคอร์ท ผมก็คงเล่น SUV นั่นแหละ,negative,ถ้าแบบนั้นคงต้องดูยอดขายของ cr-v ตัวเองว่าดีกว่ามั้ย แต่จริง ๆ ผมชอบโฉมนี้มากกว่า แคมรี่ใหม่นะ แต่ถ้าเป็นในเมืองไทย รถราคาประมาณล้านครึ่งถึงล้านปลายแบบแอคคอร์ท ผมก็คงเล่น suv นั่นแหละ,[],"[ถ้า, แบบ, นั้น, คง, ต้อง, ดู, ยอด, ขายของ, ตัวเอง, ว่า, ดีกว่า, มั้ย, แต่, ผม, ชอบ, โฉม, นี้, มากกว่า, แคม, รี่, ใหม่, นะ, แต่, ถ้า, เป็น, ใน, เมือง, ไทย, รถ, ราคา, ประมาณ, ล้าน, ครึ่ง, ถึง, ล้าน, ปลาย, แบบ, แอคคอร์, ท, ผม, ก็, คง, เล่น, suv, นั่นแหละ]"
9158,ไม่มีจำหน่ายค่าาา ^^,neutral,ไม่มีจำหน่ายค่าาา ^^,[],"[ไม่มี, จำหน่าย, ค่า, าา]"


# __Training Sentiment Classifier__

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from preprocess import _return_token

In [12]:
pipeline = Pipeline([
  ('vectorzier', TfidfVectorizer(preprocessor= _return_token, 
                                 tokenizer= _return_token, 
                                 min_df= 5, 
                                 max_df= 0.85,
                                 ngram_range= (1,2)
                                )),
  ('model', LogisticRegression(random_state=0,
                               solver='liblinear',
                               multi_class='ovr',
                               class_weight='balanced'
                              ))])

In [13]:
pipeline.fit(sentiment_df['tokens'], sentiment_df['label'])

Pipeline(memory=None,
         steps=[('vectorzier',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.85, max_features=None,
                                 min_df=5, ngram_range=(1, 2), norm='l2',
                                 preprocessor=<function _return_token at 0x11b2a92f0>,
                                 smooth_idf=True, stop_words=None,
                                 strip_accents=N...
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function _return_token at 0x11b2a92f0>,
                                 use_idf=True, vocabulary=None)),
                ('model',
                 LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          

# __Evaluating Sentiment Classifier__

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [15]:
def kfold_cross_validate_prf1(splits, X, y, pipeline, average_method):
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=0)
    
    accuracy = []
    precision = []
    recall = []
    f1 = []
    
    for train, test in kfold.split(X, y):
        lr_fit = pipeline.fit(X[train], y[train])
        prediction = lr_fit.predict(X[test])
        scores = lr_fit.score(X[test],y[test])
        
        accuracy.append(scores * 100)
        precision.append(precision_score(y[test], prediction, average=average_method)*100)
        recall.append(recall_score(y[test], prediction, average=average_method)*100)
        f1.append(f1_score(y[test], prediction, average=average_method)*100)

        print(classification_report(y[test], prediction, digits=4))

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

In [16]:
kfold_cross_validate_prf1(10, sentiment_df['tokens'], sentiment_df['label'], pipeline, 'weighted')

              precision    recall  f1-score   support

    negative     0.6875    0.7258    0.7061       682
     neutral     0.7681    0.7791    0.7736      1458
    positive     0.5847    0.5126    0.5463       478

    accuracy                         0.7166      2618
   macro avg     0.6801    0.6725    0.6753      2618
weighted avg     0.7136    0.7166    0.7145      2618

              precision    recall  f1-score   support

    negative     0.7117    0.7408    0.7260       683
     neutral     0.7602    0.7831    0.7715      1457
    positive     0.5517    0.4686    0.5068       478

    accuracy                         0.7147      2618
   macro avg     0.6745    0.6642    0.6681      2618
weighted avg     0.7095    0.7147    0.7113      2618

              precision    recall  f1-score   support

    negative     0.7191    0.7423    0.7305       683
     neutral     0.7707    0.8051    0.7875      1457
    positive     0.5959    0.4874    0.5362       478

    accuracy        

# __Predicting Text__

In [22]:
text = 'กระเพราอร่อยมากกกก'
tokens = tokenize(text)
pipeline.predict_proba([tokens])

array([[0.12151496, 0.1676746 , 0.71081044]])

# __Save Trained Model__

In [18]:
import pickle

In [19]:
filename = 'models/tfidf_lr.pkl'

pickle.dump(pipeline, open(filename, 'wb'))