In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
path = 'drive/MyDrive/'

In [None]:
!pip install pythainlp
!pip install emoji
!pip install pycrfsuite-spacing

In [2]:
import pandas as pd
from pandas import DataFrame
import pythainlp
from pythainlp.tokenize import word_tokenize
from pythainlp.util import normalize
from pythainlp.corpus.common import thai_stopwords
from pythainlp import thai_punctuations
import numpy as np
import math
import emoji
import string
import unicodedata
from pythainlp.ulmfit import *
from sklearn.model_selection import train_test_split

In [None]:
# Define proprocessing function 
def preprocessing(text):
  text = unicodedata.normalize("NFKD", text)
  # step 1: word tokenization 
  token = word_tokenize(text, engine="longest", keep_whitespace=False)
  # step 2: word normalization 
  normalized_token = []
  for item in token:
    normalized_token.append(normalize(item))
  #step 3: remove stop words
  # stopwords = thai_stopwords()
  # woStopword_token = []
  # for item in normalized_token:
  #   if item not in stopwords:
  #     woStopword_token.append(item)
  # in this dataset has better performance without removing stopword
  #step 4: remove punctuation
  en_punctuation = string.punctuation
  th_punctuation = thai_punctuations
  punctuation = en_punctuation+th_punctuation
  final_token = []
  for item in normalized_token:
	  if item not in punctuation:
		  final_token.append(item) 
  return final_token

In [None]:
all_df = pd.read_csv(path+'Training_data.csv',names=["sentiment", "word"])

all_df.head()

Unnamed: 0,sentiment,word
0,neu,🚗💨💨 ซิ่งเป็นบางเวลา ซ่อกแซ่กได้ทุกที่ << ขับสน...
1,pos,สนใจ ฟอจูนเนอร์ สีขาวครับ
2,neg,Nissan คงใกล้จบแล้วแน่ๆ
3,neu,สุดเท่กับชุดแต่ง RS มาพร้อมภายนอก... สปอร์ต ดุ...
4,pos,แดงโดนใจจริงๆ 😲 Honda Civic Hatchback Rallye R...


In [None]:
all_df['sentiment'].unique()

array(['neu', 'pos', 'neg', 'q'], dtype=object)

In [None]:
for idx,row in all_df.iterrows():
  all_df.at[idx,'tokenize']=all_df.at[idx,'word']

In [None]:
all_df['tokenize'] = all_df['tokenize'].apply(lambda x:preprocessing(x))
all_df.head()

Unnamed: 0,sentiment,word,tokenize
0,neu,🚗💨💨 ซิ่งเป็นบางเวลา ซ่อกแซ่กได้ทุกที่ << ขับสน...,"[🚗💨💨, ซิ่ง, เป็น, บางเวลา, ซ่อก, แซ่, ก, ได้, ..."
1,pos,สนใจ ฟอจูนเนอร์ สีขาวครับ,"[สนใจ, ฟอ, จูน, เนอ, ร์, สี, ขาว, ครับ]"
2,neg,Nissan คงใกล้จบแล้วแน่ๆ,"[nissan, คง, ใกล้, จบ, แล้ว, แน่ๆ]"
3,neu,สุดเท่กับชุดแต่ง RS มาพร้อมภายนอก... สปอร์ต ดุ...,"[สุด, เท่, กับ, ชุด, แต่ง, rs, มา, พร้อม, ภาย,..."
4,pos,แดงโดนใจจริงๆ 😲 Honda Civic Hatchback Rallye R...,"[แดง, โดนใจ, จริงๆ, 😲, honda, civic, hatchback..."


In [None]:
allwordlist = all_df.sum()['tokenize']
len(allwordlist)

65371

In [None]:
setallword = set(allwordlist)
len(setallword)

7671

In [None]:
fre_in_all = {}
for i in setallword:
  fre_in_all[i] = sum(all_df['tokenize'].apply(lambda x: i in x))
print(fre_in_all)



In [None]:
for word in setallword:
  all_df[word] = ''
all_df.head()

Unnamed: 0,sentiment,word,tokenize,เจ๋ง,สมราคา,👍👍,gls,...ค,อร์ค,privilege,วิจัย,เปน,แบ็ก,ฐานทัพ,ขับๆ,แม็คกำ,hb,ร.,เพรียวลม,นิร,วาม,kodo,นยำ,🛒,ผ่าน,ram,จังหวะ,159,titanium,เลิฟๆ,64,ใช้,ปัญหา,คิง,หลอด,มอง,กัมพู,ร่วมมือ,เทพฯ,วิกฤติ,...,020,แปซิฟิก,ข่ๅว,ใส้,จังหวัด,l200,คนรุ่นใหม่,เมา,พื้นฐาน,ที่อยู่,วิภาวดีรังสิต,ติดตั้ง,nissan,ช็อค,ความต้องการ,ไซน์,ความร้อน,ซุสุ,โน๊ต,เติ้ล,เช็ค,รุ่นๆ,bangkok,รต,ยบค,วรัส,ใส่,ใจเสีย,นี้,american,เรียกคืน,ยัน,cp,ไฟ,5j,เปลี่ยนเกียร์,ลม์,สี่,ยอมรับผิด,measuring
0,neu,🚗💨💨 ซิ่งเป็นบางเวลา ซ่อกแซ่กได้ทุกที่ << ขับสน...,"[🚗💨💨, ซิ่ง, เป็น, บางเวลา, ซ่อก, แซ่, ก, ได้, ...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,pos,สนใจ ฟอจูนเนอร์ สีขาวครับ,"[สนใจ, ฟอ, จูน, เนอ, ร์, สี, ขาว, ครับ]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,neg,Nissan คงใกล้จบแล้วแน่ๆ,"[nissan, คง, ใกล้, จบ, แล้ว, แน่ๆ]",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,neu,สุดเท่กับชุดแต่ง RS มาพร้อมภายนอก... สปอร์ต ดุ...,"[สุด, เท่, กับ, ชุด, แต่ง, rs, มา, พร้อม, ภาย,...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,pos,แดงโดนใจจริงๆ 😲 Honda Civic Hatchback Rallye R...,"[แดง, โดนใจ, จริงๆ, 😲, honda, civic, hatchback...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
for words in setallword:
  idx = 0
  N = all_df.shape[0]
  for i in all_df['tokenize']:
    if words in i:
      # tf = i.count(words)/len(i)
      i_count = i.count(words)
      if i_count == 0:
        tf = 0
      else:
        tf = 1 + math.log(i_count)
      idf = math.log(N/fre_in_all[words])
      all_df.at[idx,words] = tf*idf
    else:
      all_df.at[idx,words] = 0
    idx += 1
all_df.head()

Unnamed: 0,sentiment,word,tokenize,เจ๋ง,สมราคา,👍👍,gls,...ค,อร์ค,privilege,วิจัย,เปน,แบ็ก,ฐานทัพ,ขับๆ,แม็คกำ,hb,ร.,เพรียวลม,นิร,วาม,kodo,นยำ,🛒,ผ่าน,ram,จังหวะ,159,titanium,เลิฟๆ,64,ใช้,ปัญหา,คิง,หลอด,มอง,กัมพู,ร่วมมือ,เทพฯ,วิกฤติ,...,020,แปซิฟิก,ข่ๅว,ใส้,จังหวัด,l200,คนรุ่นใหม่,เมา,พื้นฐาน,ที่อยู่,วิภาวดีรังสิต,ติดตั้ง,nissan,ช็อค,ความต้องการ,ไซน์,ความร้อน,ซุสุ,โน๊ต,เติ้ล,เช็ค,รุ่นๆ,bangkok,รต,ยบค,วรัส,ใส่,ใจเสีย,นี้,american,เรียกคืน,ยัน,cp,ไฟ,5j,เปลี่ยนเกียร์,ลม์,สี่,ยอมรับผิด,measuring
0,neu,🚗💨💨 ซิ่งเป็นบางเวลา ซ่อกแซ่กได้ทุกที่ << ขับสน...,"[🚗💨💨, ซิ่ง, เป็น, บางเวลา, ซ่อก, แซ่, ก, ได้, ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,pos,สนใจ ฟอจูนเนอร์ สีขาวครับ,"[สนใจ, ฟอ, จูน, เนอ, ร์, สี, ขาว, ครับ]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,neg,Nissan คงใกล้จบแล้วแน่ๆ,"[nissan, คง, ใกล้, จบ, แล้ว, แน่ๆ]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,2.69222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,neu,สุดเท่กับชุดแต่ง RS มาพร้อมภายนอก... สปอร์ต ดุ...,"[สุด, เท่, กับ, ชุด, แต่ง, rs, มา, พร้อม, ภาย,...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,pos,แดงโดนใจจริงๆ 😲 Honda Civic Hatchback Rallye R...,"[แดง, โดนใจ, จริงๆ, 😲, honda, civic, hatchback...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
Y = all_df['sentiment']
print(Y)

0       neu
1       pos
2       neg
3       neu
4       pos
       ... 
2564    neu
2565    neg
2566    pos
2567      q
2568      q
Name: sentiment, Length: 2569, dtype: object


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

df = all_df.copy()
df = df.drop(['sentiment','word','tokenize'], axis=1)
df.head()

Unnamed: 0,เจ๋ง,สมราคา,👍👍,gls,...ค,อร์ค,privilege,วิจัย,เปน,แบ็ก,ฐานทัพ,ขับๆ,แม็คกำ,hb,ร.,เพรียวลม,นิร,วาม,kodo,นยำ,🛒,ผ่าน,ram,จังหวะ,159,titanium,เลิฟๆ,64,ใช้,ปัญหา,คิง,หลอด,มอง,กัมพู,ร่วมมือ,เทพฯ,วิกฤติ,เบาะ,preawva29,auris,...,020,แปซิฟิก,ข่ๅว,ใส้,จังหวัด,l200,คนรุ่นใหม่,เมา,พื้นฐาน,ที่อยู่,วิภาวดีรังสิต,ติดตั้ง,nissan,ช็อค,ความต้องการ,ไซน์,ความร้อน,ซุสุ,โน๊ต,เติ้ล,เช็ค,รุ่นๆ,bangkok,รต,ยบค,วรัส,ใส่,ใจเสีย,นี้,american,เรียกคืน,ยัน,cp,ไฟ,5j,เปลี่ยนเกียร์,ลม์,สี่,ยอมรับผิด,measuring
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,2.69222,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
def PredictandEvaluate(X_train,y_train,X_test,y_test):
  from sklearn import svm

  clf = svm.SVC(kernel='linear')
  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)

  from sklearn import metrics
  from sklearn.metrics import confusion_matrix
  from sklearn.metrics import classification_report

  # Step 5: evaluate the classifiers
  print("Classification Performance for SVM\n")
  # Model Accuracy: how often is the classifier correct?
  print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
  print()
  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))

In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0.3,random_state=109) # 70% training and 30% test

PredictandEvaluate(X_train,y_train,X_test,y_test)

Classification Performance for SVM

Accuracy: 0.6472114137483788

              precision    recall  f1-score   support

         neg       0.58      0.56      0.57       176
         neu       0.72      0.77      0.75       453
         pos       0.46      0.38      0.41       109
           q       0.36      0.24      0.29        33

    accuracy                           0.65       771
   macro avg       0.53      0.49      0.50       771
weighted avg       0.63      0.65      0.64       771

[[ 99  68   8   1]
 [ 53 351  36  13]
 [ 17  51  41   0]
 [  3  17   5   8]]


with normalization

In [None]:
from sklearn import preprocessing

X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0.3,random_state=109)
temp = preprocessing.normalize(pd.concat([X_train,X_test]))
X_train = temp[:1798,:]
X_test = temp[1798:,:]

PredictandEvaluate(X_train,y_train,X_test,y_test)

Classification Performance for SVM

Accuracy: 0.7146562905317769

              precision    recall  f1-score   support

         neg       0.70      0.52      0.59       176
         neu       0.70      0.94      0.80       453
         pos       0.92      0.32      0.48       109
           q       0.00      0.00      0.00        33

    accuracy                           0.71       771
   macro avg       0.58      0.44      0.47       771
weighted avg       0.70      0.71      0.68       771

[[ 91  83   2   0]
 [ 27 425   1   0]
 [ 11  63  35   0]
 [  1  32   0   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


with standard scaler(normalization)

In [None]:
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(df, Y, test_size=0.3,random_state=109)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

PredictandEvaluate(X_train,y_train,X_test,y_test)

Classification Performance for SVM

Accuracy: 0.6186770428015564

              precision    recall  f1-score   support

         neg       0.59      0.53      0.56       176
         neu       0.69      0.75      0.72       453
         pos       0.37      0.31      0.34       109
           q       0.31      0.27      0.29        33

    accuracy                           0.62       771
   macro avg       0.49      0.47      0.48       771
weighted avg       0.61      0.62      0.61       771

[[ 93  73   7   3]
 [ 51 341  46  15]
 [ 11  62  34   2]
 [  2  17   5   9]]


Bigram

In [None]:
bi_df = pd.read_csv(path+'Training_data.csv',names=["sentiment", "word"])
bi_df.head()

In [None]:
for idx,row in bi_df.iterrows():
  bi_df.at[idx,'tokenize']=bi_df.at[idx,'word']
bi_df.head()

In [None]:
for idx,row in bi_df.iterrows():
  bi_df.at[idx,'tokenize']=preprocessing(bi_df.at[idx,'tokenize'])
bi_df.head()

In [None]:
temp = bi_df.copy()
for idx,row in temp.iterrows():
  token = temp.at[idx,'tokenize']
  n_token = len(token)
  bi_df.at[idx,'tokenize'] = [token[i]+token[i+1] for i in range(n_token-1)]
bi_df.head()

In [None]:
bi_allwordlist = bi_df.sum()['tokenize']
print(len(bi_allwordlist))
bi_setallword = set(bi_allwordlist)
print(len(bi_setallword))

60234
54188


In [None]:
bi_fre_in_all = {}
for i in bi_setallword:
  bi_fre_in_all[i] = sum(bi_df['tokenize'].apply(lambda x: i in x))
print(bi_fre_in_all)



In [None]:
for word in bi_setallword:
  bi_df[word] = ''
for words in bi_setallword:
  idx = 0
  N = bi_df.shape[0]
  for i in bi_df['tokenize']:
    if words in i:
      # tf = i.count(words)/len(i)
      i_count = i.count(words)
      if i_count == 0:
        tf = 0
      else:
        tf = 1 + math.log(i_count)
      idf = math.log(N/bi_fre_in_all[words])
      bi_df.at[idx,words] = tf*idf
    else:
      bi_df.at[idx,words] = 0
    idx += 1
bi_df.head()

In [None]:
y = bi_df['sentiment']
y

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

dfbigram = bi_df.copy()
dfbigram = dfbigram.drop(['sentiment','word','tokenize'], axis=1)
dfbigram.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dfbigram, y, test_size=0.3,random_state=109)

PredictandEvaluate(X_train,y_train,X_test,y_test)

Classification Performance for SVM

Accuracy: 0.5901426718547341

              precision    recall  f1-score   support

         neg       0.67      0.01      0.02       176
         neu       0.59      0.99      0.74       453
         pos       0.44      0.04      0.07       109
           q       0.00      0.00      0.00        33

    accuracy                           0.59       771
   macro avg       0.43      0.26      0.21       771
weighted avg       0.56      0.59      0.45       771

[[  2 173   1   0]
 [  1 449   3   0]
 [  0 105   4   0]
 [  0  32   1   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(dfbigram, y, test_size=0.3,random_state=109)
temp = preprocessing.normalize(pd.concat([X_train,X_test]))
X_train = temp[:1798,:]
X_test = temp[1798:,:]

PredictandEvaluate(X_train,y_train,X_test,y_test)

Classification Performance for SVM

Accuracy: 0.5901426718547341

              precision    recall  f1-score   support

         neg       0.00      0.00      0.00       176
         neu       0.59      1.00      0.74       453
         pos       0.67      0.02      0.04       109
           q       0.00      0.00      0.00        33

    accuracy                           0.59       771
   macro avg       0.31      0.25      0.19       771
weighted avg       0.44      0.59      0.44       771

[[  0 176   0   0]
 [  0 453   0   0]
 [  0 107   2   0]
 [  0  32   1   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


word2vec

In [None]:
word2vec_df = pd.read_csv(path+'Training_data.csv',names=["sentiment", "word"])
for idx,row in all_df.iterrows():
    word2vec_df.at[idx,'tokenize']=word2vec_df.at[idx,'word']
for idx,row in all_df.iterrows():
    word2vec_df.at[idx,'tokenize']=preprocessing(word2vec_df.at[idx,'tokenize'])

In [None]:
from gensim.models import Word2Vec

model = Word2Vec.load("TNCc5model.bin") 

In [None]:
model.__dict__

In [None]:
def sumvector(listtxt):
    sumvec = np.zeros(model.vector_size)
    for i in listtxt:
        try:
            sumvec += model.wv.get_vector(i)
        except:
            pass
    return sumvec

In [None]:
sumvec = pd.DataFrame(word2vec_df['tokenize'].apply(sumvector))
sumvec.head()

In [None]:
for i in range(model.vector_size):
    sumvec[i] = float(0)
for idx,row in sumvec.iterrows():
    for i in range(model.vector_size):
        sumvec.at[idx,i] = sumvec.at[idx,'tokenize'][i]

In [None]:
sumvec = sumvec.drop(['tokenize'], axis=1)
sumvec.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(sumvec, y, test_size=0.3,random_state=109)

PredictandEvaluate(X_train,y_train,X_test,y_test)

Classification Performance for SVM

Accuracy: 0.6329442282749675

              precision    recall  f1-score   support

         neg       0.57      0.39      0.46       176
         neu       0.66      0.86      0.75       453
         pos       0.57      0.28      0.37       109
           q       0.00      0.00      0.00        33

    accuracy                           0.63       771
   macro avg       0.45      0.38      0.40       771
weighted avg       0.60      0.63      0.60       771

[[ 69  97   8   2]
 [ 43 389  15   6]
 [ 10  68  30   1]
 [  0  33   0   0]]


BERT (Fully pretrain from Wangchanberta)

In [7]:
BERT_df = pd.read_csv(path+'Training_data.csv',names=["sentiment", "word"])
Y = BERT_df['sentiment']

In [None]:
!pip -q install transformers==3.5.0 
!pip install --quiet --pre torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
!pip install torch torchvision
!pip install torch==1.4.0
!pip install datasets

In [3]:
import numpy as np
from tqdm.auto import tqdm
import torch

#datasets
from datasets import load_dataset

#transformers
from transformers import (
    AutoTokenizer,
    pipeline,
)

In [4]:
public_models = ['xlm-roberta-base', 'bert-base-multilingual-cased'] 

model_name = "wangchanberta-base-att-spm-uncased" 

tokenizer = AutoTokenizer.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased',revision='main',model_max_length=416,)


Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/905k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/282 [00:00<?, ?B/s]

In [5]:
dataset_name = "wisesight_sentiment"
classify_multiclass = pipeline(task='sentiment-analysis',
         tokenizer=tokenizer,
         model = 'airesearch/wangchanberta-base-att-spm-uncased',
         revision = 'finetuned@wisesight_sentiment')

Downloading:   0%|          | 0.00/423M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/716 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/421M [00:00<?, ?B/s]

In [8]:
bert_pred = []
word = BERT_df['word']
for i in range(2569):
  bert_pred.append(classify_multiclass(word[i])[0]['label'])
print(bert_pred)

['neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neg', 'neu', 'neg', 'neu', 'neu', 'pos', 'neu', 'neg', 'neu', 'neu', 'neu', 'pos', 'neu', 'pos', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neg', 'neg', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'pos', 'neu', 'neu', 'neu', 'neu', 'neg', 'neu', 'neg', 'neg', 'neg', 'neu', 'neu', 'neg', 'neu', 'neg', 'neu', 'neu', 'neu', 'neu', 'neg', 'neg', 'neu', 'neu', 'neg', 'neu', 'pos', 'pos', 'neg', 'neu', 'neg', 'pos', 'neg', 'pos', 'neu', 'neu', 'neg', 'neu', 'neu', 'neg', 'pos', 'neu', 'pos', 'pos', 'neg', 'neu', 'neu', 'neu', 'neg', 'neg', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neg', 'neu', 'neu', 'neu', 'neg', 'neg', 'neu', 'neu', 'pos', 'neu', 'neu', 'neu', 'neu', 'neu', 'neg', 'neu', 'neu', 'neu', 'neg', 'neu', 'neu', 'neu', 'neu', 'neg', 'pos', 'pos', 'neu', 'neg', 'neu', 'neu', 'neu', 'neu', 'neu', 'neu', 'neg', 'neu', 'neu', 'neu', 'neu', 'neu'

In [9]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print("Classification Performance for wisesight\n")
print("Accuracy:",metrics.accuracy_score(Y, bert_pred))
print()
print(classification_report(Y, bert_pred))
print(confusion_matrix(Y, bert_pred))

Classification Performance for wisesight

Accuracy: 0.8404048267808486

              precision    recall  f1-score   support

         neg       0.89      0.84      0.87       586
         neu       0.82      0.95      0.88      1484
         pos       0.84      0.65      0.73       389
           q       0.83      0.09      0.16       110

    accuracy                           0.84      2569
   macro avg       0.85      0.63      0.66      2569
weighted avg       0.84      0.84      0.82      2569

[[ 492   87    7    0]
 [  37 1406   39    2]
 [  21  117  251    0]
 [   1   98    1   10]]
