In [43]:
# Path to the tweet dataset CSV (relative path, resolved against the current
# working directory). Nothing is read here; the file is loaded further down
# with pd.read_csv(filename).

filename = "./dataset/tweet8000.csv"

In [44]:
import numpy as np
import pandas as pd
import csv
import re
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix

In [45]:
def tokenizer(tweet):
    """Remove every non-Thai character from ``tweet`` and split it into words.

    Characters outside the range U+0E00-U+0E4C are dropped, and so are two
    in-range symbols: U+0E3F (baht sign) and U+0E46 (mai yamok). The cleaned
    text is then handed to ``word_tokenize`` with the ``deepcut`` engine.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    The value returned by ``word_tokenize`` for the cleaned string
    (presumably a list of tokens - TODO confirm against the installed
    pythainlp version).
    """
    # A single linear re.sub pass replaces the previous findall + per-character
    # list membership test, which rescanned the list of unwanted characters for
    # every character of the tweet.
    # The former alternatives ^' , '$ and '' were dead: an apostrophe lies
    # outside the Thai range, so the first alternative always consumed it.
    thai_only = re.sub(r"[^\u0E00-\u0E4C]|[\u0E3F\u0E46]", "", tweet)
    return word_tokenize(thai_only, engine='deepcut')

In [46]:
## Load the CSV once, then expose each column as its own numpy array
df = pd.read_csv(filename)

# Raw tweet text (input to the vectorizer further down).
X = np.array(df['tweet'])

# One target array per emotion column, in the same order as y1..y8.
# NOTE(review): presumably per-emotion indicator columns - confirm against the CSV.
emotion_columns = ['anger', 'anticipation', 'disgust', 'fear',
                   'joy', 'sadness', 'surprise', 'trust']
y1, y2, y3, y4, y5, y6, y7, y8 = [np.array(df[column]) for column in emotion_columns]

# Combined 'label' column, used as the target of the multi-class run.
l = np.array(df['label'])

In [47]:
def train_model(X,y,n,C):
    """Run stratified k-fold cross-validation of a LinearSVC and print the scores.

    For each of the ``n`` folds a ``LinearSVC(class_weight='balanced')`` is fitted
    on the training part and evaluated on the held-out part. Predictions, true
    labels and recall/precision/F1 are printed per fold, followed by the mean of
    each score over the folds.

    Parameters
    ----------
    X : array supporting integer-array indexing
        Feature rows.
    y : array supporting integer-array indexing
        Targets. The metrics are called with their default averaging, so a
        binary target is presumably expected - TODO confirm.
    n : int
        Number of folds; also the divisor used for the averages.
    C : float
        Regularisation parameter forwarded to ``LinearSVC``.

    Returns
    -------
    None. All results are written to stdout.
    """
    # Fixed seed so the fold assignment is reproducible between runs.
    splitter = StratifiedKFold(n_splits=n, random_state=30, shuffle=True)

    recall_total = 0.0
    precision_total = 0.0
    f1_total = 0.0

    for train_idx, test_idx in splitter.split(X, y):
        estimator = svm.LinearSVC(class_weight='balanced', C=C, random_state=0)
        fitted = estimator.fit(X[train_idx], y[train_idx])

        truth = y[test_idx]
        guess = fitted.predict(X[test_idx])

        fold_recall = recall_score(truth, guess)
        fold_precision = precision_score(truth, guess)
        fold_f1 = f1_score(truth, guess)

        # Plain running sums (not sum()) keep the float arithmetic unchanged.
        recall_total = recall_total + fold_recall
        precision_total = precision_total + fold_precision
        f1_total = f1_total + fold_f1

        print('Prediction', guess, 'y', truth,
              'scores: recall, precision, f1', sep='\n')
        print(fold_recall, fold_precision, fold_f1)
        print()

    print()
    print('Average')
    print(recall_total/n, precision_total/n, f1_total/n)

In [None]:
def train_model_multi(X,y,n,C):
    """Cross-validate a multi-class LinearSVC and print scores and confusion matrices.

    For each of the ``n`` stratified folds a ``LinearSVC(class_weight='balanced')``
    is fitted and evaluated. Per fold, the predictions, true labels,
    micro-averaged recall/precision/F1 and the confusion matrix are printed.
    After the loop the mean scores and the confusion matrix summed over all
    folds are printed.

    Parameters
    ----------
    X : array supporting integer-array indexing
        Feature rows.
    y : array supporting integer-array indexing
        Multi-class targets.
    n : int
        Number of folds; also the divisor used for the averages.
    C : float
        Regularisation parameter forwarded to ``LinearSVC``.

    Returns
    -------
    None. All results are written to stdout.

    NOTE(review): with ``average='micro'`` on single-label multi-class targets,
    recall, precision and F1 all equal plain accuracy; consider ``'macro'`` if
    per-class performance matters - confirm intent.
    """
    features = X
    label = y

    R = 0.0
    P = 0.0
    F = 0.0

    # The confusion-matrix accumulator needs its own name: it used to be stored
    # in ``C``, which overwrote the regularisation parameter before it reached
    # LinearSVC. It is sized from the labels actually present instead of a
    # hard-coded 64-element vector that cannot be added to a (k, k) matrix.
    class_labels = np.unique(label)
    cm_total = np.zeros((len(class_labels), len(class_labels)), dtype=int)

    skf = StratifiedKFold(n_splits=n, random_state=30, shuffle=True)

    for train_index, test_index in skf.split(features, label):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = label[train_index], label[test_index]
        tclf = svm.LinearSVC(class_weight='balanced', C=C, random_state=0).fit(X_train,y_train)
        predict = tclf.predict(X_test)

        r=recall_score(y_test,predict,average='micro')
        p=precision_score(y_test,predict,average='micro')
        f=f1_score(y_test,predict,average='micro')
        # Passing labels= pins the row/column order and shape, so matrices from
        # different folds can be summed even if a class is missing in a fold.
        cm=confusion_matrix(y_test,predict,labels=class_labels)

        R = R + r
        P = P + p
        F = F + f
        cm_total = cm_total + cm

        print('Prediction')
        print(predict)
        print('y')
        print(y_test)
        print('scores: recall, precision, f1')
        print(r,p,f)
        print('confusion matrix')
        print(cm)
        print()

    print()
    print('Average')
    print(R/n,P/n,F/n)
    # Confusion matrix summed over all folds.
    print(cm_total)

In [None]:
# Build TF-IDF features from the raw tweets.
# The text is taken from df rather than from X: X is rebound to the feature
# matrix at the end of this cell, so reading X here would hand numbers instead
# of strings to the vectorizer whenever the cell is run a second time.
tweets_raw = np.array(df['tweet'])

v = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords.words('thai'))
# .toarray() turns the sparse result into a dense (n_tweets, n_terms) array.
train_data_feature = v.fit_transform(tweets_raw).toarray()

X = train_data_feature

In [None]:
# 5-fold cross-validation on the 'anger' targets (y1) with C=1.
train_model(X, y1, n=5, C=1)

In [None]:
# 5-fold cross-validation on the combined 'label' column (l) with C=1.
train_model_multi(X, l, n=5, C=1)