In [None]:
# Path of the dataset CSV; it is loaded with pd.read_csv(filename) in a later cell.

filename = "./dataset/tweet8000.csv"

In [1]:
## import dependencies
import numpy as np
import pandas as pd
import csv
import re
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix

In [None]:
## define tokenizer method for TFIDFvectorizer

# Characters stripped before segmentation: anything outside U+0E00-U+0E4C,
# plus U+0E46 (mai yamok) and U+0E3F (baht sign). Compiled once instead of on
# every call. The quote alternatives of the earlier pattern were dropped: a
# quote is already outside the Thai range, so they could never match.
_NON_THAI_CHARS = re.compile(r"[^\u0E00-\u0E4C]|\u0E46|\u0E3F")


def tokenizer(tweet):
    """Remove non-Thai characters from ``tweet`` and word-segment the rest.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    The result of ``word_tokenize(cleaned, engine='deepcut')`` from PyThaiNLP
    (presumably a list of token strings -- confirm against the library).
    """
    # A single regex substitution replaces the former per-character membership
    # test against the list of matches, which was quadratic in tweet length.
    thai_only = _NON_THAI_CHARS.sub('', tweet)
    return word_tokenize(thai_only, engine='deepcut')

In [None]:
## Read the dataset file and pull each column out as a NumPy array

df = pd.read_csv(filename)
# Raw tweet text; this name is later rebound to the TF-IDF feature matrix.
X = np.array(df['tweet'])
# One label array per emotion column (presumably 0/1 flags -- TODO confirm).
y1 = np.array(df['anger'])
y2 = np.array(df['anticipation'])
y3 = np.array(df['disgust'])
y4 = np.array(df['fear'])
y5 = np.array(df['joy'])
y6 = np.array(df['sadness'])
y7 = np.array(df['surprise'])
y8 = np.array(df['trust'])
# Single multi-class target taken from the 'label' column.
l = np.array(df['label'])

In [None]:
## train a model with CV

def train_model(X,y,n,C):
    """Cross-validate a class-balanced LinearSVC and report recall/precision/F1.

    Parameters
    ----------
    X : feature matrix; rows are selected with integer index arrays.
    y : label array aligned with ``X``. Scores use scikit-learn's default
        averaging, so this assumes a binary target -- TODO confirm.
    n : number of stratified, shuffled folds (split seed fixed at 30).
    C : regularisation parameter forwarded to ``LinearSVC``.

    Returns
    -------
    tuple of float
        ``(recall, precision, f1)`` averaged over the ``n`` folds, so callers
        can compare runs without scraping the printed output.
    """
    skf = StratifiedKFold(n_splits=n, random_state=30, shuffle=True)

    fold_scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        tclf = svm.LinearSVC(class_weight='balanced', C=C, random_state=0).fit(X_train,y_train)
        predict = tclf.predict(X_test)

        r=recall_score(y_test,predict)
        p=precision_score(y_test,predict)
        f=f1_score(y_test,predict)
        fold_scores.append((r, p, f))

        print('Prediction')
        print(predict)
        print('y')
        print(y_test)
        print('scores: recall, precision, f1')
        print(r,p,f)
        print()

    # Transpose the per-fold tuples into one sequence per metric, then average.
    avg_r, avg_p, avg_f = (sum(values) / n for values in zip(*fold_scores))

    print()
    print('Average')
    print(avg_r,avg_p,avg_f)
    return float(avg_r), float(avg_p), float(avg_f)

In [None]:
## train model for multi-class

def train_model_multi(X,y,s,C=1):
    """Fit a class-balanced LinearSVC on one hold-out split and print per-class scores.

    Parameters
    ----------
    X : feature matrix.
    y : multi-class label array aligned with ``X``.
    s : ``test_size`` forwarded to ``train_test_split`` (split seed fixed at 0).
    C : regularisation parameter forwarded to ``LinearSVC``; defaults to 1,
        the value that used to be hard-coded.

    Prints the predictions, the true test labels, the per-class
    recall/precision/F1 arrays (``average=None``) and the confusion matrix.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=s, random_state=0)
    tclf = svm.LinearSVC(class_weight='balanced', C=C, random_state=0).fit(X_train,y_train)
    predict = tclf.predict(X_test)

    # average=None gives one score per class instead of a single aggregate.
    r=recall_score(y_test,predict,average=None)
    p=precision_score(y_test,predict,average=None)
    f=f1_score(y_test,predict,average=None)
    cm=confusion_matrix(y_test,predict)
    print(predict)
    print(y_test)
    print(r,p,f)
    print(cm)
In [None]:
## Vectorized TF-IDF
v = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords.words('thai'))
# Fit on the raw tweet column rather than on `X`: this cell rebinds `X` to the
# numeric feature matrix, so fitting on `X` would fail if the cell is re-run.
# NOTE(review): .toarray() densifies the matrix; LinearSVC also accepts sparse input.
train_data_feature = v.fit_transform(np.array(df['tweet'])).toarray()
#print(v.get_feature_names())

X = train_data_feature

In [None]:
## Anger: 5-fold cross-validation on the `anger` labels (y1) with C=1
train_model(X,y1,5,1)

In [None]:
## Multi-class: one hold-out split (test_size=0.2) on the `label` column
train_model_multi(X,l,0.2)

In [None]:
# Number of entries in the multi-class label array.
len(l)

In [None]:
# Anticipation (y2): 5-fold cross-validation, C=1
train_model(X,y2,5,1)

In [None]:
# Disgust (y3): 5-fold cross-validation, C=1
train_model(X,y3,5,1)

In [None]:
# Fear (y4): 5-fold cross-validation, C=1
train_model(X,y4,5,1)

In [None]:
# Joy (y5): 5-fold cross-validation, C=1
train_model(X,y5,5,1)

In [None]:
# Sadness (y6): 5-fold cross-validation, C=1
train_model(X,y6,5,1)

In [None]:
# Surprise (y7): 5-fold cross-validation, C=1
train_model(X,y7,5,1)

In [None]:
# Trust (y8): 5-fold cross-validation, C=1
train_model(X,y8,5,1)

In [None]:
## Remove hashtag

# For every row, delete '#' + that row's `th` value from the tweet text.
TH = np.array(df['th'])
cleaned_texts = [
    text.replace('#' + tag, '')
    for text, tag in zip(df['tweet'], df['th'])
]
# dtype=object keeps plain Python strings, like np.array(df['tweet']) does.
tweets = np.array(cleaned_texts, dtype=object)
print(tweets)

In [None]:
# Second vectorizer, fitted on the hashtag-stripped tweets; X2 is the dense TF-IDF matrix.
v2 = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords.words('thai'))
X2 = v2.fit_transform(tweets).toarray()

In [None]:
# Anger (y1): 5-fold cross-validation on the hashtag-stripped features X2, C=1
train_model(X2,y1,5,1)

In [None]:
# Multi-class hold-out run (test_size=0.2) on the hashtag-stripped features X2
train_model_multi(X2,l,0.2)

In [None]:
## Train with hashtag test without
# Fit the anger model on X, then score it on the hashtag-stripped tweets
# projected through the same vectorizer `v`.
# NOTE(review): the scored tweets appear to be the training tweets minus their
# hashtag, so these are not held-out scores -- confirm this is intended.
tclf = svm.LinearSVC(class_weight='balanced', C=1.0, random_state=0)
tclf.fit(X, y1)
predict = tclf.predict(v.transform(tweets))

r, p, f = (metric(y1, predict) for metric in (recall_score, precision_score, f1_score))

print(r,p,f)

In [None]:
# Same experiment for the anticipation labels (y2): fit on X, score on the
# hashtag-stripped tweets transformed by `v`. This rebinds `tclf`.
tclf = svm.LinearSVC(class_weight='balanced', C=1.0, random_state=0)
tclf.fit(X, y2)
predict = tclf.predict(v.transform(tweets))

r, p, f = (metric(y2, predict) for metric in (recall_score, precision_score, f1_score))

print(r,p,f)

In [None]:
# Ad-hoc predictions for three short sentences using whatever `tclf` currently is.
# NOTE(review): in top-to-bottom order `tclf` was last fitted on y2 (anticipation), not anger -- confirm which model is meant.
print(tclf.predict(v.transform(["โกรธแล้ว","ดีใจมากเลย","หิวข้าว"]).toarray()))

In [None]:
# Dedicated anger classifier fitted on the full feature matrix X and the y1 labels.
anger = svm.LinearSVC(class_weight='balanced', C=1.0, random_state=0).fit(X,y1)

In [None]:
# Spot-check the anger classifier on a single hand-written sentence.
print(anger.predict(v.transform(["อีห่านนนนน เบียดกูจังเลย นมกูยิ่งไม่มีอยู่"]).toarray()))

In [None]:
# Score the anger classifier on X, the same matrix it was fitted on
# (these are training-set scores).
predict = anger.predict(X)

r, p, f = (metric(y1, predict) for metric in (recall_score, precision_score, f1_score))

print(r,p,f)

In [None]:
# Multi-class model: fit on X with the `label` targets, then score on the
# hashtag-stripped tweets transformed by the same vectorizer `v`.
mclf = svm.LinearSVC(class_weight='balanced', C=1, random_state=0).fit(X,l)
mpredict = mclf.predict(v.transform(tweets))

# average=None gives one score per class.
r=recall_score(l,mpredict,average=None)
p=precision_score(l,mpredict,average=None)
f=f1_score(l,mpredict,average=None)
cm=confusion_matrix(l,mpredict)
# Print this model's predictions; `predict` holds stale output from an earlier binary model.
print(mpredict)
print(l)
print(r,p,f)
print(cm)

In [None]:
## tokenized tweets
tw = np.array(df['tweet'])
# str() lets non-string cells (e.g. NaN from the CSV) pass through the
# tokenizer instead of raising inside the regex.
tk = [tokenizer(str(tweet)) for tweet in tw]
# Preview only the first few token lists rather than dumping every tweet.
tk[:5]

In [None]:
## write tokenized tweets into a file

# Output columns: space-joined tokens, the original th/eng columns, the eight
# emotion labels and the multi-class label.
header = ['tweet','th','eng','anger','anticipation','disgust','fear','joy','sadness','surprise','trust','label']
label_columns = (y1, y2, y3, y4, y5, y6, y7, y8, l)

with open('./dataset/tokenized_tweets.csv', 'w', encoding='utf8', newline='') as tokenized_file:
    writer = csv.writer(tokenized_file)
    writer.writerow(header)
    for i, tokens in enumerate(tk):
        string = ' '.join(tokens)
        writer.writerow([string, df['th'][i], df['eng'][i], *(column[i] for column in label_columns)])

In [None]:
# Reload the pre-tokenized tweets written above (tokens joined by spaces).
df2 = pd.read_csv('./dataset/tokenized_tweets.csv')
# Take the tweets from the tokenized frame; reading from `df` would leave
# `df2` unused and hand the raw, unsegmented text to the next vectorizer.
t = df2['tweet']

In [None]:
# Display the tweet Series for a quick visual check.
t

In [None]:
# Spot-check the tokenizer on the tweet at index label 10.
tokenizer(t[10])

In [None]:
# TF-IDF with scikit-learn's default tokenisation. astype('U') casts every
# entry to a unicode string (non-string cells such as NaN become text).
# NOTE(review): the default token_pattern is built on \w, which may split Thai
# words at combining vowel/tone marks -- consider a whitespace tokenizer; confirm.
new_v = TfidfVectorizer()
train_data_feature = new_v.fit_transform(t.values.astype('U'))

In [None]:
# Anger classifier trained on the features produced by `new_v`.
tclf2 = svm.LinearSVC(class_weight='balanced', C=1.0, random_state=0).fit(train_data_feature,y1)
# Transform with `new_v`, the vectorizer that produced the training features.
# Using `v` here would project the tweets into a different vocabulary and
# feature space than the one the model was fitted on.
predict = tclf2.predict(new_v.transform(tweets))

r=recall_score(y1,predict)
p=precision_score(y1,predict)
f=f1_score(y1,predict)

print(r,p,f)

In [None]:
# Anger (y1): 5-fold cross-validation on the current `train_data_feature` matrix, C=1
train_model(train_data_feature,y1,5,1)

In [None]:
# Display the raw tweet column as a NumPy array for inspection.
np.array(df['tweet'])

In [2]:
# Switch to the pre-tokenized dataset. This rebinds `df`, `X`, `y1`..`y8`
# and `l`, replacing the values loaded from the earlier CSV.
dataset_name = "./dataset/tokenized_tweets_all.csv"

df = pd.read_csv(dataset_name)
# Tweet text (presumably already space-separated tokens -- TODO confirm).
X = np.array(df['tweet'])
# One label array per emotion column.
y1 = np.array(df['anger'])
y2 = np.array(df['anticipation'])
y3 = np.array(df['disgust'])
y4 = np.array(df['fear'])
y5 = np.array(df['joy'])
y6 = np.array(df['sadness'])
y7 = np.array(df['surprise'])
y8 = np.array(df['trust'])
# Multi-class target.
l = np.array(df['label'])

In [3]:
def tokenize_split(sentence):
    """Split ``sentence`` on single spaces, dropping empty pieces and lone '#'.

    Only the space character separates tokens; tabs and newlines stay inside
    whatever token contains them.
    """
    ignored = ('', '#')
    return list(filter(lambda piece: piece not in ignored, sentence.split(" ")))

In [4]:
# TF-IDF over the pre-tokenized tweets, using the whitespace splitter as tokenizer.
# NOTE(review): .toarray() materialises a dense rows x vocabulary matrix, which is
# very large for a dataset of this size; LinearSVC accepts sparse input, so consider
# keeping the matrix sparse (code that takes len() of a row would need adjusting).
vect = TfidfVectorizer(tokenizer=tokenize_split)
train_data_feature = vect.fit_transform(X).toarray()
#vect.get_feature_names()

In [None]:
# Anger (y1): 5-fold cross-validation on the current `train_data_feature` matrix, C=1
train_model(train_data_feature,y1,5,1)

In [None]:
# Vocabulary size. `vocabulary_` maps each term to its feature index, so its
# length equals the number of features and it exists on every scikit-learn
# version, unlike get_feature_names(), which newer releases removed.
len(vect.vocabulary_)

In [None]:
# 80/20 hold-out split of the anger data (seed 0). The fit/predict/score lines
# below are commented out, so this cell currently only creates the split.
X_train, X_test, y_train, y_test = train_test_split(train_data_feature,y1,test_size=0.2,random_state=0)
#clf = svm.LinearSVC(class_weight='balanced',C=1).fit(X_train,y_train)
#p = clf.predict(X_test)
#print(recall_score(y_test,p),precision_score(y_test,p),f1_score(y_test,p))

In [None]:
# Number of feature columns. `.shape[1]` gives the same value as taking len()
# of a row, and also works if the matrix is kept sparse.
train_data_feature.shape[1]

In [None]:
# Look up the term stored at feature index 58427.
# NOTE(review): get_feature_names() was removed in newer scikit-learn releases in favour of get_feature_names_out() -- confirm the installed version.
vect.get_feature_names()[58427]

In [None]:
# Anger classifier fitted on the first 100000 rows only (unshuffled prefix).
# random_state=0 matches every other LinearSVC in this notebook and makes the
# fit reproducible.
clf = svm.LinearSVC(class_weight='balanced',C=1,random_state=0).fit(train_data_feature[:100000],y1[:100000])

In [5]:
# Number of labelled rows in the currently loaded dataset.
len(y1)

292122

In [27]:
# Classify one hand-written sentence with `clf` (trained on the anger labels y1).
clf.predict(vect.transform(["โว้ยยย อะไรเนี่ย"]).toarray())

array([1])