In [None]:
%%capture
!pip install gdown
!pip install pythainlp
!pip install emoji

In [None]:
!gdown --id 1I6FkY-wppSCt3eB1czmP0hHfcScwMc3s
!unzip 'sentiment-assignment.zip'
!rm 'sentiment-assignment.zip'

In [12]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pythainlp import word_tokenize
from pythainlp.ulmfit import process_thai
from pythainlp.tag.named_entity import ThaiNameTagger

In [49]:
def load_data(path):
  """Load a one-text-per-line file into a single-column DataFrame.

  The file is read as plain UTF-8 text instead of being parsed as CSV:
  recent pandas versions reject ``sep="\\n"``, and CSV parsing mangles free
  text (a leading double quote starts a multi-line quoted field, and lines
  such as ``NA`` or ``null`` turn into NaN, on which ``.lower()`` fails).

  Args:
    path: Path of the text file; each non-blank line is one sample.

  Returns:
    A DataFrame with a single ``texts`` column holding the lower-cased
    lines in file order. Blank lines are skipped.

  Raises:
    OSError: If the file cannot be opened.
  """
  # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM if present.
  with open(path, encoding='utf-8-sig') as handle:
    lines = [raw.rstrip('\n') for raw in handle]

  # Skip blank lines and normalise case in one pass.
  texts = [line.lower() for line in lines if line.strip()]

  return pd.DataFrame({'texts': texts})

In [50]:
# Locate the four corpus files under ./sentiment-assignment (relative to the
# current working directory).
data_dir = os.path.join(os.getcwd(), 'sentiment-assignment')
neg_file_path = os.path.join(data_dir, 'neg.txt')
neu_file_path = os.path.join(data_dir, 'neu.txt')
pos_file_path = os.path.join(data_dir, 'pos.txt')
q_file_path = os.path.join(data_dir, 'q.txt')

# Stack the four files into one frame. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat produces the same stacked result.
# NOTE(review): the file of origin (neg/neu/pos/q) is not recorded, so the
# combined frame carries only the 'texts' column.
df = pd.concat(
    [load_data(p) for p in (neg_file_path, neu_file_path, pos_file_path, q_file_path)]
)

# Shuffle the rows and renumber them from 0. The shuffle is seeded so that it,
# and therefore the seeded train/test split that follows, is reproducible.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
print(df.shape)
df.head()

(28055, 1)


Unnamed: 0,texts
0,‡∏á‡∏∑‡∏°
1,‡∏Ñ‡∏¥‡∏î‡∏ß‡πà‡∏≤‡πÑ‡∏°‡πà‡πÄ‡πÄ‡∏û‡∏á‡∏ô‡∏∞‡πÄ‡∏û‡∏£‡∏≤‡∏∞‡∏Å‡∏¥‡∏ô‡∏ï‡∏±‡πâ‡∏á13‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡πÄ‡πÄ‡∏•‡πâ‡∏ß‡∏Å‡πá‡∏Ñ‡∏ô‡∏õ‡∏£‡∏∞...
2,‡∏ä‡πâ‡∏≤‡∏á1..‡∏™‡∏¥‡∏á‡∏´‡πå1..‡πÑ‡∏Æ‡πÄ‡∏ô‡πÄ‡∏Å‡πâ‡∏ô1..üò≠üò≠üò≠üò≠üò≠
3,‡πÄ‡∏ö‡∏µ‡∏¢‡∏£‡πå‡∏ä‡πâ‡∏≤‡∏á‡∏î‡∏∑‡πà‡∏°‡πÅ‡∏•‡πâ‡∏ß‡∏û‡∏π‡∏î‡∏†‡∏≤‡∏©‡∏≤‡∏≠‡∏±‡∏á‡∏Å‡∏§‡∏©‡∏Ñ‡∏•‡πà‡∏≠‡∏á‡∏î‡πâ‡∏ß‡∏¢‡∏Ñ‡∏£‡∏±‡∏ö
4,‡πÅ‡∏û‡πâ‡∏ú‡πâ‡∏≤‡∏≠‡∏ô‡∏≤‡∏°‡∏±‡∏¢ t t


# Preprocessing

In [51]:
# Split data
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as the test set (seeded), then renumber the rows of
# each part from 0.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.15, random_state=0)
)

# Show the number of rows in each part.
len(df_train), len(df_test)

(23846, 4209)

## tagging

In [55]:
def tag_name(text):
    """Tag one text and return its tokens, POS tags and NER tags as three lists.

    Relies on the module-level ``ner`` tagger object: ``ner.get_ner(text, pos=True)``
    is expected to yield items whose first three positions are the token,
    its POS tag and its NER tag.

    Args:
        text: The text to tag.

    Returns:
        A tuple ``(words, pos_tags, ner_tags)`` of three parallel lists.
    """
    # One bucket per field, filled in a single pass over the tagger output.
    buckets = ([], [], [])
    for entry in ner.get_ner(text, pos=True):
        for field in range(3):
            buckets[field].append(entry[field])

    tokens, pos_labels, ner_labels = buckets
    return tokens, pos_labels, ner_labels

def tag_df(df):
    """Tag every entry of ``df['texts']`` and collect the results in a new frame.

    Args:
        df: DataFrame with a ``texts`` column; each value is passed to ``tag_name``.

    Returns:
        A new DataFrame with columns ``words``, ``pos`` and ``ner``; each cell
        holds the list returned by ``tag_name`` for that row's text. The result
        has a fresh default index and does not include the ``texts`` column.
    """
    rows = []
    for sentence in df['texts']:
        tokens, pos_labels, ner_labels = tag_name(sentence)
        # One row per text: three cells, each holding a list.
        rows.append([tokens, pos_labels, ner_labels])

    return pd.DataFrame(data=rows, columns=['words', 'pos', 'ner'])

In [56]:
# Build the tagger once; tag_name() looks this name up as a global.
ner = ThaiNameTagger()

print('shape before tag :', df_train.shape)
# tag_df() returns only the words/pos/ner columns, so a frame that has already
# been tagged no longer has a 'texts' column. Skipping such frames lets this
# cell be re-run without raising a KeyError.
if 'texts' in df_train.columns:
    df_train = tag_df(df_train)
if 'texts' in df_test.columns:
    df_test = tag_df(df_test)
print('shape after tag :', df_train.shape)

shape before tag : (23846, 1)
shape after tag : (23846, 3)


In [62]:
# Spot-check ten randomly chosen tagged rows. The draw is unseeded, so a
# different set of rows is shown on every run.
df_train.sample(10)

Unnamed: 0,words,pos,ner
772,"[‡∏ï‡∏∞, ‡πÄ‡∏ï‡∏∑‡∏≠‡∏ô, ‡πÑ‡∏ï]","[NN, VV, NN]","[O, O, O]"
14300,"[‡∏Ñ‡∏∏‡∏ì, ‡∏•‡∏π‡∏Å‡∏Ñ‡πâ‡∏≤, ‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ, ‡πÄ‡∏Ç‡πâ‡∏≤‡πÑ‡∏õ, ‡∏≠‡πà‡∏≤‡∏ô, ‡∏£‡∏≤‡∏¢‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î...","[NN, NN, AX, VV, VV, NN, NN, AV, VV, PS, PU, N...","[B-PERSON, I-PERSON, O, O, O, O, O, O, O, O, O..."
2655,"[25, ‡∏ô‡∏∞]","[NU, PA]","[B-ORGANIZATION, I-ORGANIZATION]"
17235,"[‡∏°‡∏≤, ‡∏ó‡∏≤‡∏ô, ‡∏ó‡∏µ‡πà, ‡∏™‡∏≤‡∏Ç‡∏≤, ‡πÄ‡∏ã, ‡∏ô‡∏ó, ‡∏£‡∏±‡∏•, ‡∏£‡∏±‡∏ï‡∏ô‡∏≤‡∏ò‡∏¥‡πÄ‡∏ö‡∏®‡∏£‡πå...","[AV, VV, PS, NN, NN, NN, NN, NN, PU, AX, NN, N...","[O, O, O, B-LOCATION, I-LOCATION, I-LOCATION, ..."
10069,"[‡πÉ‡∏ä‡πâ, ‡∏ô‡∏≤‡∏ß‡∏≤, ‡∏£‡∏≤, , ‡∏°‡∏≤, , 4, , ‡∏õ‡∏µ, , ‡πÑ‡∏°, ‡∏ß‡∏¥‡πà...","[VV, NN, NN, PU, AV, PU, NU, PU, CL, PU, NN, V...","[O, O, O, O, O, O, B-TIME, I-TIME, I-TIME, O, ..."
1685,"[‡∏£‡∏ñ‡∏¢‡∏ô‡∏ï‡πå, ‡πÇ‡∏ï‡πÇ‡∏¢‡∏ï‡πâ‡∏≤, ‡πÑ‡∏Æ‡∏•‡∏±‡∏Å‡∏ã‡πå, ‡∏£‡∏∏‡πà‡∏ô, ‡∏ó‡∏µ‡πà, ‡∏´‡∏≤‡∏¢, ‡πÑ‡∏õ,...","[NN, NN, NN, NN, CC, VV, AV, VV, PU, NN, PU, N...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
5695,"[‡∏™‡∏á‡∏™‡∏≤‡∏£, ‡∏ô‡∏≤‡∏á, ‡∏ô‡∏∞, , ‡πÄ‡∏à‡∏≠, ‡∏ö‡∏∏‡∏´‡∏£‡∏µ‡πà, ‡πÑ‡∏ü‡∏ü‡πâ‡∏≤, ‡∏Å‡πá, ‡∏¢‡∏∂...","[VV, NN, PA, PU, VV, NN, NN, CC, VV, AV, VV, P...","[O, B-PERSON, I-PERSON, I-PERSON, I-PERSON, I-..."
5821,"[‡∏Å‡∏ß‡πà‡∏≤, ‡∏à‡∏∞, ‡∏£‡πá‡∏≠‡∏Å, ‡πÄ‡∏ó‡πà‡∏≤, ‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ, , ‡πÉ‡∏ô, ‡∏ó‡∏∏‡∏Å‡∏ß‡∏±‡∏ô, ...","[CC, AX, VV, VV, NN, PU, PS, NN, NN, PR, CC, V...","[O, O, O, O, B-DATE, O, O, O, O, O, O, O, O, O..."
15738,"[‡∏≠‡∏¢‡∏∏‡∏ò‡∏¢‡∏≤, ‡πÑ‡∏°‡πà, ‡πÄ‡∏´‡πá‡∏ô, ‡∏°‡∏µ]","[NN, NG, VV, VV]","[B-LOCATION, O, O, O]"
5242,"[‡πÉ‡∏Ñ‡∏£, ‡∏à‡∏∞, ‡πÉ‡∏™‡πà, ‡∏ä‡∏∏‡∏î, ‡πÑ‡∏ó‡∏¢, ‡πÑ‡∏õ, ‡∏Å‡∏¥‡∏ô, , ‡∏Ñ‡πà‡∏≤, ‡∏ä‡∏∏‡∏î,...","[PR, AX, VV, NN, NN, AV, VV, PU, NN, NN, VV, A...","[O, O, O, O, B-LOCATION, O, O, O, O, O, O, O, ..."


## padding

In [None]:
from keras.preprocessing import sequence