In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from collections import Counter
import re

# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



---

Sentiment classification

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset path
# used by the read_csv cell below can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the tweet sentiment CSV from the mounted Drive folder; the file is decoded as latin1.
tweets = pd.read_csv('/content/drive/MyDrive/01NLP/data/twitter_sentiment.csv', encoding='latin1')

In [None]:
# Preview the first rows of the freshly loaded frame.
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [None]:
# Remove the ItemID column, which no later cell in this notebook reads.
# errors='ignore' makes the cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
tweets = tweets.drop(columns=['ItemID'], errors='ignore')

In [None]:
# Confirm that ItemID is no longer among the columns.
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [None]:
# Tally how many tweets carry each sentiment label, then show the labels and
# their counts side by side.
target_cnt = Counter(tweets['Sentiment'])
print(target_cnt.keys(), target_cnt.values())

dict_keys([0, 1]) dict_values([43532, 56457])


In [None]:
# Same class tally as the Counter cell above, computed with pandas.
tweets['Sentiment'].value_counts()

1    56457
0    43532
Name: Sentiment, dtype: int64

In [None]:
def preprocessor(text):
    """Normalise a raw tweet into lowercase words followed by its emoticons.

    Steps:
      1. Remove every ``<...>`` tag-like span.
      2. Collect emoticons built from eyes ``:``/``;``/``=``, an optional
         ``-`` nose and a mouth ``)``/``(``/``D``/``P``. They are searched in
         the original-case text, so ``:D`` and ``:P`` are recognised.
      3. Lowercase the text and collapse each run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the nose removed), separated
         by single spaces.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string. When no emoticon is found this is just the
        lowercased, space-collapsed text (leading/trailing spaces are kept).
    """
    # Raw strings keep the regex backslashes literal; the non-raw originals
    # relied on invalid escape sequences, which newer Pythons warn about.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    # Make sure the first emoticon is not glued onto the last word when the
    # cleaned text does not already end with a space (e.g. 'great:D').
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + ' '.join(emoticons).replace('-', '')

In [None]:
# Clean every tweet in place by running it through preprocessor().
tweets['SentimentText'] = tweets['SentimentText'].map(preprocessor)

In [None]:
# Inspect the cleaned text produced by preprocessor().
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my apl friend
1,0,i missed the new moon trailer
2,1,omg its already 7 30 o
3,0,omgaga im sooo im gunna cry i ve been at this...
4,0,i think mi bf is cheating on me t_t


In [None]:
# Build the word -> integer-index vocabulary from the cleaned tweets.
# filters='' switches off the Tokenizer's default punctuation stripping:
# preprocessor() has already replaced non-word characters with spaces, and the
# default filter would erase the emoticons it appended (":)" -> nothing,
# ":D" -> "d") and split tokens such as "t_t" at the underscore.
tokenizer = keras.preprocessing.text.Tokenizer(filters='')
tokenizer.fit_on_texts(tweets.SentimentText)

# +1 because index 0 is kept free for padding and never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 103192


In [None]:
# Start an empty frame that will hold the encoded text and its label.
df = pd.DataFrame()

In [None]:
df['SentimentText'] = tokenizer.texts_to_sequences(tweets.SentimentText) # integer-encode: each tweet becomes a list of vocabulary indices

In [None]:
# Copy the 0/1 sentiment label next to the encoded text.
df['Sentiment'] = tweets['Sentiment']

In [None]:
# Preview the encoded sequences together with their labels.
df.head()

Unnamed: 0,SentimentText,Sentiment
0,"[13, 19, 126, 10, 8, 22001, 261]",0
1,"[1, 247, 3, 111, 1072, 1722]",0
2,"[243, 82, 205, 536, 526, 229]",1
3,"[35235, 70, 538, 70, 1850, 568, 1, 104, 96, 35...",0
4,"[1, 73, 2336, 1488, 13, 4472, 18, 15, 12, 12]",0


In [None]:
# Hold out 10% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both parts; random_state=1 fixes the shuffle.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    stratify=df.Sentiment,
    random_state=1,
)
print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

TRAIN size: 89990
TEST size: 9999


In [None]:
# Bring every token-id sequence to exactly 256 entries; shorter sequences are
# zero-padded on the left.
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(frame['SentimentText'].values, maxlen=256)
    for frame in (df_train, df_test)
)

In [None]:
# Display the padded training matrix (zeros on the left are padding).
x_train

array([[    0,     0,     0, ..., 67600,   796,  2554],
       [    0,     0,     0, ...,  2704,   122,   114],
       [    0,     0,     0, ...,    26,   481,    39],
       ...,
       [    0,     0,     0, ...,    23,    31,     2],
       [    0,     0,     0, ...,   245,   168,  9358],
       [    0,     0,     0, ...,  3264,   667, 36938]], dtype=int32)

In [None]:
# Pull the test labels out as a flat array and display them.
# NOTE(review): y_test is reassigned in the next cell with a (-1, 1) reshape,
# so this assignment only serves the preview here.
y_test = df_test.Sentiment.values
y_test

array([0, 1, 1, ..., 0, 1, 0])

In [None]:
# Turn the 1-D label vectors into 2-D (n_samples, 1) columns for the binary
# classifier, then report the resulting shapes.
y_train = np.reshape(df_train['Sentiment'].values, (-1, 1))
y_test = np.reshape(df_test['Sentiment'].values, (-1, 1))

print("y_train", y_train.shape)
print("y_test", y_test.shape)

y_train (89990, 1)
y_test (9999, 1)


In [None]:
# Display the test labels in their reshaped (n_samples, 1) form.
y_test

array([[0],
       [1],
       [1],
       ...,
       [0],
       [1],
       [0]])

In [None]:
# Hyper-parameters shared by the model-building and training cells.
batch_size = 16             # samples per batch during training
maxlen = 256                # sentence length in tokens; anything beyond 256 words is cut off
ebd_features = 300          # dimension of each embedding vector
max_features = vocab_size   # number of words considered as features

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

# Stacked-LSTM binary sentiment classifier.
# Input: integer token ids shaped (sample size, time steps).
model = Sequential([
    # Arguments are (input features, output features, time steps);
    # output is (sample size, time steps, ebd_features).
    Embedding(max_features, ebd_features, input_length=maxlen),
    Dropout(0.3),
    # return_sequences=True emits one 16-feature vector per time step, giving
    # (sample size, time steps, 16) for the next LSTM.
    LSTM(16, return_sequences=True),
    Dropout(0.3),
    # Second LSTM outputs a single 4-feature vector per sample.
    LSTM(4),
    Dropout(0.3),
    # One sigmoid unit: binary classification output.
    Dense(1, activation='sigmoid'),
])

model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-4),
    metrics=['accuracy'],
)

# Simple training setup without callbacks, kept for reference:
# history = model.fit(x_train, y_train,
#                     epochs=4,
#                     batch_size=batch_size,
#                     validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 256, 300)          30957600  
                                                                 
 dropout (Dropout)           (None, 256, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 256, 16)           20288     
                                                                 
 dropout_1 (Dropout)         (None, 256, 16)           0         
                                                                 
 lstm_1 (LSTM)               (None, 4)                 336       
                                                                 
 dropout_2 (Dropout)         (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 1)                 5

In [None]:
# Write the model to ./tok.h5 only when validation accuracy reaches a new maximum.
checkpoint = keras.callbacks.ModelCheckpoint('./tok.h5', monitor='val_accuracy', mode="max", save_best_only=True, verbose=1)
# Stop training once validation accuracy has not improved for 3 epochs.
earlystopping = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode="max", patience=3, verbose=1)
# Multiply the learning rate by 0.1 when validation loss has not improved by
# at least 1e-4 for 2 epochs.
rlr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, verbose=1, mode='auto', min_delta=0.0001)

# 20% of the training arrays is held out as validation data. The returned
# History object is kept so the per-epoch metrics can be inspected or plotted
# later instead of being discarded as a bare cell repr.
history = model.fit(x_train, y_train, epochs=10, batch_size=batch_size,
                    validation_split=0.2, callbacks=[checkpoint, earlystopping, rlr])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.75670, saving model to ./tok.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.75670 to 0.76436, saving model to ./tok.h5
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.76436
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.76436

Epoch 4: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.76436
Epoch 5: early stopping


<keras.callbacks.History at 0x7ff1d746e2d0>

In [None]:
model = keras.models.load_model('./tok.h5') # replace the in-memory model with the best checkpoint saved during training

In [None]:
# Score the reloaded model on the held-out test set; the result is
# [loss, accuracy]. Reuse the configured batch_size rather than repeating the
# literal 16 so the value is defined in one place.
model.evaluate(x_test, y_test, batch_size=batch_size)



[0.4875333607196808, 0.7714771628379822]



---

